In [1]:
import gensim
from gensim import models, corpora, similarities
from collections import defaultdict



In [2]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")
from string import punctuation

#remove "num", because "num" has the highest term frequency(45538) at the orinal file, 
#the second most frequent term only has a freq. of 3750
esw = esw + ['abstract', 'ci', 'hr','l','pubmed', 'num'] 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\D070678\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import datetime

In [5]:
####read file

#read train.docs file
train_doc_file = pd.read_csv("train.docs", encoding = 'utf-8', sep='\t', header=None)
train_doc_file.columns = ['id', 'text']

#read train.nontopic-titles.queries file
train_query_file = pd.read_csv("train.nontopic-titles.queries", encoding = 'utf-8', sep='\t', header=None)
train_query_file.columns = ['id', 'text']

#read example query file, only one query
#train_query_file = pd.read_csv("example.queries", encoding = 'utf-8', sep='\t', header=None)
#train_query_file.columns = ['id', 'text']


        

# Part 1: build functions for LDA Topic Model 
1. preprocessing
2. build DTM, query vector
3. LDA topic model clustering the docs and query
4. IR

In [6]:
## def preprocessing functions which fit with our dataset
# input file: train_doc_file["text"]/ train_query_file["text"]

def removeStopwords(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.lower().split() if word not in esw])
        file[i] = el
        i += 1
def removePunctuation(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.lower().split() if word not in punctuation])
        file[i] = el
        i += 1

def stemming(file):
    ps = PorterStemmer()
    i = 0
    for el in file:
        el = ' '.join([ps.stem(word) for word in el.lower().split() ])
        file[i] = el
        i += 1

In [7]:
##### build functions to generate document-term matrix

## get DTM, weighted by tfidf, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

# input format: train_doc_file.text

def get_DTM_tfidf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    #print(vocabulary)

    tfidf_transformer = TfidfTransformer()
    
    X_train_tfidf = CountVectorizer(vocabulary = vocabulary)
    X_train_tfidf = X_train_tfidf.fit_transform(file)
    
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_tfidf)
    return X_train_tfidf


## get DTM, weighted by term frequency
def get_DTM_tf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)
    vocabulary = list(train_count_vect.vocabulary_.keys())

    X_train_tf = CountVectorizer(vocabulary = vocabulary)
    X_train_tf = X_train_tf.fit_transform(file)
    return X_train_tf




In [8]:
###Generate query vector for each query

# get_QueryVector_tfidf helps to get the tiidf weighted query vector, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

# input format: train_doc_file.text, train_query_file.text


def get_QueryVector_tfidf(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    
    if type(queryFile) is str:
        query_vect = []
        query = queryFile
        query = query.split()
        frequency = defaultdict(int)
        for el in vocabulary:
            if el in query:
                frequency[el]+= 1
            else:
                frequency[el] = 0
        query_vect = list(dict(frequency).values())/np.linalg.norm(list(dict(frequency).values()))
        
    else:
        tfidf_transformer = TfidfTransformer()
        
        query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
        query_vect = query_vect.fit_transform(queryFile)
        
        query_vect = tfidf_transformer.fit_transform(query_vect)
    return query_vect



###Generate the query vector, weighted by term frequency
def get_QueryVector(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    
    if type(queryFile) is str:
        query_vect = []
        query = queryFile
        query = query.split()
        frequency = defaultdict(int)
        for el in vocabulary:
            if el in query:
                frequency[el]+= 1
            else:
                frequency[el] = 0
        query_vect = list(dict(frequency).values())
        
    else:
        query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
        query_vect = query_vect.fit_transform(queryFile)
        
    return query_vect

In [9]:
# get the corpus of each text file
# input: text file
# output: corpus and dictionary of the text file

def getCorpus(doc):    
    
    documents = doc

    texts = [[word for word in document.split()] for document in documents]

    frequency = defaultdict(int)
    for text in texts:
         for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 0]
              for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    return corpus, dictionary



In [10]:
#### LDA topic model

##input: 
# doc_File: preprocessed train.docs file
# query_File: preprocessed train.nontopic-titles.queries file
# topicNumber: number of topics


## output: 
# doc_with_topic: doc_file with the most likely topic, and propability of the topic 
# query_with_topic: query_file with the most likely topic, and propability of the topic 



def ldaTopicModel(doc_File, query_File,topicNumber):
    
    print("Topic number:", topicNumber)
    documents = doc_File.text
    querys = query_File.text
    
    topicNumber = topicNumber

    corpus,dictionary = getCorpus(documents)

    #### LDA model with topic number = 10
    lda = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=topicNumber)
    
    query_corpus, query_dictionary = getCorpus(querys)
    
    lda.update(query_corpus,True) # add query to train the lda model, the vocabulary remains at the same as before
    
    
    #### get topic-term-matrix, 
    #topic_term_matrix = lda.get_topics()
    

    #### assign topic back to eack document
    doc_topicID = list()
    doc_topicProb = list()
    for i in range(len(corpus)):
        x = lda.get_document_topics(corpus[i])
        prob = list()
        for i in range(len(x)):
            prob.append(x[i][1])        
        for i in range(len(x)):
            if x[i][1] == max(prob):
                doc_topicID.append(x[i][0])
                doc_topicProb.append(x[i][1])


    doc_topicID = pd.DataFrame(doc_topicID) 
    doc_topicID.columns = ['topicID']
    doc_topicProb = pd.DataFrame(doc_topicProb) 
    doc_topicProb.columns = ['topicProb']
    doc_with_topic = pd.concat([doc_File.reset_index(drop=True), doc_topicID], axis=1)
    doc_with_topic = pd.concat([doc_with_topic.reset_index(drop=True), doc_topicProb], axis=1)       
    print("average prob of topic assigning to docs:", + doc_with_topic.topicProb.sum()/len(doc_with_topic.topicProb))
    
    #### assign topic back to eack query
    query_topicID = list()
    query_topicProb = list()
    for i in range(len(query_corpus)):
        x = lda.get_document_topics(query_corpus[i])
        prob = list()
        for i in range(len(x)):
            prob.append(x[i][1])        
        for i in range(len(x)):
            if x[i][1] == max(prob):
                query_topicID.append(x[i][0])
                query_topicProb.append(x[i][1])


    query_topicID = pd.DataFrame(query_topicID) 
    query_topicID.columns = ['topicID']
    query_topicProb = pd.DataFrame(query_topicProb) 
    query_topicProb.columns = ['topicProb']
    query_with_topic = pd.concat([query_File.reset_index(drop=True), query_topicID], axis=1)
    query_with_topic = pd.concat([query_with_topic.reset_index(drop=True), query_topicProb], axis=1) 
    
    print("average prob of topic assigning to querys:", + query_with_topic.topicProb.sum()/len(query_with_topic.topicProb))
    
    
    return doc_with_topic, query_with_topic




In [11]:
### IR for each query within doc clutering, those docs have the same topic as the query

## input:
# doc_File: preprocessed train_doc_file
# query_File: preprocessed train_dquery_file
# doc_with_topic: doc_file with the most likely topic, and propability of the topic 
# query_with_topic: query_file with the most likely topic, and propability of the topic 

## output: three columns arrays with queryID, docID and cosine similarity between them




def IRByLDATopicclustering(doc_File, query_File,doc_with_topic, query_with_topic):
    
    train_tfidf = get_DTM_tfidf(doc_File.text)
    query_vector = get_QueryVector_tfidf(query_File.text, doc_File.text)
    print("query number:", len(query_File))
    print("docs number:", len(doc_File))
    doc_with_topic = doc_with_topic
    query_with_topic = query_with_topic    
    
    query_id = list(query_File.id)
    r = []
    
    for i in range(len(query_File)):
        sims = []
        topic_ID = query_with_topic.topicID[i]
        doc_with_topic_ID = doc_with_topic[doc_with_topic.topicID == topic_ID]
        doc_ID = list(doc_with_topic_ID.id)
        tfidf_index = doc_with_topic[doc_with_topic.topicID == topic_ID].index.tolist()
        for el in tfidf_index:
            sims.append(np.dot(query_vector[i], train_tfidf[el].transpose())[0,0])

        IR_doc_sims = []
        IR_doc = []
        for x in range(len(sims)):
            if sims[x] > 0:
                IR_doc_sims.append(sims[x])
                IR_doc.append(doc_ID[x])

        if len(IR_doc_sims) >0 and len(IR_doc)>0:
            IR_doc_sims, IR_doc= zip(*sorted(zip(IR_doc_sims, IR_doc), reverse=True))# rank the results

        for j in range(len(IR_doc)):

            r.append([str(query_id[i]) ,  str(IR_doc[j]), IR_doc_sims[j]]) 
    
    print("Total "+ str(len(r))+ " results are retrieved")
    return r

# Part 2: get functions run!
1. preprocessing
2. build lda model and assign topic to each docs and queries
3. IR

# 1. preprocessing

In [12]:
### preprocessing, without stemming

train_doc_file["text"] = train_doc_file['text'].str.replace('/', ' or ')

train_doc_file["text"] = train_doc_file['text'].str.replace('-', ' and ')

train_query_file["text"] = train_query_file['text'].str.replace('/', ' or ')

train_query_file["text"] = train_query_file['text'].str.replace('-', ' and ')

### remove stopwords
removeStopwords(train_doc_file['text'])
removeStopwords(train_query_file['text'])


### remove punctuation
removePunctuation(train_doc_file['text'])
removePunctuation(train_query_file['text'])

# 2. get the LDA topic model, and assign topic back to queries and docs

In [13]:
#test
# get the LDA model with topic number = 10
doc_with_topic, query_with_topic =ldaTopicModel(train_doc_file,train_query_file,10)

Topic number: 10
average prob of topic assigning to docs: 0.3184540804520926
average prob of topic assigning to querys: 0.5874411339147586


# 3. get similarity of query and docs with the same topic

In [14]:
################ IRByLDATopicclustering result1, need to get the performance result #########################
# without being stemmed
# test the IR on the whole query file, #docLeaders = squareroot of total docs number, "-" converted to "and"
t1 =  datetime.datetime.now()
IR_results = IRByLDATopicclustering(train_doc_file, train_query_file, doc_with_topic, query_with_topic)

t2 =  datetime.datetime.now()
t = t2-t1
print("running time:", t )

# save result
df = pd.DataFrame(IR_results, columns = ['QUERY_ID', 'DOC_ID', 'sim_results'])
df.to_csv('IRByLDATopicclustering result1.txt', header=None, index=None, sep=' ', mode='a')
df

query number: 1141
docs number: 3612
Total 52359 results are retrieved
running time: 0:02:59.687033


Unnamed: 0,QUERY_ID,DOC_ID,sim_results
0,PLAIN-100,MED-1136,0.240797
1,PLAIN-100,MED-1138,0.159398
2,PLAIN-100,MED-1818,0.139445
3,PLAIN-100,MED-14,0.139115
4,PLAIN-100,MED-2424,0.123355
5,PLAIN-100,MED-3142,0.121775
6,PLAIN-100,MED-3127,0.115864
7,PLAIN-100,MED-4826,0.109237
8,PLAIN-100,MED-5066,0.108509
9,PLAIN-100,MED-3318,0.107882


# Stemming docs and querys, and get the results

In [15]:
### preprocessing, with stemming

train_doc_file["text"] = train_doc_file['text'].str.replace('/', ' or ')

train_doc_file["text"] = train_doc_file['text'].str.replace('-', ' and ')

train_query_file["text"] = train_query_file['text'].str.replace('/', ' or ')

train_query_file["text"] = train_query_file['text'].str.replace('-', ' and ')

### remove stopwords
removeStopwords(train_doc_file['text'])
removeStopwords(train_query_file['text'])

### stemming
stemming(train_doc_file['text'])
stemming(train_query_file['text'])

### remove punctuation
removePunctuation(train_doc_file['text'])
removePunctuation(train_query_file['text'])


In [16]:
#test
# get the LDA model with topic number = 10
doc_with_topic_stemmed, query_with_topic_stemmed =ldaTopicModel(train_doc_file,train_query_file,10)

Topic number: 10
average prob of topic assigning to docs: 0.32615524908715443
average prob of topic assigning to querys: 0.5581157706280289


In [17]:
################ IRByLDATopicclustering result2, need to get the performance result #########################
# stemmed
# test the IR on the whole query file, #docLeaders = squareroot of total docs number, "-" converted to "and"
t1 =  datetime.datetime.now()
IR_results = IRByLDATopicclustering(train_doc_file, train_query_file, doc_with_topic_stemmed, query_with_topic_stemmed)

t2 =  datetime.datetime.now()
t = t2-t1
print("running time:", t )

# save result
df = pd.DataFrame(IR_results, columns = ['QUERY_ID', 'DOC_ID', 'sim_results'])
df.to_csv('IRByLDATopicclustering result2.txt', header=None, index=None, sep=' ', mode='a')
df

query number: 1141
docs number: 3612
Total 51622 results are retrieved
running time: 0:02:11.288087


Unnamed: 0,QUERY_ID,DOC_ID,sim_results
0,PLAIN-10,MED-3695,0.099397
1,PLAIN-10,MED-1143,0.055128
2,PLAIN-100,MED-5186,0.232840
3,PLAIN-100,MED-3242,0.205437
4,PLAIN-100,MED-5191,0.165826
5,PLAIN-100,MED-1812,0.163004
6,PLAIN-100,MED-3244,0.162070
7,PLAIN-100,MED-1818,0.157161
8,PLAIN-100,MED-3722,0.152419
9,PLAIN-100,MED-4643,0.149030


# ----------------------------------------------------------------------------------------------------------

# don't run code below

In [8]:
### create tfidf weighted DTM for the train.docs file
#train_tfidf = get_DTM_tfidf(train_doc_file.text)
#train_tfidf

<3612x23449 sparse matrix of type '<class 'numpy.float64'>'
	with 310355 stored elements in Compressed Sparse Row format>

In [10]:
### create query vector matrix for the train.nontopic-titles.queries file 
#query_vect = get_QueryVector(train_query_file.text, train_doc_file.text)
#query_vect

<1141x23449 sparse matrix of type '<class 'numpy.int64'>'
	with 4192 stored elements in Compressed Sparse Row format>

In [16]:
#### LDA
def ldaTopicModel(documents, stopwords, topicNumber):

    esw = stopwords
    documents = documents
    topicNumber = topicNumber


    
    texts = [[word for word in document.split()] for document in documents]

    frequency = defaultdict(int)
    for text in texts:
         for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 0]
              for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    #### LDA model with topic number = 10
    lda = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=topicNumber)

    #### get topic-term-matrix, 
    topic_term_matrix = lda.get_topics()

    #### assign topic back to eack document
    doc_topicID = list()
    doc_topicProb = list()
    for i in range(len(corpus)):
        x = lda.get_document_topics(corpus[i])
        prob = list()
        for i in range(len(x)):
            prob.append(x[i][1])        
        for i in range(len(x)):
            if x[i][1] == max(prob):
                doc_topicID.append(x[i][0])
                doc_topicProb.append(x[i][1])


    doc_topicID = pd.DataFrame(doc_topicID) 
    doc_topicID.columns = ['topicID']
    doc_topicProb = pd.DataFrame(doc_topicProb) 
    doc_topicProb.columns = ['topicProb']
    doc_with_topic = pd.concat([train_doc_file.reset_index(drop=True), doc_topicID], axis=1)
    doc_with_topic = pd.concat([doc_with_topic.reset_index(drop=True), doc_topicProb], axis=1)       

    #get topic-term-matrix, each topic is represented by a vector 
    topic_term_matrix = lda.get_topics()
    
    #get topic representation
    topic = lda.print_topics(topicNumber)
    
    # initiate topic clustering
    topic_clustering = []
    for t in range(topicNumber):
        topic_clustering.append([])
    
    # get the index of doc in each topic clustering
    for t in range(topicNumber):
        i = -1    
        for el in list(doc_with_topic.topicID):
            i += 1
            if el == t:
                topic_clustering[t].append(i)
    
    return(corpus, doc_with_topic, topic_clustering, topic_term_matrix,topic )




In [20]:
#query_with_topic

Unnamed: 0,id,text,topicID,topicProb
0,PLAIN-10,contaminated children,4,0.699926
1,PLAIN-100,cancer animal plant protein ratio,5,0.683334
2,PLAIN-103,plant based diets may extend lives,5,0.673156
3,PLAIN-104,low methionine diet may help starve cancer cells,5,0.444559
4,PLAIN-105,animal proteins may trigger autoimmune disease,4,0.392976
5,PLAIN-106,handling poultry tied liver pancreatic cancers,8,0.871391
6,PLAIN-107,improving attractiveness six weeks,8,0.453488
7,PLAIN-108,want healthier change taste buds,7,0.534452
8,PLAIN-109,get parents eat vegetables,6,0.555045
9,PLAIN-110,get kids eat vegetables,6,0.558683


In [21]:
#doc_with_topic

Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,5,0.322369
1,MED-14,statin diagnosis breast cancer survival popula...,0,0.234279
2,MED-118,alkylphenols human milk relations dietary habi...,7,0.334502
3,MED-301,methylmercury potential environmental risk fac...,4,0.299611
4,MED-306,sensitivity continuous performance test cpt ag...,0,0.268926
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,2,0.310761
6,MED-330,dietary phosphorus acutely impairs endothelial...,2,0.212737
7,MED-332,public health impact dietary phosphorus excess...,2,0.330263
8,MED-334,differences total vitro digestible phosphorus ...,0,0.243893
9,MED-335,differences total vitro digestible phosphorus ...,0,0.286274


In [24]:
#getCorpus(train_doc_file)

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 8),
  (9, 10),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 3),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 2),
  (23, 1),
  (24, 2),
  (25, 4),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 3),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 3),
  (34, 1),
  (35, 1),
  (36, 2),
  (37, 2),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 3),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 2),
  (59, 4),
  (60, 1),
  (61, 2),
  (62, 2),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 9),
  (78, 3),
  (79, 1),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 3),
  (84, 1),
  (85, 1),
  (86, 2),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1

In [21]:
#lda.print_topics(10)

[(0,
  '0.005*"study" + 0.005*"oil" + 0.004*"concentrations" + 0.004*"results" + 0.004*"exposure" + 0.004*"levels" + 0.004*"group" + 0.004*"food" + 0.004*"acid" + 0.003*"ncbi"'),
 (1,
  '0.007*"cells" + 0.006*"ncbi" + 0.006*"effects" + 0.005*"cell" + 0.004*"cancer" + 0.004*"study" + 0.004*"dietary" + 0.004*"patients" + 0.004*"human" + 0.004*"results"'),
 (2,
  '0.012*"diet" + 0.010*"intake" + 0.008*"dietary" + 0.008*"food" + 0.007*"protein" + 0.007*"consumption" + 0.006*"high" + 0.006*"ncbi" + 0.005*"fat" + 0.005*"total"'),
 (3,
  '0.006*"ncbi" + 0.005*"disease" + 0.005*"cholesterol" + 0.005*"food" + 0.005*"risk" + 0.004*"dietary" + 0.004*"patients" + 0.004*"study" + 0.003*"total" + 0.003*"health"'),
 (4,
  '0.013*"cancer" + 0.007*"levels" + 0.006*"risk" + 0.005*"ncbi" + 0.005*"diet" + 0.005*"dietary" + 0.004*"high" + 0.004*"study" + 0.004*"low" + 0.004*"prostate"'),
 (5,
  '0.018*"risk" + 0.018*"cancer" + 0.010*"study" + 0.010*"breast" + 0.009*"intake" + 0.009*"women" + 0.007*"consump

In [14]:
#topic_vector.shape

(10, 23706)

In [15]:
# show topics, topics is represented by 10 most likely terms
#topic

[(0,
  '0.006*"products" + 0.006*"milk" + 0.006*"intake" + 0.006*"ncbi" + 0.005*"food" + 0.005*"levels" + 0.005*"effects" + 0.005*"antioxidant" + 0.004*"risk" + 0.004*"studies"'),
 (1,
  '0.017*"cancer" + 0.014*"risk" + 0.010*"breast" + 0.007*"health" + 0.007*"intake" + 0.007*"women" + 0.006*"studies" + 0.006*"study" + 0.005*"ncbi" + 0.005*"results"'),
 (2,
  '0.018*"cancer" + 0.011*"risk" + 0.011*"consumption" + 0.009*"intake" + 0.007*"meat" + 0.006*"ncbi" + 0.006*"dietary" + 0.005*"breast" + 0.005*"study" + 0.004*"women"'),
 (3,
  '0.007*"group" + 0.006*"diet" + 0.006*"study" + 0.005*"dietary" + 0.005*"diabetes" + 0.005*"cancer" + 0.005*"ncbi" + 0.005*"risk" + 0.005*"years" + 0.004*"vegetarian"'),
 (4,
  '0.007*"cell" + 0.007*"cells" + 0.006*"human" + 0.005*"ncbi" + 0.005*"disease" + 0.004*"levels" + 0.004*"dietary" + 0.003*"diet" + 0.003*"growth" + 0.003*"high"'),
 (5,
  '0.008*"study" + 0.007*"diet" + 0.006*"patients" + 0.006*"dietary" + 0.005*"intake" + 0.005*"risk" + 0.005*"conce

In [16]:
# get the subset of docs through index, an example, an example for getting the docs vectors of topic 0
#docs_with_topic1 = train_tfidf[topic_clustering[0],]
#docs_with_topic1

<229x23449 sparse matrix of type '<class 'numpy.float64'>'
	with 19927 stored elements in Compressed Sparse Row format>

In [17]:
# print the average prob of topic assigning
#print("average prob of topic assigning:", +doc_with_topic.topicProb.sum()/len(doc_with_topic.topicProb))

#print the result of topic assigning
#doc_with_topic


average prob of topic assigning: 0.7023406420748759


Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,1,0.994116
1,MED-14,statin diagnosis breast cancer survival popula...,1,0.994039
2,MED-118,alkylphenols human milk relations dietary habi...,5,0.991260
3,MED-301,methylmercury potential environmental risk fac...,3,0.670959
4,MED-306,sensitivity continuous performance test cpt ag...,5,0.726526
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,6,0.919325
6,MED-330,dietary phosphorus acutely impairs endothelial...,6,0.712417
7,MED-332,public health impact dietary phosphorus excess...,6,0.727267
8,MED-334,differences total vitro digestible phosphorus ...,1,0.509174
9,MED-335,differences total vitro digestible phosphorus ...,0,0.817977


In [37]:
### get the Silhouette score of each topic model
### use silhouetteScore to choose the best topic number

def silhouetteScore(corpus,topicNumber, topic_clustering):
    corpus = corpus
    topicNumber = topicNumber
    silhouetteScore = 0
    topic_clustering =topic_clustering
    tfidf = models.TfidfModel(corpus)
    index = similarities.MatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[corpus]]
    simsList = []
    for el in sims:
        for l in el:
            simsList.append(l)
    distance = max(simsList)- sims
    for i in range(topicNumber):

        a = distance[topic_clustering[i],][:,topic_clustering[i]].sum()/len(topic_clustering[i]) #get submatrix through index index of doc in each topic clustering
        #print(a)
        b = (distance[topic_clustering[i],].sum() - a)/(sims.shape[1]-len(topic_clustering[i]))
        #print(b)
        s = (b-a)/max(b,a)
        silhouetteScore = silhouetteScore+s
    silhouetteScore = silhouetteScore/topicNumber
    print("when topic number =",topicNumber,"the silhouetteScore is:",+ silhouetteScore )

In [38]:
#test
#get the silhouetteScore of LDA model with topic number = 10, 
#print('for more info about silhouetteScore:')
#print('https://en.wikipedia.org/wiki/Silhouette_(clustering)')

#silhouetteScore(corpus, 10, topic_clustering)

for more info about silhouetteScore:
https://en.wikipedia.org/wiki/Silhouette_(clustering)
when topic number = 10 the silhouetteScore is: 0.10883925224930788


In [20]:
#test
# get the LDA model with topic number = 15
#corpus_15, doc_with_topic_15, topic_clustering_15, topic_vector_15, topic_15 =ldaTopicModel(train_doc_file.text, esw, 15)

In [21]:
#silhouetteScore(corpus_15,15, topic_clustering_15)

when topic number = 15 the silhouetteScore is: 0.07821657962068146


In [22]:
#test
# get the LDA model with topic number = 20
corpus_20, doc_with_topic_20, topic_clustering_20, topic_vector_20, topic_20 =ldaTopicModel(train_doc_file.text, esw, 20)

In [45]:
# print the average prob of topic assigning
print("average prob of topic assigning:", +doc_with_topic_20.topicProb.sum()/len(doc_with_topic_20.topicProb))

#print the result of topic assigning
doc_with_topic_20

average prob of topic assigning: 0.6335504678297676


Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,19,0.432394
1,MED-14,statin diagnosis breast cancer survival popula...,18,0.492215
2,MED-118,alkylphenols human milk relations dietary habi...,18,0.589658
3,MED-301,methylmercury potential environmental risk fac...,11,0.916745
4,MED-306,sensitivity continuous performance test cpt ag...,5,0.994099
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,15,0.422864
6,MED-330,dietary phosphorus acutely impairs endothelial...,13,0.592992
7,MED-332,public health impact dietary phosphorus excess...,7,0.323568
8,MED-334,differences total vitro digestible phosphorus ...,4,0.478460
9,MED-335,differences total vitro digestible phosphorus ...,14,0.597205


In [23]:
silhouetteScore(corpus_20, 20, topic_clustering_20)

when topic number = 20 the silhouetteScore is: 0.06491614696459048


In [39]:
#test
# get the LDA model with topic number = 5
corpus_5, doc_with_topic_5, topic_clustering_5, topic_vector_5, topic_5 =ldaTopicModel(train_doc_file.text, esw, 5)

In [46]:
# print the average prob of topic assigning
print("average prob of topic assigning:", + doc_with_topic_5.topicProb.sum()/len(doc_with_topic_5.topicProb))

#print the result of topic assigning
doc_with_topic_5

average prob of topic assigning: 0.7948872049979593


Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,0,0.775929
1,MED-14,statin diagnosis breast cancer survival popula...,0,0.994592
2,MED-118,alkylphenols human milk relations dietary habi...,0,0.532053
3,MED-301,methylmercury potential environmental risk fac...,1,0.635193
4,MED-306,sensitivity continuous performance test cpt ag...,0,0.424123
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,4,0.413330
6,MED-330,dietary phosphorus acutely impairs endothelial...,3,0.880159
7,MED-332,public health impact dietary phosphorus excess...,0,0.636550
8,MED-334,differences total vitro digestible phosphorus ...,0,0.994760
9,MED-335,differences total vitro digestible phosphorus ...,0,0.995205


In [40]:
#silhouetteScore(corpus_5, 5, topic_clustering_5)

when topic number = 5 the silhouetteScore is: 0.2061300455298063


In [26]:
#test
# get the LDA model with topic number = 40
#corpus_40, doc_with_topic_40, topic_clustering_40, topic_vector_40, topic_40 =ldaTopicModel(train_doc_file.text, esw, 40)

In [47]:
# print the average prob of topic assigning
#print("average prob of topic assigning:", +doc_with_topic_40.topicProb.sum()/len(doc_with_topic_40.topicProb))

#print the result of topic assigning
#doc_with_topic_40

average prob of topic assigning: 0.5809734776030281


Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,19,0.812428
1,MED-14,statin diagnosis breast cancer survival popula...,19,0.971744
2,MED-118,alkylphenols human milk relations dietary habi...,21,0.990534
3,MED-301,methylmercury potential environmental risk fac...,6,0.730653
4,MED-306,sensitivity continuous performance test cpt ag...,1,0.993944
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,21,0.858269
6,MED-330,dietary phosphorus acutely impairs endothelial...,14,0.644242
7,MED-332,public health impact dietary phosphorus excess...,19,0.531302
8,MED-334,differences total vitro digestible phosphorus ...,20,0.492610
9,MED-335,differences total vitro digestible phosphorus ...,18,0.358067


In [27]:
#silhouetteScore(corpus_40, 40, topic_clustering_40)

when topic number = 40 the silhouetteScore is: 0.049813424801762596


In [112]:
### get the Silhouette score of each topic model
### use silhouetteScore to choose the best topic number

def silhouetteScore(corpus,topicNumber, topic_clustering):
    corpus = corpus
    topicNumber = topicNumber
    topic_clustering =topic_clustering
    tfidf = models.TfidfModel(corpus)
    index = similarities.MatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[corpus]]
    dissims = 1- sims
    silhouetteScore = 0
    
    for i in range(topicNumber):
        
        x = dissims[topic_clustering[i],]
        s = 0
        for j in range(len(topic_clustering[i])):
            a = x[j, topic_clustering[i]].sum()/len(topic_clustering[i])
            b = (x.sum() - a)/ (sims.shape[1]-len(topic_clustering[i]))
            s = s + (b-a)/max(b,a) # calculatethe 
        s = s/len(topic_clustering[i])
        silhouetteScore = silhouetteScore+s
        
    silhouetteScore = silhouetteScore/topicNumber
    print("when topic number =",topicNumber,"the silhouetteScore is:",+ silhouetteScore )

In [113]:
#test
#get the silhouetteScore of LDA model with topic number = 10, 
#print('for more info about silhouetteScore:')
#print('https://en.wikipedia.org/wiki/Silhouette_(clustering)')

#silhouetteScore(corpus, 10, topic_clustering)

for more info about silhouetteScore:
https://en.wikipedia.org/wiki/Silhouette_(clustering)
when topic number = 10 the silhouetteScore is: 0.997082210127938


In [114]:
#silhouetteScore(corpus_15,15, topic_clustering_15)

when topic number = 15 the silhouetteScore is: 0.995763178366872


In [115]:
#silhouetteScore(corpus_20, 20, topic_clustering_20)

when topic number = 20 the silhouetteScore is: 0.9930509457756216


In [116]:
#silhouetteScore(corpus_5, 5, topic_clustering_5)

when topic number = 5 the silhouetteScore is: 0.9988246192468961


In [117]:
#test
# get the LDA model with topic number = 1
#corpus_1, doc_with_topic_1, topic_clustering_1, topic_vector_1, topic_1 =ldaTopicModel(train_doc_file.text, esw, 1)

In [118]:
#silhouetteScore(corpus_1,1, topic_clustering_1)



when topic number = 1 the silhouetteScore is: nan


In [48]:
#test
# get the LDA model with topic number = 2
#corpus_2, doc_with_topic_2, topic_clustering_2, topic_vector_2, topic_2 =ldaTopicModel(train_doc_file.text, esw, 2)

In [49]:
# print the average prob of topic assigning
print("average prob of topic assigning:", +doc_with_topic_2.topicProb.sum()/len(doc_with_topic_2.topicProb))

#print the result of topic assigning
#doc_with_topic_2

average prob of topic assigning: 0.8974943644514644


Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,0,0.890474
1,MED-14,statin diagnosis breast cancer survival popula...,0,0.666391
2,MED-118,alkylphenols human milk relations dietary habi...,1,0.854312
3,MED-301,methylmercury potential environmental risk fac...,0,0.621593
4,MED-306,sensitivity continuous performance test cpt ag...,1,0.527955
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,0,0.978262
6,MED-330,dietary phosphorus acutely impairs endothelial...,0,0.533707
7,MED-332,public health impact dietary phosphorus excess...,0,0.690321
8,MED-334,differences total vitro digestible phosphorus ...,1,0.600596
9,MED-335,differences total vitro digestible phosphorus ...,1,0.538139


In [120]:
#silhouetteScore(corpus_2,2, topic_clustering_2)

when topic number = 2 the silhouetteScore is: 0.9997237511502979
