In [1]:
import gensim
from gensim import models, corpora, similarities
from collections import defaultdict



In [2]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from __future__ import division
import matplotlib

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [24]:
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")
from string import punctuation

#remove "num", because "num" has the highest term frequency(45538) at the orinal file, 
#the second most frequent term only has a freq. of 3750
esw = esw + ['abstract', 'ci', 'hr','l','pubmed', 'num'] 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\D070678\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:

####read file

#read train.docs file
train_doc_file = pd.read_csv("train.docs", encoding = 'utf-8', sep='\t', header=None)
train_doc_file.columns = ['id', 'text']

#read train.nontopic-titles.queries file
train_query_file = pd.read_csv("train.nontopic-titles.queries", encoding = 'utf-8', sep='\t', header=None)
train_query_file.columns = ['id', 'text']



### preprocessing

train_doc_file["text"] = train_doc_file['text'].str.replace('/', ' or ')

train_doc_file["text"] = train_doc_file['text'].str.replace('-', ' and ')

train_query_file["text"] = train_query_file['text'].str.replace('/', ' or ')

train_query_file["text"] = train_query_file['text'].str.replace('-', ' and ')


# inout file: train_doc_file["text"]/ train_query_file["text"]

def removeStopwords(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.lower().split() if word not in esw])
        file[i] = el
        i += 1
def removePunctuation(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.lower().split() if word not in punctuation])
        file[i] = el
        i += 1

def stemming(file):
    ps = PorterStemmer()
    i = 0
    for el in file:
        el = ' '.join([ps.stem(word) for word in el.lower().split() ])
        file[i] = el
        i += 1
        


In [38]:
### remove stopwords
removeStopwords(train_doc_file['text'], esw)
removeStopwords(train_query_file['text'], esw)

### stemming
#stemming(train_doc_file['text'])
#stemming(train_query_file['text'])

### remove punctuation
removePunctuation(train_doc_file['text'])
removePunctuation(train_query_file['text'])

In [41]:
### create DTM

 ## get DTM, weighted by tfidf
def get_DTM_tfidf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    return X_train_tfidf


## get DTM, weighted by tfidf
def get_DTM_tf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_tf


In [42]:
### create tfidf weighted DTM for the train.docs file
train_tfidf = get_DTM_tfidf(train_doc_file.text)
train_tfidf

<3612x23500 sparse matrix of type '<class 'numpy.float64'>'
	with 311289 stored elements in Compressed Sparse Row format>

In [68]:
###Generate the query vector
def get_QueryVector(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
    query_vect = query_vect.fit_transform(queryFile)
    return query_vect

In [69]:
### create query vector matrix for the train.nontopic-titles.queries file 
query_vect = get_QueryVector(train_query_file.text, train_doc_file.text)
query_vect

<1141x23500 sparse matrix of type '<class 'numpy.int64'>'
	with 99440 stored elements in Compressed Sparse Row format>

In [59]:
#### LDA


def ldaTopicModel(documents, stopwords, topicNumber):

    esw = stopwords
    documents = documents
    topicNumber = topicNumber


    ##### preprocessing
    texts = [[word for word in document.lower().split() if word not in esw] for document in documents]

    frequency = defaultdict(int)
    for text in texts:
         for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
              for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    #### LDA model with topic number = 10
    lda = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=topicNumber)

    #### get topic-term-matrix, 
    topic_term_matrix = lda.get_topics()

    #### assign topic back to eack document
    doc_topicID = list()
    doc_topicProb = list()
    for i in range(len(corpus)):
        x = lda.get_document_topics(corpus[i])
        prob = list()
        for i in range(len(x)):
            prob.append(x[i][1])        
        for i in range(len(x)):
            if x[i][1] == max(prob):
                doc_topicID.append(x[i][0])
                doc_topicProb.append(x[i][1])


    doc_topicID = pd.DataFrame(doc_topicID) 
    doc_topicID.columns = ['topicID']
    doc_topicProb = pd.DataFrame(doc_topicProb) 
    doc_topicProb.columns = ['topicProb']
    doc_with_topic = pd.concat([train_doc_file.reset_index(drop=True), doc_topicID], axis=1)
    doc_with_topic = pd.concat([doc_with_topic.reset_index(drop=True), doc_topicProb], axis=1)       

    #get topic-term-matrix, each topic is represented by a vector 
    topic_term_matrix = lda.get_topics()
    
    #get topic representation
    topic = lda.print_topics(topicNumber)
    
    # initiate topic clustering
    topic_clustering = []
    for t in range(topicNumber):
        topic_clustering.append([])
    
    # get the index of doc in each topic clustering
    for t in range(topicNumber):
        i = -1    
        for el in list(doc_with_topic.topicID):
            i += 1
            if el == t:
                topic_clustering[t].append(i)
    
    return(corpus, doc_with_topic, topic_clustering, topic_term_matrix,topic )




In [60]:
#test
# get the LDA model with topic number = 10
corpus, doc_with_topic, topic_clustering, topic_vector, topic =ldaTopicModel(train_doc_file.text, esw, 10)

In [61]:
# show topics, topics is represented by 10 most likely terms
topic

[(0,
  '0.010*"food" + 0.007*"ncbi" + 0.006*"diet" + 0.005*"results" + 0.005*"patients" + 0.005*"dietary" + 0.005*"meat" + 0.004*"effects" + 0.004*"effect" + 0.004*"study"'),
 (1,
  '0.009*"diet" + 0.009*"levels" + 0.008*"cancer" + 0.008*"ncbi" + 0.006*"serum" + 0.005*"patients" + 0.005*"study" + 0.004*"vitamin" + 0.004*"human" + 0.004*"disease"'),
 (2,
  '0.008*"patients" + 0.007*"ncbi" + 0.006*"cells" + 0.006*"subjects" + 0.005*"dietary" + 0.005*"group" + 0.005*"intake" + 0.005*"cell" + 0.005*"cholesterol" + 0.005*"disease"'),
 (3,
  '0.013*"intake" + 0.009*"risk" + 0.009*"study" + 0.008*"dietary" + 0.007*"consumption" + 0.006*"fish" + 0.006*"cancer" + 0.006*"diet" + 0.005*"effects" + 0.005*"results"'),
 (4,
  '0.023*"cancer" + 0.016*"breast" + 0.015*"women" + 0.014*"risk" + 0.009*"study" + 0.007*"intake" + 0.007*"years" + 0.006*"consumption" + 0.006*"results" + 0.004*"ncbi"'),
 (5,
  '0.007*"patients" + 0.007*"intake" + 0.006*"risk" + 0.006*"food" + 0.006*"study" + 0.006*"ncbi" + 0.

In [58]:
# get the subset of docs through index, an example, an example for getting the docs vectors of topic 0
docs_with_topic1 = train_tfidf[topic_clustering[0],]
docs_with_topic1

<306x23500 sparse matrix of type '<class 'numpy.float64'>'
	with 26773 stored elements in Compressed Sparse Row format>

In [48]:
# print the average prob of topic assigning
print("average prob of topic assigning:", +doc_with_topic.topicProb.sum()/len(doc_with_topic.topicProb))

#print the result of topic assigning
doc_with_topic


average prob of topic assigning: 0.6811523292531338


Unnamed: 0,id,text,topicID,topicProb
0,MED-10,statin breast cancer survival nationwide cohor...,5,0.993570
1,MED-14,statin diagnosis breast cancer survival popula...,5,0.993704
2,MED-118,alkylphenols human milk relations dietary habi...,7,0.721706
3,MED-301,methylmercury potential environmental risk fac...,8,0.748316
4,MED-306,sensitivity continuous performance test cpt ag...,8,0.994338
5,MED-329,phosphate vascular toxin ncbi elevated phospha...,0,0.736088
6,MED-330,dietary phosphorus acutely impairs endothelial...,5,0.598827
7,MED-332,public health impact dietary phosphorus excess...,5,0.450586
8,MED-334,differences total vitro digestible phosphorus ...,4,0.696269
9,MED-335,differences total vitro digestible phosphorus ...,6,0.413519


In [66]:
### get the Silhouette score of each topic model
### use silhouetteScore to choose the best topic number

def silhouetteScore(corpus,topicNumber, topic_clustering):
    corpus = corpus
    topicNumber = topicNumber
    silhouetteScore = 0
    topic_clustering =topic_clustering
    tfidf = models.TfidfModel(corpus)
    index = similarities.MatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[corpus]]
    distance = 1-sims
    for i in range(topicNumber):

        a = distance[topic_clustering[i],][:,topic_clustering[i]].sum() #get submatrix through index index of doc in each topic clustering
        #print(a)
        b = distance[topic_clustering[i],].sum() - a
        #print(b)
        s = (b-a)/max(b,a)
        silhouetteScore = silhouetteScore+s
    silhouetteScore = silhouetteScore/topicNumber
    print("when topic number =",topicNumber,"the silhouetteScore is:",+ silhouetteScore )

In [67]:
#test
#get the silhouetteScore of LDA model with topic number = 10, 
print('for more info about silhouetteScore:')
print('https://en.wikipedia.org/wiki/Silhouette_(clustering)')

silhouetteScore(corpus,10, topic_clustering)

for more info about silhouetteScore:
https://en.wikipedia.org/wiki/Silhouette_(clustering)
when topic number = 10 the silhouetteScore is: 0.8891772449016571
