# A Notebook for Topic Modelling (LDA), TF-IDF Calculation, Sentence Ranking and Extractive Summarization

## Name: Raeed Asif
## Class: BSCS 6B
## Reg# 199323

In [None]:
#run these to import necessary packages
import pandas as pd
import numpy as np
import re
import spacy
import math
import statistics 

from statistics import mean, median, mode, stdev


# word cloud
from wordcloud import WordCloud, STOPWORDS

# Gensim tools
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk tools
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim

# matplot lib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline 

## Reading csv

In [None]:
# Load data/biorxiv_clean.csv into a DataFrame. Later cells read its
# "paper_id", "abstract" and "text" columns.
biorxiv_clean = pd.read_csv("data/biorxiv_clean.csv")
# Preview the first 10 rows.
biorxiv_clean.head(10)

In [None]:
# cord-19-solution-toolbox: https://www.kaggle.com/gpreda/cord-19-solution-toolbox
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render a word cloud for ``data`` and display it with matplotlib.

    Args:
        data: Text to visualise. A string is used as is, a pandas Series has
            its non-null entries joined with spaces, and any other object is
            converted with ``str``.
        title: Optional title drawn above the cloud.
    """
    if isinstance(data, str):
        text = data
    elif isinstance(data, pd.Series):
        # str(series) is only the truncated display repr (elided rows,
        # clipped cells, index labels and a dtype footer), so the cloud would
        # not reflect the actual column content. Join the real values.
        text = " ".join(data.dropna().astype(str))
    else:
        text = str(data)

    # Use STOPWORDS directly instead of the global ``stopwords``: that name is
    # rebound by ``from nltk.corpus import stopwords`` further down, which
    # would break this function if the cell is re-run afterwards.
    wordcloud = WordCloud(background_color='white',
                          stopwords=set(STOPWORDS),
                          max_words=1000,
                          max_font_size=40,
                          scale=5,
                          random_state=1).generate(text)

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')

    if title:
        fig.suptitle(title, fontsize=14)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    # The original referenced ``plt.show`` without calling it.
    plt.show()

## Bag-of-words model

In [None]:
# Word cloud built from the "abstract" column.
show_wordcloud(biorxiv_clean["abstract"], title="Wordcloud of abstracts")

In [None]:
# Word cloud built from the "text" column.
show_wordcloud(biorxiv_clean["text"], title="Wordcloud of text")

In [None]:
#sentences = [text.lower().split() for text in biorxiv_clean["text"]]
#print(sentences[0])

# Text to list: drop the rows whose "text" is missing and keep the remaining
# values as a plain Python list (one entry per remaining row).
# NOTE: `df` is rebound from the DataFrame to the "text" Series here, and is
# reassigned again later when the TF-IDF table is built.
df = biorxiv_clean
df = df.text.dropna()
data = df.tolist()

def sent_to_words(sentences):
    """Lazily yield one token list per entry of ``sentences``.

    Each entry is converted with ``str`` and tokenised by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(entry), deacc=True)
        for entry in sentences
    )


# Materialise the generator: data_words[k] is the token list for data[k].
data_words = list(sent_to_words(data))     

## Bigram objects

In [None]:
# Fit a bigram (phrase) model on the tokenised papers, then wrap it in a
# Phraser, which is the object the bigrams() helper indexes with a token list.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=20)
bigram_mod = gensim.models.phrases.Phraser(bigram)

## Preprocessing, bigrams, lemmatization and required parameters for LDA

In [None]:
# NOTE: this import rebinds the global name `stopwords`, which the word-cloud
# cell earlier set to set(STOPWORDS) from the wordcloud package.
from nltk.corpus import stopwords
# NLTK's English stop words, extended with boilerplate tokens (including
# bigram-joined ones such as 'https_doi') that should not reach the model.
stop_words = stopwords.words('english')
stop_words.extend(['preprint','copyright','medrxiv','https_doi','get','copyright_holder','peer','reviewed','https','org','rights_reserved', ''])

def remove_stopwords(texts):
    """Tokenise every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str`` and
            tokenised with ``simple_preprocess``, so both raw strings and
            token lists are accepted.

    Returns:
        A list with one list of kept tokens per input document.
    """
    # ``stop_words`` is a list; snapshot it into a set once so the membership
    # test is O(1) per token instead of a scan of the whole list.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def bigrams(texts):
    """Apply the global ``bigram_mod`` phraser to each token list in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

# Cache for the loaded spaCy pipeline. lemmatization() is called once per
# paper further down, and reloading the model on each call is very slow.
_SPACY_PIPELINES = {}


def _load_spacy_pipeline():
    """Return the English spaCy pipeline, loading it on first use only."""
    if 'en' not in _SPACY_PIPELINES:
        try:
            nlp = spacy.load('en', disable=['parser', 'ner'])
        except OSError:
            # spaCy 3 no longer resolves the 'en' shortcut and raises OSError;
            # fall back to the full package name of the small English model.
            nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        _SPACY_PIPELINES['en'] = nlp
    return _SPACY_PIPELINES['en']


def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise token lists, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of token lists. Each list is joined with spaces and
            run through the spaCy pipeline (parser and NER disabled).
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to keep.
            The default is a tuple rather than a list so that it cannot be
            mutated between calls.

    Returns:
        A list with one list of lemmas per input token list, in input order.
    """
    allowed = frozenset(allowed_postags)
    texts_out = []
    nlp = None
    for sent in texts:
        if nlp is None:
            # Loaded lazily so that an empty input never touches the model.
            nlp = _load_spacy_pipeline()
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed])
    return texts_out

def get_corpus(data_list):
    """Turn tokenised documents into the inputs needed for LDA.

    The documents go through stop-word removal, bigram merging and
    lemmatisation. A gensim dictionary is built from the result, filtered
    with ``no_below=10`` and ``no_above=0.35`` and compacted, and every
    document is converted to its bag-of-words form.

    Returns:
        ``(corpus, id2word, bigram, lemmatised_documents)``, where ``bigram``
        is the global phrase model.
    """
    lemmatized_docs = lemmatization(bigrams(remove_stopwords(data_list)))

    dictionary = gensim.corpora.Dictionary(lemmatized_docs)
    dictionary.filter_extremes(no_below=10, no_above=0.35)
    dictionary.compactify()

    bow_corpus = [dictionary.doc2bow(tokens) for tokens in lemmatized_docs]
    return bow_corpus, dictionary, bigram, lemmatized_docs

In [None]:
# Build the bag-of-words corpus, the dictionary and the lemmatised documents
# used by the LDA cells below. `bigram` is the same phrase model passed back.
corpus, id2word, bigram, data_lematized = get_corpus(data_words)

In [None]:
# Readable view of the first document's bag of words: (token, count) pairs.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

## LDA Model

In [None]:
# Number of topics to fit, and a fixed seed passed to the model as
# random_state.
num_topics = 5
random_state=100

# Fit the LDA model on the bag-of-words corpus built by get_corpus().
# per_word_topics=True is requested in addition to the document-level topics.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=random_state,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True
                                        )

In [None]:
#print(lda_model.print_topics())
# Apply the trained model to the corpus it was fitted on.
doc_lda = lda_model[corpus]

In [None]:
# NOTE(review): per the gensim documentation, log_perplexity() returns the
# per-word likelihood bound, not the perplexity itself
# (perplexity = 2 ** (-bound)). A higher bound, i.e. closer to zero, means a
# better fit, so the printed value is not "lower is better".
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Topic coherence ('c_v' measure) computed on the lemmatised documents.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lematized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)

## LDA Distance Map

In [None]:
%%time
# Build the interactive pyLDAvis view of the fitted model. The bare `vis`
# on the last line makes the notebook render it.
# NOTE(review): newer pyLDAvis releases are reported to expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
# Save the visualisation as a standalone HTML file.
# NOTE(review): the file name says "4topics" while num_topics is 5 above.
pyLDAvis.save_html(vis, './lda4topics_v2.html')

In [None]:
 # Print the top words of the topic with index 2, with their weights.
 print(lda_model.show_topic(2)) #5

## Filtering papers and extracting text: ["coronavirus","tissue","immune","disease","gene","drug"] are words that fall under the same topic in the LDA model above

In [None]:
# Filter papers containing all words in list
def filter_papers_word_list(word_list, df=None):
    """Return the ``paper_id`` of every paper whose text contains all words.

    Matching is a case-sensitive substring test against the "text" column.

    Args:
        word_list: Words that must all occur in a paper's text.
        df: DataFrame with "paper_id" and "text" columns. Defaults to the
            global ``biorxiv_clean`` frame, as before.

    Returns:
        List of matching paper ids, in row order. Rows whose text is not a
        string (e.g. NaN for missing text) are never matched; previously they
        raised ``TypeError`` on the ``in`` test.
    """
    if df is None:
        df = biorxiv_clean
    words = list(word_list)
    return [
        paper_id
        for paper_id, text in zip(df["paper_id"], df["text"])
        if isinstance(text, str) and all(word in text for word in words)
    ]

# Raise the column display width so long text cells are not truncated when
# they are displayed.
pd.set_option("display.max_colwidth", 100000)
# Ids of the papers whose text contains all six keywords.
biorxiv_environment = filter_papers_word_list(["coronavirus","tissue","immune","disease","gene","drug"])
# NOTE(review): the label mentions only "coronavirus", but the count is of
# papers containing all six keywords above.
print("Papers containing coronavirus: ", len(biorxiv_environment))

In [None]:
def extract_conclusion(df, papers_id_list):
    """Select papers by id and add a ``conclusion`` column.

    Args:
        df: DataFrame with "paper_id" and "text" columns.
        papers_id_list: Ids of the papers to keep.

    Returns:
        A new DataFrame holding the selected rows plus a "conclusion" column.
        The conclusion is the text that follows the first
        ``"\\nConclusion\\n"`` heading (up to the next such heading, if any),
        or ``"No Conclusion section"`` when the heading or the text is
        missing. ``df`` itself is not modified.
    """
    marker = "\nConclusion\n"
    # Copy the selection: assigning a column to a slice of ``df`` triggers
    # pandas' SettingWithCopyWarning and may not behave as intended.
    data = df.loc[df['paper_id'].isin(papers_id_list)].copy()

    conclusion = []
    for paper_text in data['text']:
        # Missing text is NaN (a float); ``in`` on it would raise TypeError.
        if isinstance(paper_text, str) and marker in paper_text:
            conclusion.append(paper_text.split(marker)[1])
        else:
            conclusion.append("No Conclusion section")
    data['conclusion'] = conclusion

    return data

# Restore every pandas "display.*" option to its default value.
pd.reset_option('^display.', silent=True)

## Preprocessing before computing TF-IDFs

In [None]:
def split(text):
    """Return ``text`` with every newline replaced by a full stop."""
    pieces = text.split('\n')
    return '.'.join(pieces)

def remove_stopwords_2(texts):
    """Tokenise a single piece of text and drop tokens found in ``stop_words``.

    Unlike ``remove_stopwords`` this handles one text at a time and returns a
    flat list of tokens.
    """
    kept = []
    for token in simple_preprocess(str(texts)):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Papers that matched the keyword filter, with a "conclusion" column added.
environ_trans_conclusion = extract_conclusion(biorxiv_clean, biorxiv_environment)

# Treat every newline as a sentence boundary, then cut each paper on ".".
# list_new_1[d][s] is the raw text of sentence s of paper d.
list_old = [split(x) for x in environ_trans_conclusion["text"]]
list_new_1 = [x.split(".") for x in list_old]

# lematize_list[d][s] holds the lemmas kept for sentence s of paper d.
# lemmatization() returns one entry per input sentence, so the indices stay
# aligned with list_new_1. (The unused temporaries and the no-op expression
# statement of the earlier version were dropped.)
lematize_list = [
    lemmatization([remove_stopwords_2(line) for line in doc])
    for doc in list_new_1
]



In [None]:
from collections import Counter

# Whole-paper preprocessing: one lemma list per selected paper.
list_new = list(environ_trans_conclusion["text"])
list_new_nostops = remove_stopwords(list_new)
list_new_lematized = lemmatization(list_new_nostops)

# numOfWords[idx] maps every lemma of paper idx to its occurrence count and
# additionally stores the paper index under the reserved key "doc_id".
numOfWords = {}
for idx, lis in enumerate(list_new_lematized):
    word_counts = dict(Counter(lis))
    # Set last, so that a literal "doc_id" token can never be counted into it.
    word_counts["doc_id"] = idx
    numOfWords[idx] = word_counts

In [None]:
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Each count is divided by the number of tokens in ``bagOfWords``. The
    reserved ``'doc_id'`` entry is copied through unchanged.
    """
    total = float(len(bagOfWords))
    return {
        word: (count if word == 'doc_id' else count / total)
        for word, count in wordDict.items()
    }

def computeIDF(dictn):
    """Return the per-word weight this notebook uses as "IDF" for one document.

    Args:
        dictn: Word-count dict of a single document. It may carry the reserved
            ``'doc_id'`` key, which is copied through unchanged.

    Returns:
        Dict with the same keys. Every word with a positive count gets
        ``log(N)``, where ``N = len(dictn)`` (the ``'doc_id'`` key included).
        Words with a count of zero get ``0.0``; they used to raise
        ``ZeroDivisionError``.

    NOTE(review): only one document is visible here, so the "document
    frequency" of every present word is 1 and the result is the same constant
    for all words. It is not an inverse document frequency across papers.
    """
    N = len(dictn)
    idfDict = {}
    for word, val in dictn.items():
        if word == 'doc_id':
            idfDict[word] = val
        elif val > 0:
            # Document frequency is 1, hence log(N / 1).
            idfDict[word] = math.log(float(N))
        else:
            idfDict[word] = 0.0
    return idfDict

def computeTFIDF(tfBagOfWords, idfs):
    """Multiply each term frequency by the matching entry of ``idfs``.

    The reserved ``'doc_id'`` entry is copied through unchanged.
    """
    return {
        word: (tf_value if word == 'doc_id' else tf_value * idfs[word])
        for word, tf_value in tfBagOfWords.items()
    }

## TF-IDF Data frame

In [None]:
n_docs = len(numOfWords)

# Document frequency: in how many papers each word occurs. computeIDF() is
# handed a single paper at a time, so it cannot count this and yields the same
# constant for every word of a paper. The IDF is therefore computed here,
# across all papers.
doc_freq = {}
for counts in numOfWords.values():
    for word in counts:
        if word != 'doc_id':
            doc_freq[word] = doc_freq.get(word, 0) + 1

tf = [computeTF(numOfWords[idx], list_new_lematized[idx]) for idx in range(n_docs)]
# One IDF dict per paper, with the same keys as that paper's count dict.
# idf(word) = log(number of papers / number of papers containing the word).
idfs = [
    {
        word: (val if word == 'doc_id' else math.log(n_docs / float(doc_freq[word])))
        for word, val in numOfWords[idx].items()
    }
    for idx in range(n_docs)
]
tfidf = [computeTFIDF(tf_doc, idf_doc) for tf_doc, idf_doc in zip(tf, idfs)]

# One row per paper, one column per word (plus "doc_id").
df = pd.DataFrame(tfidf)
df

In [None]:
# tfidf_list[d][s] maps each lemma of sentence s of paper d to the TF-IDF
# score that lemma has in paper d, plus the paper index under 'doc_id'.
# Sentences without any lemma keep an empty dict.
tfidf_list = []
for doc_idx, doc in enumerate(lematize_list):
    # Scores of the paper this sentence belongs to. The earlier version
    # indexed tfidf with a stale `idx` left over from another cell and stored
    # an undefined `i` as the document id.
    doc_scores = tfidf[doc_idx]
    sentence_scores = []
    for sen in doc:
        dic_temp = {word: doc_scores[word] for word in sen if word in doc_scores}
        if sen:
            dic_temp['doc_id'] = doc_idx
        sentence_scores.append(dic_temp)
    tfidf_list.append(sentence_scores)

In [None]:
def make_sentence(doc,i,idx):
    """Collapse the per-word scores of one sentence into a single record.

    Args:
        doc: Word -> score dict of the sentence, with a 'doc_id' entry.
        i: Index of the paper in ``list_new_1``.
        idx: Index of the sentence within that paper.

    Returns:
        ``{}`` for an empty ``doc``; otherwise a dict with the paper id
        ('doc_id'), the sum of the word scores ('tfid_score') and the raw
        sentence text taken from ``list_new_1`` ('key').
    """
    if not doc:
        return {}

    score = 0
    for name in doc:
        if name != 'doc_id':
            score += doc[name]

    return {
        'doc_id': doc['doc_id'],
        'tfid_score': score,
        'key': "".join(list_new_1[i][idx]),
    }


In [None]:
# tfidf_score[d][s] is the make_sentence() record of sentence s of paper d.
tfidf_score = [
    [make_sentence(dic, i, idx) for idx, dic in enumerate(doc)]
    for i, doc in enumerate(tfidf_list)
]

## TF-IDF scores of the sentences of the 4th paper in the list

In [None]:
# Sentence records of the paper at index 3 (the 4th paper).
print(tfidf_score[3])

In [None]:
def get_summary(doc_list):
    """Build an extractive summary from the scored sentences of one paper.

    A sentence is selected when its score is at least
    ``mean + stdev + mean``. The mean divides the total score by the number
    of entries in ``doc_list`` (empty records included), and the sample
    standard deviation is taken over the non-empty records only.

    Args:
        doc_list: Sentence records with 'tfid_score' and 'key' entries.
            Empty dicts stand for sentences without any scored word and are
            skipped.

    Returns:
        ``(summary, count)``: the selected sentences joined with ".", and how
        many were selected. An empty ``doc_list`` gives ``('', 0)``.
    """
    if not doc_list:
        # Previously a ZeroDivisionError.
        return '', 0

    scored = [record for record in doc_list if record]
    scores = [record['tfid_score'] for record in scored]

    avg = sum(scores) / len(doc_list)
    # statistics.stdev() needs at least two data points; with fewer there is
    # no spread to measure, so use 0 instead of raising StatisticsError.
    spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
    thres = avg + spread + avg

    selected = [record['key'] for record in scored if record['tfid_score'] >= thres]
    return '.'.join(selected), len(selected)

## Extractive Summary for related keyword papers

In [None]:
# summ[d] is the (summary text, selected sentence count) pair of paper d.
summ = [get_summary(doc) for doc in tfidf_score]

## Extractive summary of 4th paper in the list

In [None]:
# summ[3] is the (summary text, selected sentence count) tuple returned by
# get_summary() for the paper at index 3.
print(summ[3])

## Number of sentences in the extractive summary of the 4th paper in the list

In [None]:
# summ[3] is a (summary text, sentence count) tuple, so len(summ[3]) is
# always 2. The number of selected sentences is the tuple's second element.
print("sentences:", summ[3][1])

## Number of sentences in the original text of the 4th paper in the list

In [None]:
# Number of "."-separated segments of the paper at index 3. Empty segments
# (e.g. between consecutive dots) are counted too.
print("sentences:",len(list_new_1[3]))