# CS5293 Spring 2020 Project 2
## By Abilash Ramesh/

## Loading packages

In [123]:
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim import corpora, models
from sklearn.metrics import silhouette_score
import networkx
import random
import re
import pandas as pd
import numpy as np
import json
import glob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer 
from sklearn.metrics import silhouette_score

## Function to read files

In [124]:
def readfiles(path, n):
    
    filenames=glob.glob(path) ## Get filepaths
    filen=len(filenames) 
    number= random.randint(0,filen) ##Random file index
    percent = (n)/100
    reqf=(filen) * percent
    end=number+reqf
    print("The files from index %d to %d have been taken" %(number , end))
    print(int(reqf))
    taken=filenames[int(number):int(end)]
    return taken

## Function to normalize text 

In [179]:
def normalize(text):
    text = text.lower() ## Lowercase text
    sents=(nltk.sent_tokenize(text)) ##Sentence tokenization
    words=[]
    for sent in sents:
        sent.strip()
        words.extend(nltk.word_tokenize(sent)) ## Word tokenization
    stopword_list = nltk.corpus.stopwords.words('english')
    custom_stop_words = [
        'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
        'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
        'al.', 'Elsevier', 'PMC', 'CZI', 'www' 
    ]
    filtered_tokens = [token for token in words if token not in custom_stop_words]
    filtered_tokens1 = [token for token in filtered_tokens if token not in custom_stop_words] ##Stop word removal
    txt = ' '.join(filtered_tokens1)
    
    return txt

## Function to create Dataframe using files list

In [125]:
def createDB(filepath):
    dict_ = {'paper_id': [], 'abstract': [], 'body_text': []} ## Initializing directories
    for j in range(len(filepath)):
        with open(filepath[j]) as f: ##json text extraction
                data=json.load(f)
                paper_id = data['paper_id']
                abstract = []
                body_text = []
                for entry in data['abstract']:
                    abstract.append(entry['text'])             
                for entry in data['body_text']:
                    body_text.append(entry['text'])
                    
                abstract = '\n'.join(abstract)
                body_text = '\n'.join(body_text)

                dict_['paper_id'].append(paper_id)
                if len(abstract) == 0: 
            # if no abstract is provided
                    dict_['abstract'].append("Not provided.") ##
                else:
            # abstract is provided
                    dict_['abstract'].append(abstract)
               # dict_['abstract'].append(abstract)
                dict_['body_text'].append(body_text)
                
                
    df = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text'])
    df['abstract'] = df['abstract'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x)) ## Remove special charecters
    df['abstract'] = df['abstract'].apply(lambda x: normalize(x))
    return df

## Function to perform TextRank summarization

In [169]:
def cleansed( Unfinished, txt, n):
    sent_tokens=nltk.sent_tokenize(txt)
    unfin = nltk.sent_tokenize(Unfinished)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2**12, smooth_idf=True, use_idf=True, ngram_range=(2,4))
    docu=vectorizer.fit_transform(sent_tokens)
    sim_mat= docu*docu.T
    sim_graph= networkx.from_scipy_sparse_matrix(sim_mat)
    scores = networkx.pagerank(sim_graph)
    ranked_sentences = sorted(((score, index)
                            for index, score in scores.items()), reverse=True)
    top_sentence_indices = [ranked_sentences[index][1] for index in range(0,n)]
    top_sentence_indices.sort()
    top_sentences = [unfin[index] for index in top_sentence_indices]
    summary =''.join(top_sentences)
    return summary

## Function to write summary output

In [192]:
def outputfiles(dataframe):
    for i in range(len(dataframe)):
        j=i+1
        filename = ('output_%d.md'%(j))
        with open(filename, 'w') as f:
            f.write('This is the output for cluster #%d\n\n'%(j))
            for text in dataframe['summary'][i]:
                f.write(text)

### Data collection

In [193]:
filenames = readfiles('json files/*.json', 20)
df = createDB(filenames)

The files from index 187 to 248 have been taken
61


### Text vectorization

In [194]:
vectorizer=TfidfVectorizer(stop_words='english', max_features=2**12, smooth_idf=True, use_idf=True, ngram_range=(2,4))
docu=vectorizer.fit_transform(df['abstract'].values)


### Clustering using KMeans

In [195]:
cluster=np.sqrt(int(len(filenames))/2)
print(cluster)
kmeans = MiniBatchKMeans(n_clusters=int(cluster),max_iter=5000, init='random')
preds = kmeans.fit_predict(docu)

5.522680508593631


### Dataframe with clustered text

In [196]:
df['cluster']=preds
df1 = df.groupby('cluster')['body_text'].apply(list).reset_index(name='text')
df1['text'] = df1['text'].apply(lambda x:  ' '.join(map(str, x)) )

### Normalization of text

In [197]:
df1['Normalized_text'] = 0

for j in range(len(df1)):
    df1['Normalized_text'][j]=normalize(df1['text'][j])
    
df1['summary'] = 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


### Summary creation

In [198]:
for k in range(len(df1)):
    print(k)
    df1['summary'][k]= cleansed(df1['text'][k], df1['Normalized_text'][k], 15)

0
1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


3
4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### File output

In [199]:
outputfiles(df1)