In [106]:
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim import corpora, models
from sklearn.metrics import silhouette_score
import networkx
import random
import re
import pandas as pd
import numpy as np
import json
import glob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer 
from sklearn.metrics import silhouette_score

In [107]:
def readfiles(path, n):
    filenames=glob.glob(path)
    filen=len(filenames)
    number= random.randint(0,filen)
    percent = (n)/100
    reqf=(filen)* percent
    end=number+reqf
    print("The files from index %d to %d have been taken" %(number , end))
    print(int(reqf))
    taken=filenames[int(number):int(end)]
    return taken

In [108]:
def createDB(filepath):
    dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}
    for j in range(len(filepath)):
        with open(filepath[j]) as f:
                data=json.load(f)
                paper_id = data['paper_id']
                abstract = []
                body_text = []
                for entry in data['abstract']:
                    abstract.append(entry['text'])             
                for entry in data['body_text']:
                    body_text.append(entry['text'])
                    
                abstract = '\n'.join(abstract)
                body_text = '\n'.join(body_text)

                dict_['paper_id'].append(paper_id)
                if len(abstract) == 0: 
            # no abstract provided
                    dict_['abstract'].append("Not provided.")
                else:
    # abstract is short enough
                    dict_['abstract'].append(abstract)
               # dict_['abstract'].append(abstract)
                dict_['body_text'].append(body_text)
                
                
    df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text'])
    return df_covid

In [109]:
def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

In [110]:
def normalize(text):
    sents=(nltk.sent_tokenize(text))
    words=[]
    for sent in sents:
        sent.strip()
        words.extend(nltk.word_tokenize(sent))
    stopword_list = nltk.corpus.stopwords.words('english')
    custom_stop_words = [
        'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
        'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
        'al.', 'Elsevier', 'PMC', 'CZI', 'www' 
    ]
    filtered_tokens = [token for token in words if token not in custom_stop_words]
    filtered_tokens1 = [token for token in filtered_tokens if token not in custom_stop_words]
    txt = ' '.join(filtered_tokens1)
    
    return txt

In [111]:
def cleansed(txt, n):
    sent_tokens=nltk.sent_tokenize(txt)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2**12, smooth_idf=True, use_idf=True, ngram_range=(2,4))
    docu=vectorizer.fit_transform(sent_tokens)
    sim_mat= docu*docu.T
    sim_graph= networkx.from_scipy_sparse_matrix(sim_mat)
    scores = networkx.pagerank(sim_graph)
    ranked_sentences = sorted(((score, index)
                            for index, score in scores.items()), reverse=True)
    top_sentence_indices = [ranked_sentences[index][1] for index in range(0,n)]
    top_sentence_indices.sort()
    top_sentences = [sent_tokens[index] for index in top_sentence_indices]
    summary =''.join(top_sentences)
    return summary

In [112]:
def outputfiles(dataframe):
    for i in range(len(dataframe)):
        j=i+1
        filename = ('output%d.md'%(j))
        with open(filename, 'w') as f:
            f.write('This is the output for cluster #%d\n\n'%(j))
            for text in dataframe['summary'][i]:
                f.write(text)

In [114]:
filenames = readfiles('json files/*.json', 50)
df = createDB(filenames)

The files from index 21 to 175 have been taken
154


In [115]:
df['abstract'] = df['abstract'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
df['abstract'] = df['abstract'].apply(lambda x: lower_case(x))


In [116]:
vectorizer=TfidfVectorizer(stop_words='english', max_features=2**12, smooth_idf=True, use_idf=True, ngram_range=(2,4))
docu=vectorizer.fit_transform(df['abstract'].values)
cluster=np.sqrt(int(len(filenames))/2)
print(cluster)


8.774964387392123


In [117]:
kmeans = MiniBatchKMeans(n_clusters=int(cluster),max_iter=5000, init='random')
preds = kmeans.fit_predict(docu)
preds

array([5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 7,
       5, 5, 5, 1, 1, 5, 1, 5, 5, 1, 5, 2, 1, 5, 7, 5, 5, 5, 5, 5, 1, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 4, 5, 1, 5, 5, 5, 3, 5,
       5, 5, 1, 0, 5, 5, 5, 5, 5, 2, 5, 5, 5, 6, 5, 5, 5, 5, 5, 3, 5, 5,
       5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5,
       5, 5, 5, 5, 5, 7, 5, 5, 1, 5, 5, 5, 5, 5, 1, 1, 2, 5, 5, 6, 1, 5,
       5, 5, 5, 5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 5, 1, 5],
      dtype=int32)

In [118]:
from sklearn.metrics import silhouette_score
silhouette_score(docu, kmeans.predict(docu))

0.03252186772996112

In [119]:
df['cluster']=preds
df1 = df.groupby('cluster')['body_text'].apply(list).reset_index(name='text')
df1['text'] = df1['text'].apply(lambda x:  ' '.join(map(str, x)) )

In [120]:
df1['Normalized_text'] = 0

for j in range(len(df1)):
    df1['Normalized_text'][j]=normalize(df1['text'][j])
    
df1['summary'] = 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [121]:
for k in range(0, len(df1['summary'])):
    print(k)
    df1['summary'][k]= cleansed(df1['Normalized_text'][k], 15)

0
1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


7


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [80]:
df1

Unnamed: 0,cluster,text,Normalized_text,summary
0,0,Abstract. We have evaluated the fate of misfol...,Abstract . We have evaluated the fate of misfo...,The hybrid protein carrying the wild-type repr...
1,1,I nfectious diseases continue to be major heal...,I nfectious diseases continue to be major heal...,"Additionally , MERS-CoV has been found to grow..."
2,2,Feline infectious peritonitis (FIP) is an immu...,Feline infectious peritonitis ( FIP ) is an im...,Clinical signs associated with CNS disease in ...
3,3,"In December 2019, several patients with pneumo...","In December 2019 , several patients with pneum...","Here , we describe our understanding of the 20..."


In [81]:
outputfiles(df1)