In [1]:
%%time

import glob
import json
import pandas as pd
import tensorflow as tf
import spacy
import re
import string
from sklearn.model_selection import train_test_split
import numpy as np

CPU times: user 1.51 s, sys: 586 ms, total: 2.09 s
Wall time: 1.67 s


In [2]:
%%time

path = '/home/prajakta/Documents/SharpestMinds/COVID-analysis/data/*.json'
files = glob.glob(path)
papers = []
for file in files:
    with open(file) as json_file:
            text = json.load(json_file)
            papers.append([text['paper_id'],text['title'], text['bodytext'], text['abstract']])
data = pd.DataFrame(papers, columns = ['paper_id','title', 'bodytext', 'abstract'])
filter = data.abstract != ""
data = data[filter]
data['len_bt'] = data.bodytext.map(lambda x: len(x.split(" ")))
data['len_ab'] = data.abstract.map(lambda x: len(x.split(" ")))
data.query('len_bt <= 10000 and len_ab <= 500', inplace = True)
#first_10 = data[:10]

CPU times: user 4.29 s, sys: 550 ms, total: 4.84 s
Wall time: 4.87 s


In [3]:
def clean_text(bodytext):
    cleaned = list()
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table 
    table = str.maketrans('', '', string.punctuation)
    for word in bodytext:
        if word.is_stop == False:
            words = str(word.lemma_)       
            words = words.lower()
            words = words.translate(table)
            words = re_print.sub('', words) 
            if words.isalpha() == True:
                cleaned.append(words)
    return cleaned

In [4]:
nlp = spacy.load("en_core_web_sm")
ab_str = []
for i in range(len(data)):
    abstract = nlp(data.iloc[i].abstract)
    ab_clean = clean_text(abstract)
    ab_str.append((' ').join(ab_clean))
data['ab_clean'] = ab_str

In [5]:
data.head()

Unnamed: 0,paper_id,title,bodytext,abstract,len_bt,len_ab,ab_clean
0,17413f651645c2b9c92555e9ce1404b9290eccab,Non-human primate orthologues of TMPRSS2 cleav...,a1111111111 a1111111111 a1111111111 a11111111...,"The cellular serine protease TMPRSS2, a membe...",3544,267,cellular serine protease member type ii transm...
3,ab8c1e32b66b02cd703799df3d2ee37a1cb369b7,Enhanced protection in mice induced by immuniz...,Middle East respiratory syndrome coronavirus ...,The persistent public health threat of infect...,4250,224,persistent public health threat infection midd...
4,685efeb0ad4c214b8295dc4f723c3269464772d8,viruses Isolation of a Novel Fusogenic Orthore...,Bats have been increasingly associated with e...,We report on the isolation of a novel fusogen...,4515,155,report isolation novel fusogenic orthoreovirus...
5,09ccb3b9fece55e72c3acb85c4259de62a9c9e0c,Association of herd BRSV and BHV-1 seroprevale...,Bovine respiratory disease (BRD) incorporates...,Background: The aim of this study was to dete...,4393,275,background aim study detect association bovine...
7,93d080273b1f33330243dd140a35ba890ddd2973,The determinants and consequences of adult nur...,Nurses leaving their jobs or leaving the prof...,Background: Nurses leaving their jobs and the...,7160,298,background nurse leave job profession issue in...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer()

%time tfidf_matrix = tfidf_vectorizer.fit_transform(data.ab_clean) #fit the vectorizer to synopses
pd.DataFrame(tfidf_vectorizer.transform(data.ab_clean).toarray(), columns = sorted(tfidf_vectorizer.vocabulary_.keys()))

#print(tfidf_matrix.shape)

CPU times: user 788 ms, sys: 7.96 ms, total: 796 ms
Wall time: 796 ms


MemoryError: Unable to allocate 2.03 GiB for an array with shape (8059, 33749) and data type float64

In [None]:
terms = tfidf_vectorizer.get_feature_names()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

data['clusters'] = km.labels_.tolist()

In [None]:
cluster = pd.DataFrame(np.sort(data.clusters.value_counts()), columns = ['num_papers'])
cluster

In [None]:
from __future__ import print_function

vocab_frame = pd.DataFrame(tfidf_vectorizer.vocabulary_.keys())

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.iloc[ind].tolist()[0], end=',')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

In [None]:
data.query('clusters == 2', inplace = False)