In [1]:
import numpy as np
import pandas as pd
import re
import json
import sys
import os
import ast
import random

In [2]:
!{sys.executable} -m pip install nltk gensim

import nltk
import gensim



In [2]:
## Set up NLTK stuff

from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import gensim

p_stemmer = PorterStemmer()
stop_words = stopwords.words('english')

In [3]:
class process_articles:
    
    def __init__(self, file_locs, stopwords):
    
        self.file_locs = file_locs
        self.stopwords = stopwords
    
    def read_files(self):
        
        self.title_text = []
        
        for file_loc in self.file_locs:
        
            ## Get all files from specified location
            print('Processing Files at the below location:')
            print(file_loc)
            self.raw_root_files = os.listdir(file_loc)
            
            # Randomly sample files to reduce training file size - 10% of files from each supply
            self.root_files = []
            for val in np.arange(int(len(self.raw_root_files) / 10)):
                
                self.root_files.append(random.choice(self.raw_root_files))
            
            print('There are {} files to process'.format(len(self.root_files)))
            print('There were {} files in the dataset'.format(len(self.raw_root_files)))
            file_root = file_loc.split('/')[1]

            ## Loop through each file and grab the title and text
    
            for file in self.root_files:

                with open('{}/{}'.format(file_loc, file)) as f:
                    art_text_fin = []

                    try:
                        ## Load article and extract title
                        article = json.load(f)
                        art_title = article['metadata']['title']

                        ## Text is stored in multiple blocks - loop through each one
                        art = article['body_text']
                        art_text = []
                        for text in np.arange(len(art)):
                            raw_text = art[text]['text']
                            art_text.append(raw_text)

                        ## Condense each block together in a single form
                        ## Store raw text and titles in list
                        art_text_fin.append(" ".join(str(text_block) for text_block in art_text))
                        self.title_text.append([file_root, art_title, art_text_fin])

                    except:

                        print('FAILURE !!! \n')
                        print(article)
            
    def process_text(self):
        
        p_stemmer = PorterStemmer()
        articles = [article[2] for article in self.title_text]
        
        ## Process Each document - remove junk
        
        print('Cleaning out Junk')
        
        articles = [str(article).lower() for article in articles]
        articles = [re.sub('<[^<]+?>', '', article) for article in articles]
        articles = [re.sub(r'http\S+', '', article) for article in articles]
        articles = [re.sub(r'[^A-Za-z0-9]+', ' ', article) for article in articles]
        articles = [re.sub(r'\\', '', article) for article in articles]
        articles = [re.sub(r'\[.*?\]', '', article) for article in articles]
        articles = [re.sub(r'\d+', '', article) for article in articles]
        
        ## Tokenize
        ## deacc=True drops out punctuation
        
        print('Tokenizing words')
        
        articles = [gensim.utils.simple_preprocess(str(article), deacc=True) for article in articles]
        articles = [ast.literal_eval(str(article)) for article in articles]
        
        ## Convert into words
        ## Clean out stop words
        
        print('Converting to list of words and removing stop words')
        
        articles = [[word.strip() for word in article] for article in articles] 
        articles = [[word for word in article if word not in self.stopwords] for article in articles]
        
        ## Stem words
        
        print('Creating word stems')
        
        articles = [[p_stemmer.stem(word) for word in article] for article in articles]
        
        self.processed_article = articles
        
                    
def train_test_splitter(processed_text, train_prop):
    
    train = processed_text[0:int(len(processed_text) * train_prop)]
    test = processed_text[int(len(processed_text) * 0.8):]
    
    return train, test

In [4]:
file_locations = ['biorxiv_medrxiv/biorxiv_medrxiv', 
                  'noncomm_use_subset/noncomm_use_subset', 
                  'comm_use_subset/comm_use_subset',
                  'custom_license/custom_license']

processed_articles = process_articles(file_locations, stop_words)
processed_articles.read_files()

print('Example File Name')
print(processed_articles.root_files[0])
print('Number of files')
print(len(processed_articles.root_files))
print('Example Article Information')
print(processed_articles.title_text[2])

Processing Files at the below location:
biorxiv_medrxiv/biorxiv_medrxiv
There are 88 files to process
There were 885 files in the dataset
Processing Files at the below location:
noncomm_use_subset/noncomm_use_subset
There are 235 files to process
There were 2353 files in the dataset
Processing Files at the below location:
comm_use_subset/comm_use_subset
There are 911 files to process
There were 9118 files in the dataset
Processing Files at the below location:
custom_license/custom_license
There are 1695 files to process
There were 16959 files in the dataset
Example File Name
9b8b585778de84e7ced9af95e4977ff678970471.json
Number of files
1695
Example Article Information
['biorxiv_medrxiv', 'North Carolina at Chapel Hill, 125 Mason Farm Rd', ["In spite of the substantial resources that have been allocated by the National Institute of Allergy and 54 Infectious Diseases and the Centers for Disease Control and Prevention to support prediction of 55 emerging viral pathogens (https://www.niaid

In [5]:
processed_articles.process_text()
print('Example Processed Article')
processed_articles.processed_article[2]

Cleaning out Junk
Tokenizing words
Converting to list of words and removing stop words
Creating word stems
Example Processed Article


['spite',
 'substanti',
 'resourc',
 'alloc',
 'nation',
 'institut',
 'allergi',
 'infecti',
 'diseas',
 'center',
 'diseas',
 'control',
 'prevent',
 'support',
 'predict',
 'emerg',
 'viral',
 'pathogen',
 'plaqu',
 'format',
 'rel',
 'kh',
 'pv',
 'fig',
 'inde',
 'specif',
 'infect',
 'kh',
 'pv',
 'equival',
 'wt',
 'addit',
 'ps',
 'kh',
 'reduc',
 'effici',
 'plaqu',
 'format',
 'specif',
 'infect',
 'ps',
 'kh',
 'pv',
 'reduc',
 'twofold',
 'rel',
 'kh',
 'pv',
 'fig',
 'therefor',
 'exagger',
 'behavior',
 'ps',
 'kh',
 'pv',
 'pfu',
 'base',
 'growth',
 'assay',
 'rel',
 'replicon',
 'assay',
 'like',
 'reflect',
 'addit',
 'defect',
 'viru',
 'assembl',
 'spread',
 'base',
 'log',
 'differ',
 'sensit',
 'ribavirin',
 'fig',
 'kh',
 'ps',
 'kh',
 'pv',
 'also',
 'exhibit',
 'higher',
 'fidel',
 'wt',
 'pv',
 'fig',
 'importantli',
 'mutant',
 'exhibit',
 'essenti',
 'equival',
 'fidel',
 'phenotyp',
 'fig',
 'consist',
 'suggest',
 'reduc',
 'effici',
 'plaqu',
 'format',
 

In [6]:
## Create single list with all words for trigrams

class create_bigram_trigram:
    
    def __init__(self, tokenized_articles):
        
        self.tokenized_articles = tokenized_articles
        
    def bigrams(self, min_count=3, threshold=30):
        
        ## Create bigrams from raw tokenized text provided
        
        bigram = gensim.models.Phrases(self.tokenized_articles, min_count = min_count, threshold = threshold)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        bigram_fin = [bigram_mod[article] for article in self.tokenized_articles]
        
        self.bigram = bigram_fin
        
    def trigrams(self, min_count=3, threshold=30):
        
        print('Creating Bigrams')
        
        ## Create bigrams from raw tokenized text provided
        
        bigram = gensim.models.Phrases(self.tokenized_articles, min_count = min_count, threshold = threshold)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        
        print('Creating Trigrams from Bigrams')
        
        ## Create trigrams from the bigram model and raw text
        
        trigram = gensim.models.Phrases(bigram[self.tokenized_articles])
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        trigram_fin = [trigram_mod[bigram_mod[article]] for article in self.tokenized_articles]
        trigram_fin = [str(trigram) for trigram in trigram_fin]
        
        self.trigrams = trigram_fin


#articles_collapsed = [article for articles in processed_articles.processed_article for article in articles]
article_trigrams = create_bigram_trigram(processed_articles.processed_article)
article_trigrams.trigrams()

Creating Bigrams
Creating Trigrams from Bigrams


In [7]:
train, test = train_test_splitter(article_trigrams.trigrams, 0.8)
train[0]

"['atyp_pneumonia', 'case', 'caus_novel_coronaviru', 'ncov', 'first', 'report', 'confirm', 'wuhan_china', 'decemb_januari', 'gmt', 'confirm_case', 'ncov', 'infect', 'mainland_china', 'includ', 'death', 'ncov', 'case', 'also', 'report', 'thailand_japan', 'republ_korea', 'hong_kong_taiwan', 'us', 'case', 'export', 'wuhan', 'see', 'news', 'releas', 'januari', 'outbreak', 'still', 'go', 'recent_publish', 'preprint', 'imai_et_al', 'estim', 'total', 'ci', 'case', 'ncov', 'infect', 'wuhan', 'onset_symptom', 'januari', 'likelihood', 'travel', 'relat', 'risk', 'diseas', 'spread', 'suggest', 'indic', 'potenti', 'region', 'global_spread', 'best_knowledg', 'exist', 'peer_review_literatur', 'quantifi', 'ncov', 'januari', 'studi', 'estim', 'ncov', 'via', 'basic_reproduct_number', 'base', 'limit', 'data', 'earli_phase', 'outbreak', 'obtain', 'number', 'ncov', 'case', 'time_seri', 'data', 'mainland_china', 'releas', 'wuhan', 'municip', 'health_commiss_china', 'nation_health', 'commiss_china', 'januari

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

train, test = train_test_splitter(article_trigrams.trigrams, 0.8)

vectorizer = CountVectorizer(min_df = 50, max_df = 0.8, max_features = 50000)
tf = vectorizer.fit_transform(train) ## Vectorize training set
tf_feature_names = vectorizer.get_feature_names() ## Pull out words for use in eval

In [9]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=20,
                                learning_method = 'online',
                                verbose = 1,
                                learning_offset = 25.,
                                random_state = 100
                               )

ldamod = lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [10]:
class LDA_Evaluator:
    
    def __init__(self, lda_model, vectorizer):
        
        self.lda_model = lda_model
        self.feature_names = vectorizer.get_feature_names()
        self.vectorizer = vectorizer
    
    def create_df(self):
        
        components = pd.DataFrame(self.lda_model.components_).copy()
        components['fullsum'] = components.sum(axis=1)
        
        self.components = components
        
    def component_contribution(self):
        
        allwords = self.components['fullsum'].sum()
        self.topic_distro = self.components['fullsum'] / allwords
        
        for col in self.components.columns:
            self.components[col] = self.components[col] / self.components['fullsum']
        self.components.drop(['fullsum'], inplace = True, axis = 1)
        self.components = self.components.transpose()
        self.components['wordmean'] = self.components.mean(axis=1)
        self.components.index = self.feature_names
        
    def word_distribution(self):
        
        words = pd.DataFrame(self.lda_model.components_).copy()
        words = words.transpose()
        words['fullsum'] = words.sum(axis=1)
        wordstotal = words['fullsum'].sum()
        word_distribution = words['fullsum'] / wordstotal
        word_rank = word_distribution.rank() / len(word_distribution)
        
        ## Add back to DF
        
        self.components['word_rank'] = word_rank.values
        self.components['word_distro'] = word_distribution.values
        
    def eval_raw_frequency(self, topic, num_words, threshold=0):
        
        ## Returns words that show up most per topic
        
        raw_vals = self.components.copy()
        raw_vals = raw_vals[raw_vals['word_rank'] >= threshold]
        
        return raw_vals.sort_values(by=topic, ascending=False).head(n=num_words)
    
    def eval_rel_frequency(self, topic, num_words, threshold = 0):
        
        ## Returns words that show up disproportionately by topic
        
        rel_freq = self.components.copy()
        rel_freq = rel_freq[rel_freq['word_rank'] >= threshold]
        for col in rel_freq[0:(len(rel_freq.columns)-1)]:
            rel_freq[col] = rel_freq[col] / rel_freq['wordmean'] ## Calc how much higher/lower prop is
        
        return rel_freq.sort_values(by=topic, ascending = False).head(n=num_words)

In [11]:
evalinfo = LDA_Evaluator(lda_model = ldamod, vectorizer = vectorizer)

In [13]:
evalinfo.create_df()
evalinfo.component_contribution()
evalinfo.word_distribution()

## Evaluate words that show up the most per topic

evalinfo.eval_raw_frequency(3, 20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,wordmean,word_rank,word_distro
compound,2.378372e-06,4.299055e-07,6e-05,0.029647,2.5122e-05,4.048074e-07,0.000261,0.000201,1e-06,0.000102,...,9.602585e-07,2e-06,2.603507e-06,4e-06,3.965605e-05,1e-05,4.388028e-05,0.001527,0.951987,0.000854
activ,0.00400499,0.0001429707,7.2e-05,0.025195,0.002535385,0.000951335,0.004028,0.016394,0.001702,0.003338,...,0.003826432,0.000421,0.001023025,0.001039,0.0003333132,0.000189,0.004279128,0.004102,0.998758,0.004304
drug,0.0001928923,4.477816e-07,6.2e-05,0.021801,0.0004618777,1.654541e-06,0.003088,0.00073,1e-06,0.000289,...,4.22459e-05,2e-06,0.001723052,2.1e-05,0.0002631599,1.3e-05,0.0003284653,0.001475,0.964404,0.001097
product,0.000297407,6.970666e-05,6.4e-05,0.01781,0.002375925,9.111492e-05,0.002791,0.004639,2e-06,0.001964,...,0.0009533387,0.000286,0.0001806539,0.00037,0.002012084,0.003789,0.0003529432,0.002231,0.99048,0.002364
deriv,0.00178083,0.001228248,6.2e-05,0.015198,0.0002534186,4.35316e-05,0.000538,0.001337,0.000236,0.000857,...,5.654957e-05,2e-06,9.220056e-05,0.000767,0.0001138779,0.001896,0.0005697852,0.001414,0.966474,0.001125
treatment,0.001271742,4.956725e-07,6.1e-05,0.013839,0.0006018379,0.0006209366,0.015195,0.001684,7e-06,0.003636,...,0.0004521663,4.8e-05,0.007499634,3e-06,0.002335601,8.2e-05,0.001706152,0.002506,0.991308,0.002581
reaction,0.000106828,4.900101e-07,6.2e-05,0.013385,8.162575e-05,0.0002430729,0.000159,0.000159,1e-06,9.9e-05,...,1.414908e-06,2e-06,0.0002111661,7.7e-05,0.0004981544,8e-05,0.0004948442,0.001064,0.950538,0.000824
inhibit,0.004994053,4.274497e-07,6.3e-05,0.00976,5.065707e-07,6.379119e-06,0.000519,0.004784,1e-06,0.003986,...,3.219831e-06,9.9e-05,1.59782e-05,1.2e-05,3.407183e-05,1.1e-05,0.0006938061,0.001583,0.978684,0.001549
group,6.45722e-05,0.0003051181,6.3e-05,0.009675,0.001906543,0.005803008,0.017252,0.000827,0.004347,0.006512,...,8.095351e-05,0.001439,0.0007378253,0.008524,0.0009944792,0.001114,0.0008224209,0.003231,0.995654,0.003172
yield,0.0001720339,0.0008266878,6e-05,0.009116,0.0001065269,0.0004978216,3e-05,3e-05,1e-06,0.000294,...,0.0002978567,2e-06,0.0001028826,0.000145,0.0003474477,0.00133,0.000345301,0.000767,0.923634,0.000564


In [14]:
## Evaluate the words that show up the most relative to other topics for each topic

evalinfo.eval_rel_frequency(3, 20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,wordmean,word_rank,word_distro
compound,0.001558,0.000282,0.039277,19.417418,0.016454,0.000265,0.170771,0.131859,0.000893,0.066838,...,0.000629,0.001275,0.001705,0.002614,0.025973,0.006237,0.028739,1.0,0.951987,0.000854
polysaccharid,0.164213,0.00248,0.344334,18.649797,0.006465,0.107825,0.002208,0.641423,0.007157,0.020636,...,0.003748,0.010943,0.00159,0.002287,0.009129,0.00153,0.003973,1.0,0.622103,9.9e-05
analogu,0.057057,0.004464,0.618247,18.394102,0.001142,0.003017,0.003223,0.353717,0.012884,0.013224,...,0.006695,0.019694,0.067138,0.030913,0.001913,0.002859,0.00511,1.0,0.409147,5.4e-05
nitrogen,0.065218,0.005624,0.783181,17.557488,0.00129,0.003872,0.214119,0.028807,0.016286,0.002683,...,0.008498,0.024924,0.003024,0.006008,0.551541,0.003933,0.237148,1.0,0.327815,4.3e-05
solvent,0.390939,0.003447,0.481391,17.406194,0.000796,0.002348,0.002325,0.001954,0.009993,0.003241,...,0.005208,0.015264,0.001808,0.11263,0.001557,0.002143,0.864715,1.0,0.503932,6.8e-05
scheme,0.001267,0.001243,0.167334,17.172601,0.685486,0.002285,0.228553,0.003368,0.003496,0.010079,...,0.001905,0.005355,0.000689,0.112955,0.107119,0.12917,0.001659,1.0,0.807947,0.000241
oxid,0.00227,0.0021,0.292732,17.11484,0.000608,0.001462,0.514945,0.79715,0.006099,0.003299,...,0.003199,0.00931,0.001156,0.002483,0.003629,0.002133,0.701649,1.0,0.682947,0.000126
radic,0.007193,0.006961,0.968325,17.051376,0.537198,0.004724,0.052872,0.442048,0.020153,0.003911,...,0.010468,0.630125,0.004177,0.007482,0.003186,0.004434,0.005663,1.0,0.258278,3.7e-05
zinc,0.00638,0.003158,0.439853,16.976532,0.00072,0.002826,0.161048,0.261649,0.009217,0.030444,...,0.00476,0.745847,0.272138,0.004169,0.775423,0.001956,0.002591,1.0,0.549876,7.9e-05
ring,0.193842,0.002117,0.29436,16.933742,0.079481,0.001459,0.035455,0.189041,0.006123,0.122827,...,0.00321,0.009347,0.001759,1.015015,0.35603,0.001374,0.075987,1.0,0.676118,0.000123


# TO DO

1. Generate topic scores on test data - find most similar articles from training data
2. Evalaute perplexity, experiment with different number of topics
3. Create word clouds
4. Experiment with BERT as features
5. Scale up to avoid random sampling

A general note - random sampling really not ideal on each iteration, will change the topics each time the model is run.

In [28]:
with open("processed_covid.json", 'w', encoding='utf-8') as f:
    json.dump(processed_articles.processed_article, f, ensure_ascii=False, indent=4)

In [None]:
with open("trigrams_covid.json", 'w', encoding='utf-8'): as f2:
    json.dump(article_trigrams.trigrams, f2, ensure_ascii=False, indent=4)

In [4]:
## Read processed data

with open('processed_covid.json', 'r') as fi:
    processed_data = json.load(fi)