In [64]:
from os.path import join as pj
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
import nltk
nltk.download('wordnet')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data with english abstract and published in 2020

In [15]:
path = 'data/'
file_name = 'metadata_complet_with_abstract.csv'
df = pd.read_csv(pj(path, file_name))
df_en = df.copy()
df_en = df[df['language']=='en']
df_en_20 = df_en[df_en.publish_time.str[:4]=='2020']

Deal with duplicates

In [12]:
df_en_20 = df_en_20.drop_duplicates(['title','abstract'])

In [14]:
df_en_20.shape

(2702, 18)

Train on abstract

In [65]:
train, test = train_test_split(df_en_20.abstract, test_size=0.1, random_state=42)

In [66]:
def lemmatize_stemming(text):
    return PorterStemmer().stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

processed_docs=[]
for t in train:
    processed_docs.append(preprocess(t))
    
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

Results

In [71]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic {}: \nWords: {}".format(idx, topic ))
    print("\n")

Topic 0: 
Words: 0.010*"citi" + 0.009*"hubei" + 0.009*"travel" + 0.007*"quarantin" + 0.007*"reproduct" + 0.006*"contact" + 0.006*"predict" + 0.006*"dynam" + 0.006*"intervent" + 0.006*"social"


Topic 1: 
Words: 0.015*"mortal" + 0.012*"swab" + 0.011*"outcom" + 0.010*"women" + 0.009*"pregnant" + 0.008*"evid" + 0.008*"children" + 0.008*"fatal" + 0.008*"type" + 0.007*"asymptomat"


Topic 2: 
Words: 0.029*"lung" + 0.019*"imag" + 0.019*"lesion" + 0.017*"chest" + 0.013*"express" + 0.011*"grind" + 0.011*"involv" + 0.011*"glass" + 0.010*"opac" + 0.010*"anim"


Topic 3: 
Words: 0.014*"blood" + 0.013*"count" + 0.011*"significantli" + 0.010*"lymphocyt" + 0.010*"injuri" + 0.010*"critic" + 0.010*"decreas" + 0.010*"acid" + 0.009*"admiss" + 0.008*"cytokin"


Topic 4: 
Words: 0.025*"vaccin" + 0.020*"sequenc" + 0.017*"genom" + 0.014*"mer" + 0.013*"strain" + 0.010*"immun" + 0.010*"coronavirus" + 0.009*"host" + 0.009*"structur" + 0.008*"gene"


Topic 5: 
Words: 0.015*"bind" + 0.014*"drug" + 0.013*"target"

-Topic 0: Hubei, la quarantaine, contamination par le voyage (fermeture des frontières), prediction, éviter le contact social 

-Topic 1: Mortalité, Femmes enceintes dangerosité ? et enfants, asymptomatiques ?

-Topic 2: Chest and Lung Scans, lesions on images, opac => traces sur les images (grind ??)

-Topic 3: Nombres de cellules cytokin, lymphocyt dans le sang, blessure 

-Topic 4: vaccin, gene, genome etc.

-Topic 5: Tests medicaments

-Topic 6: Autres maladies, personnes à risque, diagnostic

-Topic 7: Corps médical

Test

In [75]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(test[37380]))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic {}: {}".format(score, index, lda_model.print_topic(-1)))

Score: 0.9831529855728149	 Topic 0: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"


In [74]:
test[37380]

'Abstract Only a month after the outbreak of pneumonia caused by 2019-nCoV, more than forty-thousand people were infected. This put enormous pressure on the Chinese government, medical healthcare provider, and the general public, but also made the international community deeply nervous. On the 25th day after the outbreak, the Chinese government implemented strict traffic restrictions on the area where the 2019-nCoV had originated—Hubei province, whose capital city is Wuhan. Ten days later, the rate of increase of cases in Hubei showed a significant difference (p = 0.0001) compared with the total rate of increase in other provinces of China. These preliminary data suggest the effectiveness of a traffic restriction policy for this pandemic thus far. At the same time, solid financial support and improved research ability, along with network communication technology, also greatly facilitated the application of epidemic prevention measures. These measures were motivated by the need to provi

In [73]:
test

37380    Abstract Only a month after the outbreak of pn...
36175    While the coronavirus death rate may be lower ...
570      An outbreak of novel betacoronavirus, SARS-CoV...
380      Objective: To investigate the correlation betw...
1788     As of 28 February 2020, Italy had 888 cases of...
1469     In the recent issues of IRGEE, we have been as...
26151    Abstract The National Bioforensic Analysis Cen...
520      AbstractA new coronavirus SARS-CoV-2, recently...
1991     Objective To understand the possible transmiss...
402      AbstractThe 2013-2016 West Africa EBOV epidemi...
37167    A novel bat-origin coronavirus emerged in Wuha...
164      Cases from the ongoing outbreak of atypical pn...
1597     As the world is witnessing the epidemic of COV...
34317    Abstract Importation and transmission of measl...
452      SUMMARYIn the analysis of genomic sequence dat...
1924     Objective The purpose of this review of COVID-...
734      Background.This research aims to analyze the c.

In [76]:
test[570]

'An outbreak of novel betacoronavirus, SARS-CoV-2 (formerly named 2019-nCoV), began in Wuhan, China in December 2019 and the COVID-19 disease associated with infection has since spread rapidly to multiple countries. Here we report the development of SARS-CoV-2 DETECTR, a rapid (~30 min), low-cost, and accurate CRISPR-Cas12 based lateral flow assay for detection of SARS-CoV-2. We validated this method using contrived reference samples and clinical samples from infected US patients and demonstrated comparable performance to the US CDC SARS-CoV-2 real-time RT-PCR assay.'

In [77]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(test[570]))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic {}: {}".format(score, index, lda_model.print_topic(-1)))

Score: 0.5852038860321045	 Topic 6: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"
Score: 0.19930173456668854	 Topic 4: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"
Score: 0.1786642223596573	 Topic 0: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"


In [78]:
test[380]

'Objective: To investigate the correlation between clinical characteristics and cardiac injury of COVID-2019 pneumonia. Methods: In this retrospective, single-center study, 41 consecutive corona virus disease 2019 (COVID-2019) patients (including 2 deaths) of COVID-2019 in Beijing Youan Hospital, China Jan 21 to Feb 03, 2020, were involved in this study. The high risk factors of cardiac injury in different COVID-2019 patients were analyzed. Computed tomographic (CT) imaging of epicardial adipose tissue (EAT) has been used to demonstrate the cardiac inflammation of COVID-2019. Results：Of the 41 COVID-2019 patients, 2 (4.88%), 32 (78.05%), 4 (9.75%) and 3 (7.32%) patients were clinically diagnosed as light, mild, severe and critical cases, according to the 6th guidance issued by the National Health Commission of China. 10 (24.4%) patients had underlying complications, such as hypertension, CAD, type 2 diabetes mellites and tumor. The peak value of TnI in critical patients is 40-fold more

In [79]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(test[380]))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic {}: {}".format(score, index, lda_model.print_topic(-1)))

Score: 0.8044670820236206	 Topic 3: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"
Score: 0.1229139044880867	 Topic 0: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"
Score: 0.06313582509756088	 Topic 2: 0.016*"manag" + 0.011*"staff" + 0.009*"recommend" + 0.008*"worker" + 0.007*"anxieti" + 0.007*"nurs" + 0.007*"practic" + 0.007*"psycholog" + 0.007*"experi" + 0.007*"work"
