In [1]:
!python -m pip install -U gensim --user

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
# All three LitCovid splits are read the same way: tab-separated, no header row,
# and columns 0 and 1 forced to strings.
_litcovid_read_options = dict(sep='\t', header=None, dtype={0: str, 1: str})
train = pd.read_csv('../data/FullLitCovid/train.tsv', **_litcovid_read_options)
val = pd.read_csv('../data/FullLitCovid/val.tsv', **_litcovid_read_options)
test = pd.read_csv('../data/FullLitCovid/test.tsv', **_litcovid_read_options)
train.head(5)

Unnamed: 0,0,1
0,1100000,structural conservation among variants sars-co...
1,1000,effective management idiopathic intracranial h...
2,1000,`` 's whole different atmosphere '' qualitativ...
3,1100000,modification spike protein vaccines enveloped ...
4,1010,analysis prediction covid-19 outbreak pakistan...


In [3]:
import multiprocessing
# CPU count reported by the OS; passed as `workers=` to both Doc2Vec models below.
cores = multiprocessing.cpu_count()

In [4]:
import nltk
import string
from nltk.corpus import stopwords

def break_word(text):
    """Split ``text`` on single spaces and keep only purely alphabetic tokens.

    Tokens containing digits, hyphens or punctuation (for example ``covid-19``)
    are dropped, as are the empty strings produced by consecutive spaces.

    Parameters
    ----------
    text : str
        Document text. Any non-string value (such as the float NaN pandas uses
        for a missing field) is treated as an empty document.

    Returns
    -------
    list of str
        The alphabetic tokens, in their original order.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise raise AttributeError on .split().
        return []
    return [token for token in text.split(" ") if token.isalpha()]

def _row_to_tagged_document(row):
    """Build a TaggedDocument from one row: column 1 is tokenised, column 0 becomes the tag."""
    return TaggedDocument(words=break_word(row[1]), tags=[row[0]])


# Convert every row of each split; the result is a Series of TaggedDocument objects.
train_tagged = train.apply(_row_to_tagged_document, axis=1)
val_tagged = val.apply(_row_to_tagged_document, axis=1)
test_tagged = test.apply(_row_to_tagged_document, axis=1)

In [5]:
# The CORD19 dataset, read with the same layout as the LitCovid splits.
# NOTE(review): the tokens displayed for this set still include words such as
# "the", "in" and "and", unlike the LitCovid output -- confirm that this file
# went through the same text preprocessing.
eval1 = pd.read_csv(
    '../data/cord19_test.tsv',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)


def _cord19_row_to_document(row):
    """Build a TaggedDocument from one CORD19 row (column 1 tokenised, column 0 as tag)."""
    return TaggedDocument(words=break_word(row[1]), tags=[row[0]])


eval_tagged = eval1.apply(_cord19_row_to_document, axis=1)


In [6]:
# Display the tagged training documents (pandas truncates the long Series).
train_tagged

0        ([structural, conservation, among, variants, s...
1        ([effective, management, idiopathic, intracran...
2        ([whole, different, atmosphere, qualitative, e...
3        ([modification, spike, protein, vaccines, enve...
4        ([analysis, prediction, outbreak, pakistan, st...
                               ...                        
24713    ([seroprevalence, healthcare, workers, swiss, ...
24714    ([surgical, response, pandemic, singapore, per...
24715    ([use, ct, artificial, intelligence, suspected...
24716    ([effect, famotidine, hospitalized, patients, ...
24717    ([clinical, implications, coronavirus, disease...
Length: 24718, dtype: object

In [7]:
# Display the tagged test documents (pandas truncates the long Series).
test_tagged

0       ([impacted, immunisation, service, delivery, a...
1       ([new, insights, seriousness, acute, myocardia...
2       ([guidelines, adaptation, outbreak, management...
3       ([implementing, strategies, workplace, level, ...
4       ([global, healthcare, resource, efficiency, ma...
                              ...                        
7058    ([big, seroprevalence, data, pakistan, herd, i...
7059    ([effect, prehabilitation, enhanced, recovery,...
7060    ([cardiovascular, system, simply, viewer, lead...
7061    ([precision, medicine, potential, target, apri...
7062    ([comparative, analysis, diagnostic, performan...
Length: 7063, dtype: object

In [8]:
# Display the tagged CORD19 evaluation documents.
eval_tagged

0     ([the, influenza, pandemic, in, england, and, ...
1     ([buried, treasure, evolutionary, perspectives...
2     ([polyether, ionophores, and, promising, biolo...
3     ([stillbirth, during, infection, with, middle,...
4     ([approved, antiviral, drugs, over, the, past,...
                            ...                        
95    ([proposed, calfhood, immunization, program, f...
96    ([risk, of, bacterial, coinfections, in, febri...
97    ([early, days, genomics, and, human, responses...
98    ([autophagic, machinery, activated, by, dengue...
99    ([development, of, a, multiplex, one, step, th...
Length: 100, dtype: object

Distributed Bag of Words (DBOW)

In [9]:
# DBOW variant (dm=0) trained with negative sampling only (hs=0, negative=5).
_dbow_settings = {
    'dm': 0,
    'vector_size': 300,
    'negative': 5,
    'hs': 0,
    'min_count': 2,
    'sample': 0,
    'workers': cores,
}
model_dbow = Doc2Vec(**_dbow_settings)

# Materialise the tagged documents as a list (with a progress bar) and build the vocabulary.
_dbow_vocab_docs = list(tqdm(train_tagged.values))
model_dbow.build_vocab(_dbow_vocab_docs)

100%|██████████| 24718/24718 [00:00<00:00, 3542621.09it/s]


In [10]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 24718/24718 [00:00<00:00, 2423894.28it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1080170.93it/s]
100%|██████████| 24718/24718 [00:00<00:00, 760279.89it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1549860.32it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1304873.46it/s]
100%|██████████| 24718/24718 [00:00<00:00, 2006868.10it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1477396.28it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1181048.58it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1378452.70it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1639568.05it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1667011.94it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1810059.99it/s]
100%|██████████| 24718/24718 [00:00<00:00, 873021.59it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1173309.56it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1476849.09it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1528856.34it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1771250.02it/s]

Wall time: 11min 4s


In [11]:
def vec_for_learning(model, tagged_docs):
    """Infer a vector for every tagged document and pair it with the document's first tag.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words, epochs=...)``; here a trained Doc2Vec model.
    tagged_docs
        Object whose ``.values`` yields documents with ``.words`` and ``.tags``
        attributes (here a pandas Series of TaggedDocument).

    Returns
    -------
    (targets, regressors) : tuple of tuples
        ``targets[i]`` is the first tag of document ``i`` and ``regressors[i]`` is
        its inferred vector (20 inference epochs). Both are empty tuples when
        there are no documents.
    """
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, epochs=20))
        for doc in tagged_docs.values
    ]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise ValueError.
        return (), ()
    targets, regressors = zip(*pairs)
    return targets, regressors


In [12]:
from sklearn.metrics import accuracy_score, f1_score

# Infer DBOW vectors for the training and test splits.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

# Fit logistic regression on the training vectors, then predict the test split.
logreg = LogisticRegression(n_jobs=1, C=1e5, solver='lbfgs', max_iter=500)
y_pred = logreg.fit(X_train, y_train).predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % test_accuracy)
print('Testing F1 score: {}'.format(test_f1))

Testing accuracy 0.3645759592241257
Testing F1 score: 0.29086556907259203


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:

from sklearn.metrics import accuracy_score, f1_score

# Score the DBOW features on the validation split. X_train / y_train are the DBOW
# training vectors inferred in the previous evaluation cell.
y_val, X_val = vec_for_learning(model_dbow, val_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5, solver='lbfgs', max_iter=500)
logreg.fit(X_train, y_train)
y_val_pred = logreg.predict(X_val)
# Labelled "Validation" (previously "Testing") so these numbers cannot be
# confused with the test-split results.
print('Validation accuracy %s' % accuracy_score(y_val, y_val_pred))
print('Validation F1 score: {}'.format(f1_score(y_val, y_val_pred, average='weighted')))

Testing accuracy 0.36409966024915064
Testing F1 score: 0.2892470996855133


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Score the DBOW features on the external CORD19 set. X_train / y_train are the
# DBOW training vectors inferred earlier; accuracy_score / f1_score were imported
# in the first evaluation cell.
y_eval, X_eval = vec_for_learning(model_dbow, eval_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5, solver='lbfgs', max_iter=500)
logreg.fit(X_train, y_train)
y_evl_pred = logreg.predict(X_eval)
# Labelled "CORD19" (previously "Testing") so these numbers cannot be confused
# with the LitCovid test-split results.
print('CORD19 accuracy %s' % accuracy_score(y_eval, y_evl_pred))
print('CORD19 F1 score: {}'.format(f1_score(y_eval, y_evl_pred, average='weighted')))

Testing accuracy 0.11
Testing F1 score: 0.05894736842105264


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Distributed Memory (DM)

In [15]:
# Distributed-memory variant (dm=1) averaging the context vectors (dm_mean=1).
_dmm_settings = {
    'dm': 1,
    'dm_mean': 1,
    'vector_size': 300,
    'window': 10,
    'negative': 5,
    'min_count': 1,
    'workers': cores,
    'alpha': 0.065,
    'min_alpha': 0.065,
}
model_dmm = Doc2Vec(**_dmm_settings)

# Materialise the tagged documents as a list (with a progress bar) and build the vocabulary.
_dmm_vocab_docs = list(tqdm(train_tagged.values))
model_dmm.build_vocab(_dmm_vocab_docs)

100%|██████████| 24718/24718 [00:00<00:00, 1679733.09it/s]


In [16]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 24718/24718 [00:00<00:00, 2441302.81it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1180887.15it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1400613.42it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1692926.29it/s]
100%|██████████| 24718/24718 [00:00<00:00, 805691.78it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1653110.20it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1430905.21it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1239906.79it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1221629.47it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1372594.48it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1240040.26it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1434687.27it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1449855.35it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1075397.86it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1651661.72it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1627394.69it/s]
100%|██████████| 24718/24718 [00:00<00:00, 1196740.27it/s]

Wall time: 16min 15s


In [17]:
# Infer DM vectors for the training and test splits. This rebinds y_train / X_train,
# replacing the DBOW features used by the earlier evaluation cells.
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
# Build the classifier here, with the same settings as the other evaluation cells,
# rather than refitting whichever `logreg` instance an earlier cell left behind.
logreg = LogisticRegression(n_jobs=1, C=1e5, solver='lbfgs', max_iter=500)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.5078578507716268
Testing F1 score: 0.5333509791502056


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.metrics import accuracy_score, f1_score

# Score the DM features on the validation split. X_train / y_train are the DM
# training vectors inferred in the previous evaluation cell.
y_val, X_val = vec_for_learning(model_dmm, val_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5, solver='lbfgs', max_iter=500)
logreg.fit(X_train, y_train)
y_val_pred = logreg.predict(X_val)
# Labelled "Validation" (previously "Testing") so these numbers cannot be
# confused with the test-split results.
print('Validation accuracy %s' % accuracy_score(y_val, y_val_pred))
print('Validation F1 score: {}'.format(f1_score(y_val, y_val_pred, average='weighted')))

Testing accuracy 0.5234994337485843
Testing F1 score: 0.5477758524629028


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Score the DM features on the external CORD19 set. X_train / y_train are the DM
# training vectors inferred earlier; accuracy_score / f1_score were imported in a
# previous evaluation cell.
y_eval, X_eval = vec_for_learning(model_dmm, eval_tagged)
logreg = LogisticRegression(n_jobs=1, C=1e5, solver='lbfgs', max_iter=500)
logreg.fit(X_train, y_train)
y_evl_pred = logreg.predict(X_eval)
# Labelled "CORD19" (previously "Testing") so these numbers cannot be confused
# with the LitCovid test-split results.
print('CORD19 accuracy %s' % accuracy_score(y_eval, y_evl_pred))
print('CORD19 F1 score: {}'.format(f1_score(y_eval, y_evl_pred, average='weighted')))

Testing accuracy 0.19
Testing F1 score: 0.20756536017174312


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
