## Extracting body text
Extract the body text and keep adding that to the corpus file

In [1]:
import json   # to read the initial files
import os
import regex as re  # for pre processing
from collections import Counter  
from nltk import ngrams, sent_tokenize, word_tokenize
import joblib   # saving and loading files

In [3]:
# function as stated in the pdf
def extract_body_text( filename : str ) -> str :

    file = open(filename)
    paper_content = json.load(file)
    body_text = " "

    if 'body_text' in paper_content :
        for bt in paper_content['body_text']:
            body_text = body_text + bt['text']
    return ( body_text + '\n').lower()


# this function opens each json file, extracts the body text and add that text to
# an output file. This creates a single txt file which will be our corpus.
# We use a buffer of 200, 200 files are opened and extracted to a single variable, 
# this is added to output, and then the variable is reset.
def create_corpus(file):
    basepath = 'pdf_json'   # folder location
    length = len(os.listdir(basepath))   # number of files in the folder
    num = 200    # buffer length

    with open(file,'w',encoding="utf-8") as f:  # open output file
        f.write("")

    for i in range(0,length//num+1):
        corpus = ""   # buffer variable
        j = length - i*num if ((i+1)*num > length) else num  
        for entry in os.listdir(basepath)[(i*num):(j+i*num)]:
            pth = os.path.join(basepath, entry)
            if os.path.isfile(pth):
                corpus += extract_body_text(pth)  # add text to buffer
        with open(file,'a',encoding="utf-8") as f:
            f.write(corpus)  # when buffer reached, add to output file

In [4]:
create_corpus("corpus_covid.txt")  # creating corpus and naming it corpus_covid.txt

# Preprocessing

For text preprocessing, we do the following :
1. Remove brackets and the text within them. This is done by the regex rule :  \\([^()]\*\\) | \\[[^\[\]]*\\]
2. Remove all occurances of the text of the form : a1111... (these appear in some documents)
3. Remove occurances of 'et al.' followed by reference numbers if they occur
4. Remove occurances of these : &nbsp; j o u r n a l p r o o f &nbsp; , &nbsp; j o u r n a l p r e -p r o o f &nbsp; , &nbsp; a c c e p t e d m a n u s c r i p t
5. Remove all numbers, and these symbols : @  #  $  \  /  *  :  ;  %
The above 5 are done by using a regex pattern to find them and then substitute them with whitespace
6. Then the result is passed through a map which removes all the non ascii charecters
7. That is then passed through .split() and then joined to remove all the unnecessary whitespace which is created while the regex substitution

In [5]:
# function to do the process above
def clean(text):
    return " ".join( 
        ''.join(
            map(
                lambda x: x if ord(x)<128 else ' ', 
                re.sub(r'a1+|\([^()]*\)|\[[^\[\]]*\]|(et al\. [0-9]*)|j o u r n a l p r o o f|j o u r n a l p r e -p r o o f|a c c e p t e d m a n u s c r i p t|[0-9@#\$\\/\*,:;%]*','',text)
                )).split())

# applies the function to each line of the corpus, which is a single file each
# it saves the result with "preprocessed_" prefixed to the original file name
def prepreocess_corpus(file):
    corpus = open(file,'r',encoding="utf-8")
    preprocessed_corpus = open("preprocessed_"+file,'w',encoding="utf-8")

    line = corpus.readline()

    while True:
        if not line:  #stop when ended
            break
        preprocessed_corpus.write(clean(line)+'\n')
        line = corpus.readline()

In [6]:
prepreocess_corpus("corpus_covid.txt") # preprocessing the corpus

# Vocab count

We generate all the unigrams and add those to the counter. This gives us all the words and all their counts. These can be used for vocabulary and for calculating laplace smoothing for bigrams.

In [7]:
# function to generate all the unigrams from the file given
def gen_unigrams(file):
    x = Counter()   # Counter to store unigrams
    with open(file, 'r',encoding="utf-8") as f:
        for line in f:  # going line by line
             for sent in sent_tokenize(line): # extract sentences from each line
                x.update([i[0] for i in ngrams(word_tokenize(sent), 1)])  # create unigrams and add them to counter
    return x


In [8]:
# generate the unigrams of the preprocessed corpus
vocab_counts = gen_unigrams("preprocessed_corpus_covid.txt")

# dumping this into the uni_model file for future use
joblib.dump(vocab_counts, './models/uni_model.pickle')

In [9]:
# code block to load the file
vocab_counts = joblib.load('./models/uni_model.pickle')

In [10]:
# the size of vocab will be the number of entries in vocab_counts
print("Size of vocab is",len(vocab_counts))

Size of vocab is 1990740


# Reading vocab

Following code block generates the vocab as a list and dumps it into a file. This be be used for future reference as the bigram and trigram models don't directly save the words, but instead use the position of the word in this vocab to reduce their size in memory.

In [12]:
# code block to load the unigrams
uni = joblib.load('./models/uni_model.pickle')

In [13]:
# vocab is all the keys in above along with '<s>' and '</s>', which denote the start and stop of sentences
vocab = list(uni.keys())
vocab.extend(['<s>','</s>'])

# save this
joblib.dump(vocab, './models/vocab.pickle')

['./models/vocab.pickle']

In [14]:
# code block to load vocab
vocab = joblib.load('./models/vocab.pickle')

In [15]:
# create an inverted list so that position of word can be obtained faster
invert_vocab = dict((vocab[i],i) for i in range(len(vocab)))

# Bigram model

(_bigram and trigram were generated usning python scripts to speed up the process_)

For both, the bigram and trigram models, we load the corpus line-by-line, tokenize the sentences from that line, generate bi- or trigrams from that sentence with padded symbols and then add those grams to the counter after the words are converted to numbers using the inverted vocab

In [16]:
def gen_bigrams(file):
    x = Counter() 
    with open(file, 'r',encoding="utf-8") as f: 
        for line in f:   # reading line by line
             for sent in sent_tokenize(line):  # extract sentences from line
                gram = list(ngrams(word_tokenize(sent), 2,
                                    pad_left=True, pad_right=True,
                                    left_pad_symbol='<s>',
                                    right_pad_symbol='</s>'))
                x.update(list(map(lambda t: tuple(map(lambda y: invert_vocab[y], t)),gram)))  
                # changes the list of tuples of words to list of tuples of numbers which correspond to inverted_vocab
    return x

In [17]:
# generate the bigrams and dump the counter object into bi_model.pickle
bi = gen_bigrams("preprocessed_corpus_covid.txt")

filenm = "./models/bi_model.pickle"
joblib.dump(bi, filenm)

# Trigram model

In [18]:
def gen_trigrams(file):
    x = Counter()
    with open(file, 'r',encoding="utf-8") as f:
        for line in f:   # reading line by line
             for sent in sent_tokenize(line):  # extract sentences from line
                gram = list(ngrams(word_tokenize(sent), 3,
                                    pad_left=True, pad_right=True,
                                    left_pad_symbol='<s>',
                                    right_pad_symbol='</s>'))
                x.update(list(map(lambda t: tuple(map(lambda y: invert_vocab[y], t)),gram)))
                # changes the list of tuples of words to list of tuples of numbers which correspond to inverted_vocab
    return x

In [19]:
# generate the bigrams and dump the counter object into tri_model.pickle
tri = gen_trigrams("preprocessed_corpus_covid.txt")

filenm = "./models/tri_model.pickle"
joblib.dump(tri, filenm)

# Predictions

## Bigram

In [None]:
# code block to load unigram model
uni = joblib.load("./models/uni_model.pickle")

In [20]:
# loading the bigram model
bi = joblib.load("./models/bi_model.pickle")

# vocab size
v1 = len(uni.keys())

In [21]:
# takes bigrams in terms of numbers with the mapping in vocab 
# and returns the laplace smoothened probability of the bigram
def bi_prob(bigram):
    return ((bi[bigram]+1)/(uni[bigram[0]]+v1))

bi_predict takes in a string of the form 'x _ y', 'x \_' or '\_ x', takes probability of each of them happening and returns the 10 highest ones

In [22]:
# probability if the black is to the right
def bi_predict_right(word):
    word = invert_vocab[word]   # convert word to number
    # Take all the bigrams which match the first word to that before blank and take their probabilities
    # then takes the most common 10 from them
    counts =  Counter({i[1]:bi_prob(i) for i in bi.keys() if i[0]==word}).most_common(10)
    # converts the numbers back to word
    return list(map(lambda t1: (vocab[t1[0]],t1[1]),counts))


# probability if the blank is to the left
# function same as above, matching second word to the forst word after blank
def bi_predict_left(word):
    word = invert_vocab[word]
    counts =  Counter({i[0]:bi_prob(i) for i in bi.keys() if i[1]==word}).most_common(10)
    return list(map(lambda t1: (vocab[t1[0]],t1[1]),counts))


# takes a string, splits along with blank and check position of the blank
# if there is only one word, we take only one probability
# otherwise we combine both and return the highers 10
def bi_predict(blank):
    all=[]
    blank = blank.split()
    if(blank[1] == "_"):
        all +=  bi_predict_right(blank[0])
        if(len(blank)==3):
            all += bi_predict_left(blank[2])
    else:
        all = bi_predict_left(blank[1])
    all.sort(key=lambda x: x[1], reverse=True)
    return all[0:9]

In [23]:
# first sentence
print("all houses were ____ ventilated\n", *bi_predict("were _ ventilated"),sep='\n')


all houses were ____ ventilated

('not', 0.017188080814169605)
('used', 0.01670835970543617)
('also', 0.01301626530837779)
('performed', 0.012497362789716387)
('found', 0.010994906416709364)
('collected', 0.010508655072987934)
('obtained', 0.008821845143012146)
('observed', 0.008150737916553644)
('identified', 0.007983965761475632)


In [24]:
# second sentence
print("\n\ndevelop an integrated ____ to reach\n", *bi_predict("integrated _ to"),sep='\n')



develop an integrated ____ to reach

('due', 0.07302761787074154)
('<s>', 0.04589298451831982)
('used', 0.044714025940102674)
('compared', 0.041695550398344335)
('according', 0.03419381737444367)
('related', 0.02958296914715131)
('order', 0.025755749118418277)
('able', 0.023921255412560155)
('and', 0.02348724594874268)


In [25]:
print("\n\ndiagnosis and treatment ____ by involved\n", *bi_predict("treatment _ by"),sep='\n')



diagnosis and treatment ____ by involved

('followed', 0.02083345891477541)
('caused', 0.020452695982398506)
('of', 0.019566091001336185)
('<s>', 0.012695781468197756)
('.', 0.010257492188834302)
('characterized', 0.009442719792639923)
('determined', 0.009106663853642364)
('induced', 0.008085937892442007)
('affected', 0.007775500567628118)


In [26]:
print("\n\ninvolving non-health ____ stakeholders from . . .\n", *bi_predict("non-health _ stakeholder"),sep='\n')



involving non-health ____ stakeholders from . . .

('the', 6.731165295317319e-05)
('and', 6.17860695017933e-05)
('of', 5.726513758702794e-05)
('a', 3.365582647658659e-05)
('each', 3.265117493997207e-05)
('care', 2.4111636878748605e-05)
('to', 1.7581401890754193e-05)
('<s>', 1.7581401890754193e-05)
('different', 1.6576750354139668e-05)


In the above prediction, the 5th prediction seems to fit the most, but it is overshadowed by the the other more popular bigrams which occur with stakeholders

In [27]:
# third sentence
print("\n\nthis is because engineers do not work in ____ but rather as a team\n", *bi_predict("in _ but"), sep='\n')



this is because engineers do not work in ____ but rather as a team

('the', 0.5839969056732672)
('a', 0.10926640344796408)
('this', 0.07213197102584969)
('addition', 0.041320815375187114)
('patients', 0.03648291590061987)
('our', 0.02647256798979274)
('order', 0.023836864683484532)
('which', 0.02352391572982911)
('vitro', 0.021041421782854616)


## Trigram

All the functions listed below are similar to the ones used for bigrams

In [28]:
tri = joblib.load("./models/tri_model.pickle")

In [29]:
bi = joblib.load("./models/bi_model.pickle")
v2 = len(bi.keys())

In [30]:
def tri_prob(word):
    return ((tri[word]+1)/(bi[word[0:2]]+v2))

In [31]:
# if blank is to the right
def tri_predict_right(word):
    word = tuple(map(lambda x: invert_vocab[x], word))
    counts =  Counter({i[2]:tri_prob(i) for i in tri.keys() if i[0:2]==word}).most_common(10)
    return list(map(lambda t1: (vocab[t1[0]],t1[1]),counts))

# if blank is in the middle
def tri_predict_mid(word):
    word = tuple(map(lambda x: invert_vocab[x], word))
    counts =  Counter({i[2]:tri_prob(i) for i in tri.keys() if (i[0],i[2])==word}).most_common(10)
    return list(map(lambda t1: (vocab[t1[0]],t1[1]),counts))

# if blank is to the left
def tri_predict_left(word):
    word = tuple(map(lambda x: invert_vocab[x], word))
    counts =  Counter({i[2]:tri_prob(i) for i in tri.keys() if i[1:3]==word}).most_common(10)
    return list(map(lambda t1: (vocab[t1[0]],t1[1]),counts))

# takes strings of the form "x y _", "x y _ z", "x y _ z w", "y _ z w" or "_ z w"
def tri_predict(blank):
    all=[]
    blank = list(map(lambda x: x.split(),blank.split('_')))

    if(len(blank[0])==2):
        all += tri_predict_right(tuple(blank[0]))
    if(len(blank[0]) > 0 and len(blank[1]) > 0):
        all += tri_predict_mid((blank[0][-1],blank[1][0]))
    if(len(blank[1])==2):
        all += tri_predict_right(tuple(blank[1]))

    all.sort(key=lambda x: x[1], reverse=True)
    return all[0:9]

In [32]:
# first sentence
print("all houses were ____ ventilated\n", *tri_predict("houses were _ ventilated"), sep='\n')

all houses were ____ ventilated

('made', 1.3778844403226307e-07)
('built', 1.3778844403226307e-07)
('ventilated', 9.185906816415932e-08)
('investigated', 9.185896268817538e-08)
('malaria', 9.185896268817538e-08)
('contacted', 9.185896268817538e-08)
('tested', 9.185896268817538e-08)
('then', 9.185896268817538e-08)
('no', 9.185896268817538e-08)


'made ventilated' works, but the prediction of bigrams of 'not ventilated' is a better completion

In [33]:
# second sentence
print("\n\n. . . develop an integrated ____ to reach\n", *tri_predict("an integrated _ to reach"), sep='\n')



. . . develop an integrated ____ to reach

('the', 7.727704527842534e-05)
('a', 4.444692800327732e-05)
('their', 7.897594645210433e-06)
('an', 7.576180909649543e-06)
('approach', 7.394074599874934e-06)
('out', 6.290525967405984e-06)
('.', 5.509949752472395e-06)
('this', 5.326284760723315e-06)
('its', 5.1885360169115054e-06)


trigram model predicts 'approach' as a possible word, which is better than any suggestions of the bigram model

In [34]:
print("\n\n. . . diagnosis and treatment ____ by involving\n", *tri_predict("and treatment _ by involving"), sep='\n')



. . . diagnosis and treatment ____ by involving

('of', 0.00018251225083646476)
('.', 4.980527971769725e-05)
('with', 2.157463729706701e-05)
('for', 1.2348037091300056e-05)
('and', 1.1521774386305999e-05)
('strategies', 9.685635041874764e-06)
('in', 9.593828074653203e-06)
('is', 9.042986271323833e-06)
('are', 7.84949569744353e-06)


Both the models make some possible suggestions

In [35]:
print("\n\ninvolving non-health ____ stakeholders from . . .\n", *tri_predict("involving non-health _ stakeholders from"), sep='\n')



involving non-health ____ stakeholders from . . .

('the', 1.1941608108388649e-06)
('different', 5.970804054194324e-07)
('across', 5.511511434640915e-07)
('a', 3.674340956427277e-07)
('multiple', 3.674340956427277e-07)
('industry', 3.215048336873867e-07)
('diverse', 2.7557557173204575e-07)
('government', 2.2964630977670478e-07)
('health', 1.8371704782136384e-07)


here, industry is a possible guess

In [36]:
# third sentence
print("\n\nthis is because engineers do not work in ____ but rather as a team\n", *tri_predict("work in _ but rather"), sep='\n')



this is because engineers do not work in ____ but rather as a team

('the', 4.403638314445215e-05)
('this', 1.565840109724524e-05)
('to', 1.5200334205293694e-05)
('a', 1.478593886602043e-05)
('a', 1.3684893030747797e-05)
('the', 1.0286631002978209e-05)
('as', 5.418850260497449e-06)
('ensuring', 4.959258998541013e-06)
('that', 4.500401063802966e-06)


This has no good guesses, same as the bigram model

# Perplexity


In [37]:
from numpy import prod
# for product of lists

## Bigram

In [39]:
# takes a sentence, calculates the perplexity using bigram model
def bi_perplexity(sent):
    bigrams = list(ngrams(word_tokenize(sent), 2,
                                    pad_left=True, pad_right=True,
                                    left_pad_symbol='<s>',
                                    right_pad_symbol='</s>'))
    return prod([1/bi_prob(tuple(
        map(lambda x: invert_vocab[x] if x in vocab else -1, bg)) # creates an array of the bigram reciprocal probabilities of each of the bigram
        ) for bg in bigrams]) ** (1/len(bigrams))                 # multiples them all, and then raises it to the power 1/N

In [40]:
sent1 = "it appears that the overall code stroke volume has decreased since the covid- pandemic."
sent2 = "half a century ago hypertension was not treatable."
sent3 = "sarahs tv is broadcasting an advert for private healthcare"
bi_per = [bi_perplexity(sent) for sent in [sent1, sent2, sent3]]
print("perplecities using bigram model are:", *bi_per, sep='\n')

perplecities using bigram model are:
462.72741438050633
3273.700745887551
224632.82214701292


The perplexitis increase from sentence 1 to sentence 3

## Trigram

In [41]:
# takes a sentence, calculates the perplexity using trigram model
# function same as of bigram
def tri_perplexity(sent):
    trigrams = list(ngrams(word_tokenize(sent), 3,
                                    pad_left=True, pad_right=True,
                                    left_pad_symbol='<s>',
                                    right_pad_symbol='</s>'))
    return prod([1/tri_prob(tuple(
        map(lambda x: invert_vocab[x] if x in vocab else -1, bg))
        ) for bg in trigrams]) ** (1/len(trigrams))

In [42]:
sent1 = "it appears that the overall code stroke volume has decreased since the covid- pandemic."
sent2 = "half a century ago hypertension was not treatable."
sent3 = "sarahs tv is broadcasting an advert for private healthcare"
tri_per = [tri_perplexity(sent) for sent in [sent1, sent2, sent3]]
print("perplecities using bigram model are:", *tri_per, sep='\n')

perplecities using bigram model are:
52677.135507587234
374582.0995995889
12675616.670506928


The perplexities increase, and are also greater than those of the trigram model