## Loading the data

In [None]:
#mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/MyDrive

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive


In [None]:
#import the required libraries for loading the file
import json
import zipfile
import os

In [None]:
#Extract all the files from zip folder
from zipfile import ZipFile
with ZipFile('pdf_json.zip','r') as z:
  z.extractall('/content/files/')
z.close()

In [None]:
#load the file
files = os.listdir('/content/files/pdf_json/')
len(files)

56529

In [None]:
#function to extract text from the key body_text to create a corpus
def extract_body_text(filename):
    file = open(filename)
    paper_content = json.load(file)
    body_text =""
    if 'body_text' in paper_content: #look at the text that comes after the key 'body_text'
        for bt in paper_content['body_text']:
            body_text = body_text +bt['text']
    return (body_text+'\n').lower()


In [None]:
#import the required libraries for preprocessing
import nltk
nltk.download('punkt')
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk import tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
#function to perform preprocessing
def data_preprocessing(text):
    text_low=text.lower() #converts the entire text to lower case
    text_cit=re.sub(r"[\[0-9]+]","",text_low) #removes the citations
    text_space = re.sub(' +', ' ', text_cit) #removes extra space
    text_cit=re.sub("[\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]+"," ",text_space) #remove the punctuations
    text_num = re.sub(r'[0-9]+', '', text_cit) #removes numbers
    text_dash = re.sub('\-|\+','',text_num)
    return text_dash

In [None]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#create a list of all the extracted english texts
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
corpus_list = []
for i in files:
    try:
        text = extract_body_text('/content/files/pdf_json/'+ i)
        if detect(text) == 'en': #look at only english texts
            corpus_list.append(data_preprocessing(text)) #preprocess each file
    except:
        pass

In [None]:
with open("NLP_Processed/processed.txt", 'w') as file: #save the preprocessed text in a text file
    for item in corpus_list:
        file.write(item + '\n')

In [None]:
#Number of text files in the corpus
len(corpus_list)

54822

In [None]:
n_files = len(corpus_list)

## Creating a Unigram Model and the vocabulary

In [None]:
from collections import Counter
from nltk import ngrams

#Create a dictionary storing the count of each word in the corpus
vocab = Counter()
for text in corpus_list:
    tokens = [token for token in text.split(" ") if token != ""] #split each file into unigrams
    for word in tokens:
        vocab.update({word:1})


In [None]:
#Get the size of the vocabulary
vocab_size = len(vocab)
vocab_size

821789

In [None]:
total_count = 0 #total number of words in the corpus
for w1,count in vocab.items():
    total_count+=count

In [None]:
#function to calculate the probability of occurence of each word
def unigram_prob(word):
  if word not in vocab.keys():  #performing laplace smoothing
    unigram_prob = 1/(total_count+vocab_size)
  else:
    unigram_prob = (vocab[word]+1)/(total_count+vocab_size) #P(w1) = (count(w1)+1)/(N+|V|)
  return unigram_prob

## Bigram Model on all the files

In [None]:
bigram_model = Counter() #counter to store the bigram counts
for i in range(n_files):
    process_text = corpus_list[i]
    bigrams_text = nltk.bigrams(process_text.split(" ")) #split each file into bigrams
    for words in bigrams_text:
        bigram_model.update({words:1}) #increases the count for each bigram

In [None]:
bigram_first = Counter() #counter to store the list of words that appear after some word w1
for i,j in bigram_model.items():
    w1 = i[0]
    w2 = i[1]
    list1 = [w2]
    if w1 not in bigram_first.keys():
        bigram_first[w1] = list1
    else:
        bigram_first[w1].append(w2)

In [None]:
#function to calculate the probability of each bigram
def bigram_prob(w1,w2):
  if (w1,w2) not in bigram_model.keys():
    bigram_prob = 1/(vocab[w1]+vocab_size) #laplace smoothing
  else:
    bigram_prob = (bigram_model[(w1,w2)]+1)/(vocab[w1]+vocab_size) #P(w2|w1) = (count(w1,w2)+1)/(count(w1)+|V|)
  return bigram_prob

In [None]:
import pickle

with open('NLP_Processed/bigram_model_file.pickle', 'wb') as f: #saving the bigram model
    pickle.dump(bigram_model, f)

In [None]:
with open('NLP_Processed/bigram_first_file.pickle', 'wb') as f: #saving the bigram second words
  pickle.dump(bigram_first,f)

In [None]:
#Function to predict the next word of a sentence
def predict_next_bigram(sentence):
    word2_list = Counter()
    sentence = data_preprocessing(sentence) #preprocess the sentence
    sent_tokens = word_tokenize(sentence)
    n = len(sent_tokens)
    if bigram_first[sent_tokens[n-1]]==0: #if no word appears after the previous word then return 0
      return 0
    for w_n in bigram_first[sent_tokens[n-1]]: #look at words coming after w_n-1
        word2_list.update({w_n: unigram_prob(sent_tokens[0])*bigram_prob(sent_tokens[n-1],w_n)})
        for i in range(1,n):
            word2_list[w_n] *= bigram_prob(sent_tokens[i-1],sent_tokens[i]) #use P(w1,w2,...w_n) = P(w1)P(w2|w1).....P(w_n|w_n-1)

    return word2_list.most_common(10)


In [None]:
#sentence 21
predict_next_bigram("all houses were")

[('not', 1.7251617712948942e-14),
 ('used', 1.6782872380634335e-14),
 ('also', 1.2920569907586064e-14),
 ('', 1.291112540990422e-14),
 ('performed', 1.2472204807111218e-14),
 ('found', 1.0959593941540319e-14),
 ('collected', 1.049482523982859e-14),
 ('obtained', 8.821160834841006e-15),
 ('observed', 8.195338593712591e-15),
 ('identified', 8.004460324774299e-15)]

In [None]:
#sentence 2
predict_next_bigram("it aims to develop an integrated")

[('into', 4.42129173749273e-20),
 ('with', 2.3017679175427988e-20),
 ('in', 1.3156525949512529e-20),
 ('and', 1.041038201318164e-20),
 ('dna', 7.988782360235311e-21),
 ('approach', 7.339693793466193e-21),
 ('the', 5.167743589277217e-21),
 ('to', 4.668444691762509e-21),
 ('care', 4.543619967383833e-21),
 ('moving', 3.619917006981625e-21)]

In [None]:
possible1 = []
for i in predict_next_bigram("it aims to develop an integrated"): #based on the gap the words that fill up the second gap
  possible1.append("it aims to develop an integrated "+i[0]+' to reach mmps exposed to malaria with prevention diagnosis and treatment')
predicted1 = []
for i in possible1:
  for j in predict_next_bigram(i):
    predicted1.append(j)

In [None]:
sorted1 = sorted(predicted1, key = lambda x: x[1])
top_10_1 = sorted1[-10:]
top_10_1

[('for', 9.947503619600165e-62),
 ('is', 1.0665808851660208e-61),
 ('and', 1.2079111538085919e-61),
 ('in', 1.3935217409130527e-61),
 ('with', 1.7345056441416422e-61),
 ('for', 1.87934974150911e-61),
 ('and', 2.282067543261063e-61),
 ('with', 3.276945511777266e-61),
 ('of', 4.302893718306996e-61),
 ('of', 8.129318175172942e-61)]

In [None]:
possible2 = []
for i in possible1:
  for j in top_10_1:
    possible2.append(i+' '+j[0]+" by involving non-health ")
predicted2 = []
 #based on the previous gaps the words that fill up the third gap
for i in possible2:
  for j in predict_next_bigram(i):
    predicted2.append(j)
sorted2 = sorted(list(set(predicted2)), key = lambda x: x[1])
top_10_2 = sorted2[-10:]
top_10_2

[('services', 8.654908489557309e-79),
 ('organization', 9.464091456963222e-79),
 ('system', 9.958182355239804e-79),
 ('care', 1.0670493948463225e-78),
 ('care', 1.228543850522251e-78),
 ('and', 1.6819986225658078e-78),
 ('organization', 1.788020242407041e-78),
 ('care', 2.7551060025264833e-78),
 ('and', 3.1777456911996464e-78),
 ('care', 5.205132816917198e-78)]

In [None]:
predict_next_bigram("this is because engineers do not work in")

[('the', 1.8222372751045392e-25),
 ('a', 3.4239926456058674e-26),
 ('', 3.0937223390173425e-26),
 ('this', 2.4620413318692996e-26),
 ('addition', 1.3828992948122097e-26),
 ('patients', 9.339450012332356e-27),
 ('our', 8.601503486469278e-27),
 ('order', 8.03802501183751e-27),
 ('which', 7.634386699425513e-27),
 ('vitro', 6.834119271419552e-27)]

In [None]:
#funtion to calculate perplexity of a sentence with bigram model
def perplexity_bigram(sentence):
    process_text = data_preprocessing(sentence)
    sent_tokens = word_tokenize(process_text)
    n = len(sent_tokens)
    prod = unigram_prob(sent_tokens[0]) #calculating the product of probablities
    for i in range(1,n):
        prod *= bigram_prob(sent_tokens[i-1],sent_tokens[i])  #calculating the probability
    perplexity_bigram = (1/prod)**(1/n)  #calculating the perplexity
    return perplexity_bigram

In [None]:
#sentence 1
perplexity_bigram('it appears that the overall code stroke volume has decreased since the covid- pandemic.')

1115.5835591162884

In [None]:
#sentence 2
perplexity_bigram('half a century ago hypertension was not treatable.')

5077.3920193259155

In [None]:
#sentence 3
perplexity_bigram('sarahs tv is broadcasting an advert for private healthcare.')


215654.68380117096

## Trigram Model on 10000 files

In [None]:
n_files = 10000

In [None]:
#counter to store the count of each trigram
trigram_model = Counter()
for i in range(n_files):
    process_text = corpus_list[i]
    trigrams_text = nltk.trigrams(process_text.split(" "))
    for words in trigrams_text:
        trigram_model.update({words:1})

In [None]:
trigram_first = Counter() #dictionary storing the count of w3 if (w1,w2) occured
for i,j in trigram_model.items():
    w1 = i[0]
    w2 = i[1]
    w3 = i[2]
    list1 = [w3]
    if (w1,w2) not in trigram_first.keys():
        trigram_first[(w1,w2)] = list1 #create a dictionary for each (w1,w2)
    else:
      trigram_first[(w1,w2)].append(w3)

In [None]:
#function calculating the probability of a trigram
#uses the equation: P(w1,w2,....wn) = P(w1)P(w2|w1)P(w3|w1,w2)...P(wn|w_n-2,w_n-1)
def trigram_prob(w1,w2,w3):
  if (w1,w2,w3) not in trigram_model.keys():
    trigram_prob = 1/(bigram_model[(w1,w2)]+vocab_size)
  else:
    trigram_prob = (trigram_model[(w1,w2,w3)]+1)/(bigram_model[(w1,w2)]+vocab_size)
  return trigram_prob

In [None]:
#Function to predict the next word of a sentence
def predict_next_trigram(sentence):
  word3_list = Counter()
  sentence = data_preprocessing(sentence) #preprocess the text
  sent_tokens = word_tokenize(sentence)
  n = len(sent_tokens)
  if trigram_first[(sent_tokens[n-2],sent_tokens[n-1])] == 0:  #if no word appears after the previous word then return 0
    return 0
  for w_n in trigram_first[(sent_tokens[n-2],sent_tokens[n-2])]:
      word3_list.update({w_n: unigram_prob(sent_tokens[0])*bigram_prob(sent_tokens[0],sent_tokens[1])*trigram_prob(sent_tokens[n-2],sent_tokens[n-1],w_n)})
      for i in range(2,n):
          word3_list[w_n] *= trigram_prob(sent_tokens[i-2],sent_tokens[i-1],sent_tokens[i])
  return word3_list.most_common(10)


In [None]:
#sentence 1
predict_next_trigram("all houses were")

In [None]:
#sentence 2
predict_next_trigram("it aims to develop an integrated")

[('and', 9.503699058615132e-30),
 ('mrna', 7.91974921551261e-31),
 ('angel', 7.91974921551261e-31),
 ('', 7.91974921551261e-31),
 ('would', 7.91974921551261e-31),
 ('an', 7.91974921551261e-31),
 ('interactive', 7.91974921551261e-31),
 ('essential', 7.91974921551261e-31),
 ('erent', 7.91974921551261e-31),
 ('m', 7.91974921551261e-31)]

In [None]:
possible1 = []
for i in predict_next_trigram("it aims to develop an integrated"): #based on the gap the words that fill up the second gap
  possible1.append("it aims to develop an integrated "+i[0]+' to reach mmps exposed to malaria with prevention diagnosis and treatment')
predicted1 = []
for i in possible1:
  for j in predict_next_trigram(i):
    predicted1.append(j)

sorted1 = sorted(predicted1, key = lambda x: x[1])
top_10_1 = sorted1[-10:]
top_10_1

[('on', 2.2612794439702113e-88),
 ('these', 2.2612794439702113e-88),
 ('a', 4.070302999146378e-88),
 ('will', 4.9748147767344645e-88),
 ('to', 7.688350109498716e-88),
 ('was', 9.045117775880845e-88),
 ('are', 1.1306397219851053e-87),
 ('the', 1.6281211996585512e-87),
 ('', 1.6281211996585512e-87),
 ('in', 2.3969562106084226e-87)]

In [None]:
predict_next_trigram("to reach mmps exposed to malaria with prevention diagnosis and treatment")

[('of', 8.64399824510451e-55),
 ('with', 1.0169409700122955e-55),
 ('for', 6.944962722035188e-56),
 ('in', 6.572911147640445e-56),
 ('is', 4.836670467131649e-56),
 ('the', 4.464618892736906e-56),
 ('', 4.464618892736906e-56),
 ('are', 3.100429786622853e-56),
 ('services', 2.72837821222811e-56),
 ('was', 2.480343829298282e-56)]

In [None]:
possible2 = []
for i in possible1:
  for j in top_10_1:
    possible2.append(i+' '+j[0]+" by involving non-health ")
predicted2 = []
 #based on the previous gaps the words that fill up the third gap
for i in possible2:
  for j in predict_next_trigram(i):
    predicted2.append(j)
sorted2 = sorted(list(set(predicted2)), key = lambda x: x[1])
top_10_2 = sorted2[-10:]
top_10_2

In [None]:
predict_next_trigram("this is because engineers do not work in")

[('and', 9.993780333687907e-40), ('associated', 4.9968901668439536e-40)]

In [None]:
#funtion to calculate perplexity of a sentence with bigram model
def perplexity_trigram(sentence):
    process_text = data_preprocessing(sentence)
    sent_tokens = word_tokenize(sentence)
    n = len(sent_tokens)
    prod = unigram_prob(sent_tokens[0])*bigram_prob(sent_tokens[0],sent_tokens[1])
    for i in range(2,n):
        prod *= trigram_prob(sent_tokens[i-2], sent_tokens[i-1],sent_tokens[i]) #calculatiung the probability
    perplexity_trigram = (1/prod)**(1/n)
    return perplexity_trigram

In [None]:
#sentence 1
perplexity_trigram("it appears that the overall code stroke volume has decreased since the covid- pandemic.")

83959.6184190862

In [None]:
#sentence 3
perplexity_trigram('half a century ago hypertension was not treatable.')

87466.6507931325

In [None]:
#sentence 3
perplexity_trigram('sarahs tv is broadcasting an advert for private healthcare.')

1430193.9754503905

In [None]:
with open('NLP_Processed/trigram_model_file.pickle', 'wt') as f: #saving the trigram_model
  pickle.dump(trigram_model,f)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-93-62917a568404>", line 1, in <module>
    with open('NLP_Processed/trigram_model_file.pickle', 'wt') as f:
OSError: [Errno 107] Transport endpoint is not connected: 'NLP_Processed/trigram_model_file.pickle'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2040, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_

OSError: ignored

In [None]:
#saving the trigram third word corresponding to first two
with open('NLP_Processed/trigram_first_file.pickle', 'wt') as f:
  pickle.dump(trigram_first,f)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-94-4a4b15e2beaf>", line 1, in <module>
    with open('NLP_Processed/trigram_first_file.pickle', 'wt') as f:
OSError: [Errno 107] Transport endpoint is not connected: 'NLP_Processed/trigram_first_file.pickle'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2040, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'OSError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_

OSError: ignored