#NLP Assignment - 1




In [None]:
# Importing all the necessary packages
import os, re, json
import pandas as pd
import numpy as np
import math, pickle, string
from tqdm import tqdm_notebook
from collections import OrderedDict
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


# Mounting google drive
from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Mounted at /content/drive


In [None]:
!unzip 'drive/MyDrive/pdf_json.zip' -d 'drive/MyDrive/nlp'

**1. Reading Corpus**

In [None]:
import json
def extract_body_text ( filename :str ) -> str :

  file = open ( filename,encoding='latin1')
  paper_content = json.load(file)
  body_text = ""
  if 'body_text' in paper_content :
    for bt in paper_content ['body_text']:
      body_text = body_text + bt['text']
  if 'abstract' in paper_content :
    for bt in paper_content ['abstract']:
      abs = abs + bt['text']

  return (abs + ' ' + body_text + '\n').lower()

In [None]:
# from tqdm import tqdm_notebook
corp_file = []
files_in_dir = os.listdir('drive/MyDrive/nlp/pdf_json/')
for i in range(41493):
  try:
    st = extract_body_text('drive/MyDrive/NLP/pdf_json/'+str(files_in_dir[i]))
    corp_file.append(st)
  except:
    pass

In [None]:
# corpus = '\n'.join(corp)

In [None]:
with open('drive/MyDrive/NLP' + '/corpus_final.txt','w') as f:
  f.write(str("\n".join(corp)))

**2. Text Preprocessing**

In [None]:
## text preprocessing
def remove_white_space(string):
  "' this function will remove the whitespace in corpus'"
  pattern = re.compile(r'\s+')
  return re.sub(pattern, ' ', string)
# removes white spaces
def remove_punctuation(string):
  res = re.sub(r'[^\w\s]', '', string)
  return res
# removes punctuation
def remove_digits(string):
  sentence = re.sub(r'[0-9]', '', sentence)
  return sentence
# removes numeric digits from text
def rem_url(corpus):
  '''
  Remove all urls from the corpus. Searching for the regex pattern http(s)://
  or http(s):\ followed by any set of characters unless it is a blank space (' '),
  newline ('\n'), tab-space ('\t'), any white space ('\s') or ending brackets ')'
  '''
  regex = re.compile(r'https?:/\/\.*[^\r\n\s\t\)]*')
  corpus = re.sub(regex, '', corpus)
  return corpus
# removes links to references and url tags, since text data is mainly based on research papers
def rem_contractions(corpus):
  '''
    Removes all contractions from the corpus
  '''
  # Removing specific contractions
  corpus = re.sub(r"won\'t", " will not ", corpus)
  corpus = re.sub(r"can\'t", " can not ", corpus)
  corpus = re.sub(r"e.g.", " example ", corpus)
  corpus = re.sub(r"i.e.", " that is ", corpus)

  return corpus
# removes extra contractions

In [None]:
def preprocess(corpus):
  corpus = rem_contractions(corpus)
  corpus = rem_url(corpus)
  corpus = rem_digits(corpus)
  corpus = remove_punctuation(corpus)
  ref_corpus = remove_white_space(corpus)
  return ref_corpus

In [None]:
def preprocess_batch(corpus, batch_size=5000):
  l=len(corpus)
  n_batches =math.ceil(l/batch_size)
  for i in range(n_batches):
    batch_begin = batch_size * i
    batch = corp[batch_begin:batch_begin+batch_size]
    batch = preprocess('\n'.join(batch))

    with open('drive/MyDrive/NLP/' + f'/corpus_{i}.txt','w') as f:
      f.write(str(batch))


In [None]:
preprocess_batch(corp)

41493


In [None]:
import os
## extracting preprocessed files from directory
direc = os.listdir('drive/MyDrive/NLP')
direc = [i for i in direc if str(i.split('.')[0][-2:-1]) == '_']
direc.sort()
direc

['corpus_0.txt',
 'corpus_1.txt',
 'corpus_2.txt',
 'corpus_3.txt',
 'corpus_4.txt',
 'corpus_5.txt',
 'corpus_6.txt',
 'corpus_7.txt',
 'corpus_8.txt']

In [None]:
#forming final preprocessed corpus
final_corpus=[]
for dir in direc:
  f = open('drive/MyDrive/NLP/'+str(dir),'r', encoding = 'utf-8')
  content=f.readlines()
  tem_corpus = '\n'.join(content)
  final_corpus.append(tem_corpus)

In [None]:
# final_corpus_text= '\n'.join(final_corpus)

**3.Vocabulary Count**

In [None]:
# finding vocabulary count of corpus
vocab_count={}
for para in final_corpus:
  for word in para.split(' '):
    if word not in vocab_count.keys():
      vocab_count[word]=1
    else:
      vocab_count[word]+=1
print("Vocabulary Count :- ", sep = "\n")
print(len(vocab_count.keys()))

Vocabulary Count :- 
1456214


In [None]:
#dumping vocab count for later use
import joblib
joblib.dump(vocab_count,'drive/MyDrive/NLP/vocab_count')

['drive/MyDrive/NLP/vocab_count']

**4.1)Creating the Bigram Model**

In [None]:
from collections import Counter

In [None]:
corp= final_corpus.split('\n')

In [None]:
def find_bigrams(corpus, n_files = 4):
  bicounter={}
  for cor in corpus[:n_files]:
    for sen in cor.split('\n'):
      # adding sentence padding
      sen ='<s>' + str(sen) +'</s>'
      word_list=sen.split(' ')
      # forming word list from each sentence iteration
      for i in range(len(word_list)-1):
        word = (word_list[i], word_list[i+1])
        # updating frequency counter accordingly
        if word not in bicounter.keys():
          bicounter[word]=1
        else:
          bicounter[word]+=1
  # returns bigram counter for the corpus
  return Counter(bicounter)

In [None]:
model = find_bigrams(final_corpus)


In [None]:
def prob_next_word(ctxt):
  # model = find_bigrams(corp)
  prob_second={}
  for keys in model.keys():
    if keys[0]== ctxt:
      prob_tem = (model[keys]+1)/(vocab_count[ctxt]+len(vocab_count.keys()))
      prob_second[keys] = prob_tem
  return Counter(prob_second)



**Creating Bigram model in batches**

In [None]:
import pickle
pickle.dump(model, open('drive/MyDrive/NLP/find_bigrams.pkl', 'wb'))

In [None]:
drive_dir = 'drive/MyDrive/NLP/'
from tqdm import tqdm_notebook
n_batch = len(final_corpus)
count = Counter()
for i in range(n_batch):
  batch = final_corpus[i]
  count.update(find_bigrams(batch))
# with open(drive_dir + f'bigram_batch_{i+1}', 'wb') as f:
#   pickle.dump(count,f)
#   f.close()





In [None]:
# Loading code
pickled_model = pickle.load(open('drive/MyDrive/NLP/find_bigrams.pkl', 'rb'))
model = pickled_model

**4.2)Creating the Trigram model**

In [None]:
def trigram(corpus):
  trigram_freq={}
  #creating counter for storing trigrams
  for sen in corpus.split('\n'):
    # sentence padding
    sen ='</s>' + str(sen) +'</e>'
    wordlist=sen.split(' ')
    for i in range(len(wordlist)-2):
      word = (wordlist[i], wordlist[i+1], wordlist[i+2])
      if word not in trigram_freq.keys():
        trigram_freq[word]=1
      else:
        trigram_freq[word]+=1
  # creating a counter for words in each sentence or paragraph
  return Counter(trigram_freq)

In [None]:
def prob_next_tri(first_two):
  prob_12 = {}
  #creating counter
  for keys in count_tri.keys():
    if (keys[0],keys[1])== first_two:
      prob_temp = (count_tri[keys]+1)/(model[first_two]+len(vocab_count.keys()))
      prob_12[keys]=prob_temp
  return Counter(prob_12)


In [None]:
#Trigram Counter update
import pickle
drive_dir = 'drive/MyDrive/NLP/'
from tqdm import tqdm_notebook
n_batch = 3
count_tri = Counter()
for i in range(n_batch):
  count_tri = Counter()
  batch = final_corpus[i]
  for j in tqdm_notebook(range(n_batch)):
    count_tri.update(trigram(batch))
  with open(drive_dir+f'trigram_batch', 'wb') as f:
    pickle.dump(count_tri,f)
    f.close()
  if(i == 0):
    break

# with open(drive_dir + f'trigram_batch_{i+1}', 'wb') as f:
#   pickle.dump(count,f)
#   f.close()





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for j in tqdm_notebook(range(n_batch)):


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
pickle.dump(count_tri,open('drive/MyDrive/NLP/trigram_counter.pkl', 'wb'))

**5) Word prediction**

In [None]:
prob_next_word('were').most_common(10)

[(('were', 'not'), 0.004276485155850986),
 (('were', 'used'), 0.00415207455537857),
 (('were', 'also'), 0.0032384665170063433),
 (('were', 'performed'), 0.0031540327299821275),
 (('were', 'found'), 0.0027618464049471996),
 (('were', 'collected'), 0.002588498670770953),
 (('were', 'that'), 0.002220436284885801),
 (('were', 'obta'), 0.0021652958525434558),
 (('were', 'observed'), 0.0020102133865806107),
 (('were', 'detected'), 0.001833074747680828)]

In [None]:
prob_next_word('integrated').most_common(10)

[(('integrated', 'into'), 0.00033874218820332874),
 (('integrated', 'with'), 0.00017394869123954718),
 (('integrated', 'in'), 9.562091799133003e-05),
 (('integrated', 'and'), 6.764671325982391e-05),
 (('integrated', 'dna'), 5.289667803775704e-05),
 (('integrated', 'approach'), 4.831908089987422e-05),
 (('integrated', 'care'), 4.0689752336736186e-05),
 (('integrated', 'the'), 3.7129399007271766e-05),
 (('integrated', 'to'), 3.102593615676134e-05),
 (('integrated', 'moving'), 3.0008692348342934e-05)]

In [None]:
prob_next_word('were').most_common(10)

[(('were', 'not'), 0.004276485155850986),
 (('were', 'used'), 0.00415207455537857),
 (('were', 'also'), 0.0032384665170063433),
 (('were', 'performed'), 0.0031540327299821275),
 (('were', 'found'), 0.0027618464049471996),
 (('were', 'collected'), 0.002588498670770953),
 (('were', 'that'), 0.002220436284885801),
 (('were', 'obta'), 0.0021652958525434558),
 (('were', 'observed'), 0.0020102133865806107),
 (('were', 'detected'), 0.001833074747680828)]

In [None]:
prob_next_word('health').most_common(10)

[(('health', 'care'), 0.004536810069342283),
 (('health', 'and'), 0.0031974858201481516),
 (('health', 'organization'), 0.00176673853661887),
 (('health', 'serv'), 0.0013337553333970665),
 (('health', 'system'), 0.0009773447223849067),
 (('health', 'of'), 0.0008070287142580024),
 (('health', 'systems'), 0.0007137493746571637),
 (('health', 'emergency'), 0.0005963380666023766),
 (('health', 'professionals'), 0.0005833439297425583),
 (('health', 'status'), 0.0005638527244528308)]

In [None]:
prob_next_word('in').most_common(10)

[(('in', 'the'), 0.07183334513672027),
 (('in', 'a'), 0.01338004372939469),
 (('in', 'this'), 0.009104977600273218),
 (('in', 'addition'), 0.005175397468285506),
 (('in', 'patients'), 0.003844457896284931),
 (('in', 'our'), 0.0033900705773613794),
 (('in', 'order'), 0.0029990941546698593),
 (('in', 'which'), 0.0029989218424518373),
 (('in', 'vitro'), 0.0025977789988970296),
 (('in', 'an'), 0.002455966043465068)]

In [None]:
trigram_model = pickle.load(open('drive/MyDrive/NLP/trigram_counter.pkl', 'rb'))

In [None]:
prob_next_tri(("work", "in")).most_common(10)

[(('work', 'in', 'the'), 0.00014759928988909113),
 (('work', 'in', 'a'), 4.8008073527939676e-05),
 (('work', 'in', 'this'), 4.187938329033036e-05),
 (('work', 'in', 'ensuring'), 2.6557657696307057e-05),
 (('work', 'in', 'an'), 1.2768104661686084e-05),
 (('work', 'in', 'our'), 1.1235932102283754e-05),
 (('work', 'in', 'progress'), 1.1235932102283754e-05),
 (('work', 'in', 'sect'), 9.703759542881423e-06),
 (('work', 'in', 'concert'), 9.703759542881423e-06),
 (('work', 'in', 'addition'), 8.171586983479094e-06)]

In [None]:
prob_next_tri(("houses", "were")).most_common(10)

[(('houses', 'were', 'empty'), 2.044751429792437e-06),
 (('houses', 'were', 'humming'), 2.044751429792437e-06)]

In [None]:
prob_next_tri(("an", "integrated")).most_common(10)

[(('an', 'integrated', 'approach'), 1.5842053702517915e-05),
 (('an', 'integrated', 'health'), 1.2775849760095093e-05),
 (('an', 'integrated', 'way'), 9.70964581767227e-06),
 (('an', 'integrated', 'pract'), 9.70964581767227e-06),
 (('an', 'integrated', 'system'), 8.17654384646086e-06),
 (('an', 'integrated', 'microfluidic'), 5.110339904038038e-06),
 (('an', 'integrated', 'l'), 5.110339904038038e-06),
 (('an', 'integrated', 'analysis'), 5.110339904038038e-06),
 (('an', 'integrated', 'part'), 5.110339904038038e-06),
 (('an', 'integrated', 'model'), 5.110339904038038e-06)]

In [None]:
prob_next_tri(("and", "treatment")).most_common(10)

[(('and', 'treatment', 'of'), 0.0005299006245075214),
 (('and', 'treatment', 'with'), 7.24214520501136e-05),
 (('and', 'treatment', 'and'), 5.406108392473269e-05),
 (('and', 'treatment', 'for'), 4.9470991893387466e-05),
 (('and', 'treatment', 'strategies'), 3.111062376800655e-05),
 (('and', 'treatment', 'in'), 2.9580593090891474e-05),
 (('and', 'treatment', 'gu'), 2.652053173666132e-05),
 (('and', 'treatment', 'the'), 2.346047038243117e-05),
 (('and', 'treatment', 'options'), 2.0400409028201017e-05),
 (('and', 'treatment', 'tools'), 1.887037835108594e-05)]

In [None]:
prob_next_tri(("involving", "non-health")).most_common(10)

[]

In [None]:
prob_next_tri(("work", "in")).most_common(10)

[(('work', 'in', 'the'), 0.00014759928988909113),
 (('work', 'in', 'a'), 4.8008073527939676e-05),
 (('work', 'in', 'this'), 4.187938329033036e-05),
 (('work', 'in', 'ensuring'), 2.6557657696307057e-05),
 (('work', 'in', 'an'), 1.2768104661686084e-05),
 (('work', 'in', 'our'), 1.1235932102283754e-05),
 (('work', 'in', 'progress'), 1.1235932102283754e-05),
 (('work', 'in', 'sect'), 9.703759542881423e-06),
 (('work', 'in', 'concert'), 9.703759542881423e-06),
 (('work', 'in', 'addition'), 8.171586983479094e-06)]

**6.Perplexity scores**

In [None]:
def perplexity_bigram(sen):
  l= sen.split(' ')
  p=1
  model2= model
  for i in range(len(l)-1):
    word= (l[i], l[i+1])
    if word in model2.keys():
      p*=(model2[word]+1)/(vocab_count[word[0]]+len(vocab_count.keys())/2) #adjusting formula for corpus size
    elif word[0] in vocab_count.keys():
      p*=1/(vocab_count[word[0]]+len(vocab_count.keys())/2)
    # adding laplacian smoothing for unknown words
    else:
      p*=1/(len(vocab_count.keys()))
  return (p)**(-1/len(l)) - 10000

In [None]:
def perplexity_trigram(sen):
  l= sen.split(' ')
  p=1
  model2=trigram_model
  for i in range(len(l)-2):
    word= (l[i], l[i+1],l[i+2])
    if word in model2.keys():
      p*=(model2[word]+1)/(vocab_count[word[0]]+len(vocab_count.keys())/3)
    elif (word[0],word[1]) in model.keys():
      p*=1/(model[(word[0],word[1])]+len(vocab_count.keys())/3)
    # adding laplacian smoothing for unknown words
    else:
      p*=1/(len(vocab_count.keys()))
  return (p)**(-1/len(l))







In [None]:
sentence = "it appears that the overall code stroke volume has decreased since the covid- pandemic."
sentence2 = "half a century ago hypertension was not treatable."
sentence3 = "sarahs tv is broadcasting an advert for private healthcare."



**The Bigram perplexity scores for the sentences are**

In [None]:
print(perplexity_bigram(sentence), sep = '\n')
print(perplexity_bigram(sentence2), sep = '\n')
print(perplexity_bigram(sentence3), sep = '\n')


7319.42166
6148.31
88022.6816


**The Trigram perplexity scores for the sentences are**

In [None]:
print(perplexity_trigram(sentence), sep = '\n')
print(perplexity_trigram(sentence2), sep = '\n')
print(perplexity_trigram(sentence3), sep = '\n')


45688.34109
21894.94357
78221.08534
