This file is responsible for individually training models on each year of literature we have in our corpus and saving those models for later analysis. 
- Trained a Gensim Model with Lemmatization and Removal of Stopwords 
- Trained a Gensim Model with Stemming and Removal of Stopwords 
- Trained a GloVe Model with Lemmatization and Removal of Stopwords 
- Trained a GloVe Model with Stemming and Removal of Stopwords 

In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 4.3 MB/s 
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184508 sha256=ac0e62aa83422580fd13ffbfdca2630fdf98f693b6d28bc689c8a48f9e125b8d
  Stored in directory: /root/.cache/pip/wheels/f6/6f/b9/d798122a8b55b74ad30b5f52b01482169b445fbb84a11797a6
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [None]:
!pip install glove_python-binary

Collecting glove_python-binary
  Downloading glove_python_binary-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (948 kB)
[?25l[K     |▍                               | 10 kB 20.1 MB/s eta 0:00:01[K     |▊                               | 20 kB 10.7 MB/s eta 0:00:01[K     |█                               | 30 kB 8.7 MB/s eta 0:00:01[K     |█▍                              | 40 kB 7.8 MB/s eta 0:00:01[K     |█▊                              | 51 kB 4.3 MB/s eta 0:00:01[K     |██                              | 61 kB 4.5 MB/s eta 0:00:01[K     |██▍                             | 71 kB 4.5 MB/s eta 0:00:01[K     |██▊                             | 81 kB 5.1 MB/s eta 0:00:01[K     |███                             | 92 kB 5.2 MB/s eta 0:00:01[K     |███▌                            | 102 kB 4.2 MB/s eta 0:00:01[K     |███▉                            | 112 kB 4.2 MB/s eta 0:00:01[K     |████▏                           | 122 kB 4.2 MB/s eta 0:00:01[K     |████▌                     

In [None]:
from docx import Document
import nltk
nltk.download('punkt')
import re
from nltk import sent_tokenize
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')
import pickle
import numpy as np
import glob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
!git clone 'https://github.com/igorbrigadir/stopwords.git'

Cloning into 'stopwords'...
remote: Enumerating objects: 149, done.[K
remote: Total 149 (delta 0), reused 0 (delta 0), pack-reused 149[K
Receiving objects: 100% (149/149), 85.27 KiB | 1.09 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [None]:
# Extra English stopwords taken from the igorbrigadir/stopwords checkout.
alir3z4_data = '/content/stopwords/en/alir3z4.txt'

# The file appears to be a plain word list (one entry per line, no header row):
# its first line is the contraction "'ll". Read it line by line instead of
# through pd.read_csv so that the first entry is kept as data rather than being
# consumed as a column name, and so that no entry is re-interpreted by pandas'
# NA parsing.
with open(alir3z4_data, encoding='utf-8') as stop_file:
  new_stops = [line.strip() for line in stop_file if line.strip()]

# Still exposed as a one-column frame in case another cell expects `more_stops`.
more_stops = pd.DataFrame({'stopword': new_stops})

In [None]:
# Extra tokens that are filtered out in addition to the language stopwords.
DOMAIN_STOPS = {'pubmed', 'et', 'al', 'page'}

# Union of NLTK's stopword lists for the five languages below, the extra
# stopwords held in `new_stops`, and the domain-specific tokens.
STOPWORDS = set().union(
    *(stopwords.words(language) for language in ('english', 'german', 'dutch', 'french', 'spanish')),
    new_stops,
    DOMAIN_STOPS,
)

In [None]:
len(STOPWORDS)

2011

In [None]:
'a' in STOPWORDS

True

In [None]:
ROOT = "/content/drive/MyDrive/regen_x"

In [None]:
# spaCy pipeline used by do_lemmatizing().
import spacy
# Load spaCy's 'en' model with the parser and NER components disabled; only
# token lemmas are read from the parsed text, so those components are not needed.
# NOTE(review): 'en' is a shortcut name; presumably it only resolves on
# spaCy 2.x (newer releases expect a full package name such as
# 'en_core_web_sm') - confirm before upgrading spaCy.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Gensim Model Function

In [None]:
from gensim.models import Word2Vec

In [None]:
def get_docx(file_path):
    """Read a .docx file and return its text as a flat list of sentences.

    Every paragraph whose text is not the empty string is lower-cased and
    split with NLTK's sent_tokenize; the sentences of all paragraphs are
    returned in document order.
    """
    sentences = []
    paragraph_texts = (paragraph.text for paragraph in Document(file_path).paragraphs)
    for text in paragraph_texts:
        if text != "":
            sentences.extend(sent_tokenize(text.lower()))
    return sentences


def get_proc_docs(training_paper_year_path, STOPWORDS, max_papers=None, verbose=True, use_porter=False, removeStopWords=True):
  """Turn every .docx paper in one year folder into tokenised training sentences.

  Args:
    training_paper_year_path: Folder prefix that "*.docx" is appended to, so it
      must end with a path separator.
    STOPWORDS: Collection of lower-case words dropped when removeStopWords is true.
    max_papers: Maximum number of papers to read; None reads all of them.
    verbose: Print a "i/total" progress line after each paper.
    use_porter: Stem the tokens with do_stemming() when true, otherwise
      lemmatize them with do_lemmatizing().
    removeStopWords: When true, drop stopwords and tokens of one or two
      characters; when false keep every word token.

  Returns:
    A list of sentences, each a non-empty list of normalised tokens. The list
    is empty when the folder holds no usable text.
  """
  # Sorted so that the papers selected by max_papers (and the sentence order)
  # do not depend on the arbitrary order glob returns.
  file_paths = sorted(glob.glob(training_paper_year_path + "*.docx"))
  length = len(file_paths)
  print("Number of files: {}".format(length))

  if max_papers is not None:
    # max(..., 0) keeps a zero or negative limit from meaning "all papers".
    file_paths = file_paths[:max(max_papers, 0)]

  ## -- Collecting Papers from Given Year -- ##
  proc_docs = []
  for counter, f in enumerate(file_paths, start=1):
    for sentence in get_docx(f):
      tokens = re.findall(r'\w+', sentence)
      if removeStopWords:
        tokens = [word for word in tokens if len(word) > 2 and word not in STOPWORDS]

      if use_porter:
        tokens = do_stemming(tokens)
      else:
        tokens = do_lemmatizing(tokens)

      # A sentence that lost all of its tokens carries no training signal, and
      # keeping it would make an all-stopword year look non-empty to callers
      # that test `proc_docs == []`.
      if tokens:
        proc_docs.append(tokens)

    if verbose:
      print("\t{}/{}".format(counter, length))

  return proc_docs

def do_stemming(filtered):
  """Return the Porter stem of every token in `filtered`, in the same order.

  Args:
    filtered: Iterable of word strings.

  Returns:
    A list with one stem per input token; an empty list for empty input.
  """
  if not filtered:
    return []
  # One stemmer instance is reused for the whole list instead of constructing
  # a new PorterStemmer for every single token.
  stemmer = PorterStemmer()
  return [stemmer.stem(token) for token in filtered]

def do_lemmatizing(filtered):
  """Lemmatize a list of tokens with the module-level spaCy pipeline `nlp`.

  The tokens are joined into one space-separated string and parsed; the lemma
  of each resulting spaCy token is collected into a list. Tokens whose lemma is
  the '-PRON-' placeholder keep their lower-cased surface form instead, so
  pronouns are preserved rather than collapsed.
  """
  lemmas = []
  for token in nlp(" ".join(filtered)):
    if token.lemma_ == '-PRON-':
      lemmas.append(token.lower_)
    else:
      lemmas.append(token.lemma_)
  return lemmas

# Training Gensim Model

In [None]:
# NOTE(review): MODEL_PATH is not referenced by any cell visible here; the save
# cell further down writes to `model_path`, whose folder name
# ("Gensim_Lemmatized_Removed_Stopwords") differs from this one - confirm which
# directory is intended.
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/SingleYear/Gensim_Lemmatized_Stopwords_Removed/"

In [None]:
from natsort import natsorted
import os

In [None]:
# One folder per year, in natural sort order. The trailing "/" in the pattern
# restricts the match to directories and leaves each path ending in "/", which
# get_proc_docs() (appends "*.docx") and the year lookup (split("/")[-2]) rely on.
all_paths = natsorted(glob.glob(ROOT + '/data/ocr_paper_COMPREHENSIVE/*/'))

In [None]:
# Number of years 
len(all_paths)

130

## Gensim Model - Lemmatization with Removal of StopWords

In [None]:
# Train one Word2Vec model per year folder (lemmatized, stopwords removed) and
# keep only its word vectors, keyed by the folder name (the year).
gensim_models = {}

for folder_path in all_paths:
  print(folder_path)
  proc_docs = get_proc_docs(folder_path, STOPWORDS, verbose=True, use_porter=False, removeStopWords=True)

  # Skip years without a single token. `any` also covers a list that only
  # holds empty sentences, which would leave Word2Vec with no vocabulary to
  # train on.
  if not any(proc_docs):
    continue

  gensim_model = Word2Vec(sentences=proc_docs, min_count=1)
  gensim_models[folder_path.split("/")[-2]] = gensim_model.wv
  
  

/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1776/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1795/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1820/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1824/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1826/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1827/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1828/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1831/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1832/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1835/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1837/
Number of file

In [None]:
# Compare the nearest neighbours of one word across the per-year models.
word = "nerve"
topn = 20

neighbours_by_year = {}
for year, wv in gensim_models.items():
  try:
    neighbours_by_year[year] = [neighbour for neighbour, _ in wv.most_similar(word, topn=topn)]
  except KeyError:
    # The word is missing from that year's vocabulary. Any other error is
    # allowed to surface instead of being reported as "Word not found".
    neighbours_by_year[year] = ["Word not found"] * topn

# Wrapping each column in a Series lets a year with fewer than `topn`
# neighbours be padded with NaN instead of failing on unequal column lengths.
df = pd.DataFrame({year: pd.Series(words) for year, words in neighbours_by_year.items()})

df

Unnamed: 0,1776,1795,1820,1824,1826,1827,1828,1831,1832,1835,1837,1840,1841,1842,1843,1845,1846,1850,1852,1855,1856,1857,1858,1859,1860,1863,1867,1868,1869,1870,1871,1874,1875,1877,1878,1879,1881,1883,1884,1885,...,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1949,1952,1954,1956,1973,1974,1977,1980,1981,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2014,2015
0,6th,affirm,jelly,recouvrent,scirrhoos,do,doctrine,Word not found,ether,Word not found,vascular,contraction,Word not found,Word not found,kitten,cheval,contraction,affirm,branch,incomplete,doctrine,distrilmlion,pupil,doctrine,fibre,direction,opinion,nejm,ether,progress,contraction,ncrvenrcgcncralion,Word not found,Word not found,fibre,kitten,systems,nerven,doctrine,neighboring,...,fibre,tissue,fibre,vitamin,fibre,day,day,component,atrophy,form,current,ted,1875,Word not found,direction,ted,Word not found,correlation,neighboring,Word not found,ted,detection,dyke,cns,ganglioside,day,growth,axonal,cell,axon,tectum,axon,axon,axon,crush,injury,injure,fiber,neuropathy,neuropathy
1,branch,direction,branch,wundartzeneykunst,pupil,pupil,pupil,Word not found,terminating,Word not found,fibre,doctrine,Word not found,Word not found,compact,incomplete,fibre,branch,tree,ether,ether,6th,branch,substantiate,branch,branch,cord,glio,congest,lead,current,gewi,Word not found,Word not found,branch,attainment,aim,regeneration,sclerose,tedious,...,cell,vitamin,day,spinal,fiber,fiber,fibre,fiber,fiber,process,record,direction,vorkomman,Word not found,variation,correlation,Word not found,systems,direction,Word not found,ritchie,correlation,correlation,branch,graft,axonal,axon,regeneration,axon,cell,day,crush,cell,regenerate,transect,observe,keywords,lesion,neuritis,neuropathies
2,express,sal,prevent,cette,patpd,wise,elliotson,Word not found,branch,Word not found,pressure,express,Word not found,Word not found,mucous,direction,muscle,direction,aim,lymphatics,direction,direction,contraction,6th,muscle,resophageal,time,11c1,pupil,reparation,influence,affirm,Word not found,Word not found,anterior,eyep,direction,nervennaht,6th,direction,...,day,plaque,spinal,virus,day,cell,vitamin,efferent,neuritis,increase,atory,aim,neighboring,Word not found,labelling,tedious,Word not found,direction,sal,Word not found,supported,yao,incomplete,1985,lesion,time,cell,cns,1989,optic,tract,develop,lesion,graft,tract,graft,graft,graft,electric,biomechanical
3,lead,tedious,perfectly,montrant,progress,unite,unite,Word not found,contraction,Word not found,brain,direction,Word not found,Word not found,publication,imper,ganglia,tree,contraction,revue,branch,branch,sympathetic,direction,tissue,entire,leg,comprchoncl,contraction,respects,head,ruit,Word not found,Word not found,cell,entire,branch,kerne,unintelligible,prevent,...,vitamin,condition,fiber,lesion,vitamin,thiamin,time,cranial,disk,function,rise,variation,entire,Word not found,justifie,stimulation,Word not found,tedious,lead,Word not found,tial,tree,neural,cell,cell,axon,cns,axon,cadherin,axonal,regeneration,lesion,regeneration,target,transection,rgc,regeneration,axon,nerves,retinotopic
4,entire,branch,entertain,pupil,lead,perior,perfectly,Word not found,neuralgia,Word not found,water,progress,Word not found,Word not found,sharp,stimulation,muscular,progress,mucous,cubic,procedure,express,aneurism,permanence,divide,oppose,limb,prevent,thegerminal,prevent,condition,lagcn,Word not found,Word not found,commissure,example,lenti,nervenmark,frece,contraction,...,brain,day,cord,mice,tissue,time,virus,preganglionic,tract,axon,fibre,jelly,universidad,Word not found,branch,contraction,Word not found,typewriter,aut,Word not found,neurosci,ramsey,aim,pns,gml,lesion,cone,cell,cns,growth,lesion,fiber,study,cell,chiasm,animal,piece,treatment,chiasm,disc
5,perfectly,express,grasp,éditeur,wise,nt1mber,black,Word not found,unite,Word not found,tube,6th,Word not found,Word not found,line,express,centre,callu,fibre,direction,throw,congest,cranial,express,nucleus,commissure,complete,lrnvo,progress,wliid1,direction,direction,Word not found,Word not found,pass,perfectly,a1ulgeuiculam,table,startiug,branch,...,condition,lesion,ration,olfactory,diet,pigeon,lesion,visceral,lesion,nucleus,phase,stimulation,mittheilungen,Word not found,agranoff,supported,Word not found,thor,neurosci,Word not found,lead,express,supported,oligodendrocyte,axon,study,axonal,crush,axonal,neurite,prior,axonal,cns,fiber,distal,axon,cns,regeneration,tectum,chiasm
6,unite,seve1,black,porta,seve1,person,invisible,Word not found,intervene,Word not found,time,attainment,Word not found,Word not found,fracture,lead,tissue,contraction,october,lead,commissure,pupil,pressure,senso,supply,difforent,day,spoken,athology,unite,centre,ncuen,Word not found,Word not found,ventricle,publication,pretension,degeneration,qpt,anastomotic,...,lesion,disease,sheath,day,lesion,lesion,skin,somatic,orbital,stump,authors,branch,800,Word not found,lead,11e,Word not found,branch,contraction,Word not found,neural,borgisser,postsynaptic,1984,neuron,protein,goldfish,gap43,regeneration,1990,myelination,day,develop,neuron,section,day,optic,day,disc,behbehani
7,throw,ultimate,invisible,tize,branch,oppose,person,Word not found,oppose,Word not found,grain,wise,Word not found,Word not found,assign,branch,influence,perfectly,voluminous,ductus,anu,progress,fibre,assemble,report,fourth,sensation,vo1,direction,perfectly,treatment,abgerundete,Word not found,Word not found,nucleus,rema,unite,neuralgia,tree,antiseptically,...,degeneration,diet,black,time,time,tissue,cell,conductor,head,movement,table,ohm,peel,Word not found,neural,mice,Word not found,assemble,neural,Word not found,prevent,respects,branch,outgrowth,cns,regeneration,fiber,segment,brain,fiber,oligodendrocyte,fibre,gene,study,optic,response,regenerate,study,33protection,linical
8,gentleman,lead,line,agitait,prevent,gregori,oppose,Word not found,morning,Word not found,branch,example,Word not found,Word not found,deposition,ignate,spinal,mesenteric,ether,branch,anterior,collyrium,neck,branch,power,future,head,throw,prevent,formative,fibre,quiescent,Word not found,Word not found,posterior,stomosis,mucous,operation,stimulation,collll,...,dixon,deficiency,tissue,cell,degeneration,muscle,animal,conduction,branch,lead,direction,twig,eur,Word not found,compact,table,Word not found,innervates,tial,Word not found,table,branch,lead,property,growth,optic,brain,1989,1990,day,segment,cns,retina,cns,sciatic,rgcs,axon,regenerate,neuropathies,imal
9,heal,perfectly,globular,obliterated,olight,sue,surg,Word not found,heal,Word not found,blood,connection,Word not found,Word not found,disposition,198,chord,congest,ted,assemble,fibre,ultimate,greatly,ultimate,capillary,gan,fatal,e11uclcation,branch,nucleolus,branch,vercinigung,Word not found,Word not found,layer,colourless,versity,condition,direction,express,...,myelin,nutrition,lesion,cord,unintelligible,brain,diet,243,degeneration,tion,ohm,sharp,compact,Word not found,mice,800,Word not found,compact,procedure,Word not found,198,neurosci,neurosci,fiber,1984,control,rat,lam,protein,regeneration,injury,observe,regenerate,regeneration,lesione,study,distance,model,larva,goldfish


In [None]:
model_path = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/SingleYear/Gensim_Lemmatized_Removed_Stopwords"

In [None]:
# Persist each year's word vectors as "<year>.kv" inside model_path.
# The folder is created first so a fresh Drive does not make the first save fail.
os.makedirs(model_path, exist_ok=True)
for key, value in gensim_models.items():
  value.save(os.path.join(model_path, '{}.kv'.format(key)))

## Gensim Model - Stemming with Removal of StopWords

In [None]:
# Train one Word2Vec model per year folder (Porter-stemmed, stopwords removed).
# NOTE(review): this rebinds `gensim_models`, and no visible cell saves the
# stemmed vectors - confirm whether they should be written to disk as well.
gensim_models = {}

year_counter = 0
num_years = 20  # training stops after this many non-empty years

for folder_path in all_paths:
  print(folder_path)
  proc_docs = get_proc_docs(folder_path, STOPWORDS, verbose=True, use_porter=True, removeStopWords=True)

  # Skip years without a single token. `any` also covers a list that only
  # holds empty sentences, which would leave Word2Vec with no vocabulary to
  # train on.
  if not any(proc_docs):
    continue

  gensim_model = Word2Vec(sentences=proc_docs, min_count=1)
  gensim_models[folder_path.split("/")[-2]] = gensim_model.wv

  year_counter += 1
  if year_counter == num_years:
    break
  
  

/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1776/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1795/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1820/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1824/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1826/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1827/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1828/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1831/
Number of files: 1
	1/1
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1832/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1835/
Number of files: 2
	1/2
	2/2
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1837/
Number of file

In [None]:
# Nearest neighbours of the stemmed query word in each per-year model.
word = "nerve"
stemmed_word = do_stemming([word])[0]

columns = {}
for year, wv in gensim_models.items():
  try:
    similar = wv.most_similar(stemmed_word, topn=10)
    columns[year] = [word_tuple[0] for word_tuple in similar]
  except KeyError:
    columns[year] = ["Word not found"] * 10
df = pd.DataFrame(columns)

df

Unnamed: 0,1776,1795,1820,1824,1826,1827,1828,1831,1832,1835,1837,1840,1841,1842,1843,1845,1846,1850,1852,1855
0,convert,figur,figur,progressif,convert,ving,figur,Word not found,symptom,Word not found,matter,muscl,Word not found,medicin,head,chaussier,muscl,minut,convert,245
1,minut,symptom,minut,minut,asspa,minut,drawn,Word not found,purpos,Word not found,globul,mass,Word not found,370,trace,minut,contract,head,consider,cholesterin
2,ration,drawn,drawn,osseus,figur,thicken,head,Word not found,natur,Word not found,structur,organ,Word not found,regener,complet,245,spinal,equal,complet,mass
3,symptom,minut,equal,complet,fun,head,symptom,Word not found,oblig,Word not found,figur,natur,Word not found,gunther,physiolog,mass,forc,natur,equal,exit
4,head,equal,mass,mass,produ,cica,erat,Word not found,previou,Word not found,organ,retch,Word not found,proquest,prove,physiolog,fibr,middlesex,black,minut
5,drawn,standard,head,head,tractu,purpos,superiorli,Word not found,complet,Word not found,substanc,supposit,Word not found,union,hole,prolong,muscular,complet,lingual,equal
6,debil,natur,purpos,_veau,ration,tnere,black,Word not found,foot,Word not found,minut,spinal,Word not found,1842,ischiat,forc,ventricl,mass,prolong,prolong
7,consider,contriv,black,cfrconseript,trom,operandi,natur,Word not found,consider,Word not found,tube,opposit,Word not found,surgeri,divid,chiasma,minut,organ,spinal,drawn
8,fif,physiolog,natur,symptom,minut,natur,complet,Word not found,dose,Word not found,spinal,previou,Word not found,1855,duce,ration,action,connexion,exert,lingual
9,complet,fre,complet,lue,head,physiolog,previou,Word not found,prove,Word not found,nervou,equal,Word not found,journal,wholli,equal,cell,papillari,minut,pfeuffer


## GloVe Model Lemmatization and Removal of StopWords

In [None]:
from glove import Corpus, Glove

In [None]:
def get_proc_docs_glove(training_paper_year_path, STOPWORDS, max_papers=None, verbose=True, use_porter=False, removeStopWords=True):
  """Turn every .docx paper in one year folder into one token list per paper.

  Unlike get_proc_docs(), the sentences of a paper are joined into a single
  string first, so each returned element is a whole document.

  Args:
    training_paper_year_path: Folder prefix that "*.docx" is appended to, so it
      must end with a path separator.
    STOPWORDS: Collection of lower-case words dropped when removeStopWords is true.
    max_papers: Maximum number of papers to read; None reads all of them.
    verbose: Print a "i/total" progress line after each paper.
    use_porter: Stem the tokens with do_stemming() when true, otherwise
      lemmatize them with do_lemmatizing().
    removeStopWords: When true, drop stopwords and tokens of one or two
      characters; when false keep every word token.

  Returns:
    A list with one list of normalised tokens per processed paper.
  """
  # Sorted so that the papers selected by max_papers do not depend on the
  # arbitrary order glob returns.
  file_paths = sorted(glob.glob(training_paper_year_path + "*.docx"))
  length = len(file_paths)
  print("Number of files: {}".format(length))

  if max_papers is not None:
    # max(..., 0) keeps a zero or negative limit from meaning "all papers".
    file_paths = file_paths[:max(max_papers, 0)]

  ## -- Collecting Papers from Given Year -- ##
  proc_docs = []
  for counter, f in enumerate(file_paths, start=1):
    doc = ' '.join(get_docx(f))

    proc_doc = re.findall(r'\w+', doc)
    if removeStopWords:
      proc_doc = [word for word in proc_doc if len(word) > 2 and word not in STOPWORDS]

    if use_porter:
      proc_doc = do_stemming(proc_doc)
    else:
      proc_doc = do_lemmatizing(proc_doc)

    proc_docs.append(proc_doc)

    # Progress output now honours `verbose`, as get_proc_docs() does.
    if verbose:
      print("{}/{}".format(counter, length))

  return proc_docs

In [None]:
def train_glove(proc_docs, no_components=5, window=10, learning_rate=0.05, epochs=30, no_threads=4, verbose=True):
  """Fit a GloVe model on tokenised documents and return it.

  Args:
    proc_docs: Iterable of token lists (one per document).
    no_components: Value passed to Glove(no_components=...).
    window: Value passed to Corpus.fit(window=...) when building the
      co-occurrence matrix.
    learning_rate: Value passed to Glove(learning_rate=...).
    epochs: Number of training epochs passed to Glove.fit.
    no_threads: Number of threads passed to Glove.fit.
    verbose: Passed to Glove.fit.

  Returns:
    The fitted Glove object with the corpus dictionary attached, so words can
    be looked up by string.

  The defaults reproduce the values that were previously hard-coded.
  """
  # Build the co-occurrence matrix that GloVe is trained on.
  corpus = Corpus()
  corpus.fit(proc_docs, window=window)

  glove = Glove(no_components=no_components, learning_rate=learning_rate)
  glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=verbose)
  # Attach the word -> id mapping; without it the model only knows row indices.
  glove.add_dictionary(corpus.dictionary)

  return glove

In [None]:
# Train one GloVe model per year folder (lemmatized, stopwords removed),
# keyed by the folder name, stopping after num_years non-empty years.
glove_models = {}

num_years = 20
year_counter = 0

for folder_path in all_paths:
  print(folder_path)
  proc_docs = get_proc_docs_glove(folder_path, STOPWORDS, verbose=True, use_porter=False, removeStopWords=True)

  if proc_docs != []:
    glove_model = train_glove(proc_docs)
    glove_models[folder_path.split("/")[-2]] = glove_model

    year_counter += 1
    if year_counter == num_years:
      break

/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1776/
Number of files: 1
1/1
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1795/
Number of files: 1
1/1
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1820/
Number of files: 1
1/1
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 

In [None]:
glove_models

{'1776': <glove.glove.Glove at 0x7f5729709550>,
 '1795': <glove.glove.Glove at 0x7f5736613210>,
 '1820': <glove.glove.Glove at 0x7f5729a278d0>,
 '1824': <glove.glove.Glove at 0x7f572a2e3c10>,
 '1826': <glove.glove.Glove at 0x7f572a946790>,
 '1827': <glove.glove.Glove at 0x7f5728c63050>,
 '1828': <glove.glove.Glove at 0x7f57294cee10>,
 '1831': <glove.glove.Glove at 0x7f5728dcf110>,
 '1832': <glove.glove.Glove at 0x7f572a2c8250>,
 '1835': <glove.glove.Glove at 0x7f5729d74f10>,
 '1837': <glove.glove.Glove at 0x7f572a6b7dd0>,
 '1840': <glove.glove.Glove at 0x7f572929fa90>,
 '1841': <glove.glove.Glove at 0x7f5728e1d090>,
 '1842': <glove.glove.Glove at 0x7f572a34c310>,
 '1843': <glove.glove.Glove at 0x7f572a56d150>,
 '1845': <glove.glove.Glove at 0x7f572a2d36d0>,
 '1846': <glove.glove.Glove at 0x7f5728f30dd0>,
 '1850': <glove.glove.Glove at 0x7f572a35a110>,
 '1852': <glove.glove.Glove at 0x7f5729747f10>,
 '1855': <glove.glove.Glove at 0x7f572a714990>}

In [None]:
# Compare the nearest neighbours of one word across the per-year GloVe models.
word = "nerve"
number = 10

# The placeholder column was sized 9 for number=10, which suggests that
# most_similar returns number - 1 neighbours; keep the two values tied together.
not_found = ["Word not found"] * (number - 1)

neighbours_by_year = {}
for year, model in glove_models.items():
  try:
    neighbours_by_year[year] = [neighbour for neighbour, _ in model.most_similar(word, number=number)]
  except Exception:
    # NOTE(review): presumably glove reports an unknown word with a plain
    # Exception - confirm. Unlike a bare `except:`, this no longer swallows
    # KeyboardInterrupt / SystemExit.
    neighbours_by_year[year] = list(not_found)

# Wrapping each column in a Series pads shorter columns with NaN instead of
# failing on unequal column lengths.
df = pd.DataFrame({year: pd.Series(words) for year, words in neighbours_by_year.items()})

df

Unnamed: 0,1776,1795,1820,1824,1826,1827,1828,1831,1832,1835,1837,1840,1841,1842,1843,1845,1846,1850,1852,1855
0,divide,divide,loose,optic,gradually,human,optic,Word not found,delpech,Word not found,termination,muscle,Word not found,proqu,pneumogastric,contract,supply,colom,tube,investigating
1,bark,weight,diffuse,sanguins,existence,pupil,repertus,Word not found,precursory,Word not found,optic,movement,Word not found,370,sensation,pb1,large,capable,flouren,varicose
2,march,continue,tint,true,hean,difliculty,nasal,Word not found,compound,Word not found,tubule,body,Word not found,surgery,power,distinctive,deposition,continue,normal,swell
3,rib,perish,account,particuliere,rite,c_reature,eye,Word not found,fever,Word not found,retina,exclusively,Word not found,medicine,satisfactory,piper,langston,independent,frog,ness
4,experiment,hitherto,soluble,successivemcnt,ope,substance,furnish,Word not found,partially,Word not found,cerebral,sensation,Word not found,western,mucous,demon,vesicle,eady,divide,mammalia
5,fear,gradually,glassy,degree,distil,gmshe,be,Word not found,ick,Word not found,view,eye,Word not found,1855,adduced,exception,interior,ent,spinal,undoubtedly
6,large,reproduction,ingredi,mencent,mble,hydrocyanic,perverseness,Word not found,tyne,Word not found,egg,eyeball,Word not found,journal,render,tar,pass,duct,disorganize,dehiscence
7,dog,subject,tlze,fais,call,excellenc,correspondence,Word not found,question,Word not found,individual,motor,Word not found,1842,cellular,inferior,white,diaphragm,take,approximate
8,spinal,function,corpora,indispensable,rie,ment,sigl1,Word not found,comlyle,Word not found,brain,involuntary,Word not found,union,contrary,166,exist,hypoglossal,liquid,ossify


## GloVe Model Stemming and Removal of StopWords

In [None]:
# Train one GloVe model per year folder (Porter-stemmed, stopwords removed),
# keyed by the folder name, stopping after num_years non-empty years.
glove_models = {}

num_years = 20
year_counter = 0

for folder_path in all_paths:
  print(folder_path)
  proc_docs = get_proc_docs_glove(folder_path, STOPWORDS, verbose=True, use_porter=True, removeStopWords=True)

  if proc_docs != []:
    glove_model = train_glove(proc_docs)
    glove_models[folder_path.split("/")[-2]] = glove_model

    year_counter += 1
    if year_counter == num_years:
      break

/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1776/
Number of files: 1
1/1
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1795/
Number of files: 1
1/1
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1820/
Number of files: 1
1/1
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 

In [None]:
# Compare the nearest neighbours of the stemmed query word across the
# per-year GloVe models.
word = "nerve"
number = 10
stemmed_word = do_stemming([word])[0]  # the models were trained on stems

# The placeholder column was sized 9 for number=10, which suggests that
# most_similar returns number - 1 neighbours; keep the two values tied together.
not_found = ["Word not found"] * (number - 1)

neighbours_by_year = {}
for year, model in glove_models.items():
  try:
    neighbours_by_year[year] = [neighbour for neighbour, _ in model.most_similar(stemmed_word, number=number)]
  except Exception:
    # NOTE(review): presumably glove reports an unknown word with a plain
    # Exception - confirm. Unlike a bare `except:`, this no longer swallows
    # KeyboardInterrupt / SystemExit.
    neighbours_by_year[year] = list(not_found)

# Wrapping each column in a Series pads shorter columns with NaN instead of
# failing on unequal column lengths.
df = pd.DataFrame({year: pd.Series(words) for year, words in neighbours_by_year.items()})

df

Unnamed: 0,1776,1795,1820,1824,1826,1827,1828,1831,1832,1835,1837,1840,1841,1842,1843,1845,1846,1850,1852,1855
0,instantli,divid,sinu,optic,amber,suppli,optic,Word not found,pneumonia,Word not found,organ,suppli,Word not found,surgeri,filament,prefer,arteri,uncertain,alter,286
1,divid,eighth,breve,relat,rnatrli,gener,culiar,Word not found,se1,Word not found,brain,bodi,Word not found,1842,haighton,dura,pass,absenc,tube,grain
2,experi,perfect,prove,caiss,re1,nation,contrari,Word not found,vember,Word not found,optic,eyebal,Word not found,1855,oper,mater,termin,sympathet,paper,557
3,weaker,pair,sir,nombreux,day,cicatrix,admiss,Word not found,lid,Word not found,recogn,eye,Word not found,gunther,rest,cranial,excitor,neck,observ,abort
4,sore,demonstr,chang,forme,tumonr,medicin,c0l,Word not found,whic,Word not found,chord,muscl,Word not found,journal,vagu,chip,motor,resid,normal,epitheli
5,suffici,lost,yellowish,mier,easp,rnner,later,Word not found,difi,Word not found,delic,power,Word not found,370,observa,curv,furnish,local,divid,manifestli
6,divis,vascular,structur,ren,theu,rapidli,red,Word not found,puls,Word not found,structur,movement,Word not found,regener,develop,surfac,tube,divid,action,secret
7,quiet,quentli,coaccrvata,snffit,oculi,live,exist,Word not found,space,Word not found,cerebr,consequ,Word not found,western,quir,perfeclli,white,capillari,fibr,nutrit
8,nerv_,explan,action,génération,lig,natur,aliquando,Word not found,exist,Word not found,appear,motor,Word not found,union,cut,phili,suppli,precis,tubular,differ
