# Training Gensim model on neuroscience papers

In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 5.0 MB/s 
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184508 sha256=0a176e8812e729d742df64d4e4c40ba974ffba4ae39004a9134369bcec11226f
  Stored in directory: /root/.cache/pip/wheels/f6/6f/b9/d798122a8b55b74ad30b5f52b01482169b445fbb84a11797a6
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [2]:
!pip install glove_python-binary

Collecting glove_python-binary
  Downloading glove_python_binary-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (948 kB)
[?25l[K     |▍                               | 10 kB 22.6 MB/s eta 0:00:01[K     |▊                               | 20 kB 28.7 MB/s eta 0:00:01[K     |█                               | 30 kB 11.2 MB/s eta 0:00:01[K     |█▍                              | 40 kB 10.1 MB/s eta 0:00:01[K     |█▊                              | 51 kB 5.4 MB/s eta 0:00:01[K     |██                              | 61 kB 6.0 MB/s eta 0:00:01[K     |██▍                             | 71 kB 5.6 MB/s eta 0:00:01[K     |██▊                             | 81 kB 6.3 MB/s eta 0:00:01[K     |███                             | 92 kB 4.9 MB/s eta 0:00:01[K     |███▌                            | 102 kB 5.3 MB/s eta 0:00:01[K     |███▉                            | 112 kB 5.3 MB/s eta 0:00:01[K     |████▏                           | 122 kB 5.3 MB/s eta 0:00:01[K     |████▌                   

In [3]:
from docx import Document
import nltk
nltk.download('punkt')
import re
from nltk import sent_tokenize
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')
import pickle
import numpy as np
import glob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from nltk.stem import WordNetLemmatizer
import nltk 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [36]:
from nltk.stem.porter import PorterStemmer

In [27]:
from glove import Corpus, Glove

In [5]:
!git clone 'https://github.com/igorbrigadir/stopwords.git'

Cloning into 'stopwords'...
remote: Enumerating objects: 149, done.[K
remote: Total 149 (delta 0), reused 0 (delta 0), pack-reused 149[K
Receiving objects: 100% (149/149), 85.27 KiB | 1.20 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [6]:
alir3z4_data = '/content/stopwords/en/alir3z4.txt'

more_stops = pd.read_csv('/content/stopwords/en/alir3z4.txt')
new_stops = list(more_stops["'ll"])

In [7]:
DOMAIN_STOPS = {'pubmed', 'et', 'al', 'page'}
STOPWORDS =  set(stopwords.words('english') + stopwords.words('german') +  stopwords.words('dutch') + stopwords.words('french') +  stopwords.words('spanish')  + new_stops) | DOMAIN_STOPS
STOPWORDS = set(STOPWORDS)

In [8]:
len(STOPWORDS)

2011

In [9]:
ROOT = "/content/drive/MyDrive/regen_x"

In [10]:
def get_docx(file_path):
    doc = []
    for para in Document(file_path).paragraphs:
        if para.text == "":
            continue
        doc += (sent_tokenize(para.text.lower())) # we lower text here
    return doc


def get_start_stop():
    domain_stops = {'pubmed', 'et', 'al', 'page'}
    with open('/content/stopwords/en/alir3z4.txt', 'r') as fn:
        new_stops = [line.strip() for line in fn.readlines()]
    STOPWORDS =  set(stopwords.words('english') + stopwords.words('german') +  stopwords.words('dutch') + stopwords.words('french') +  stopwords.words('spanish')  + new_stops) | domain_stops

    fn = glob.glob(ROOT + '/data/start-words/*')
    ALL_STARTS = [pickle.load(open(f , 'rb')) for f in fn]
    STARTWORDS = {}
    for f in ALL_STARTS:
      STARTWORDS.update(f)
    STARTWORDS = set(STARTWORDS.keys())

    assert(type(STOPWORDS)==set and type(STARTWORDS)==set)
    return (STARTWORDS, STOPWORDS)

In [11]:
STARTWORDS, STOPWORDS = get_start_stop()

# Optimizing Training Function

You **don't** want to do incremental training for the reasons given in [this answer](https://stackoverflow.com/questions/42746007/incremental-word2vec-model-training-in-gensim)

# Time Period Binning

In [12]:
from natsort import natsorted
import os

In [13]:
ROOT = "/content/drive/MyDrive/regen_x"
NUM_BINS = 16

In [14]:
def get_time_per_list(NUM_BINS):
    all_paths = natsorted(glob.glob(ROOT + '/data/ocr_paper_COMPREHENSIVE/*/'))
    all_path_chunked = np.array_split(all_paths , NUM_BINS)
    all_paths = np.array_split(all_paths , NUM_BINS)
    time_periods = {}
    time_per_list = []
    for i , file_chunk in enumerate(all_paths):
        time_periods[i] = file_chunk
        for j in range(len(time_periods[i])):
            time_periods[i][j] = time_periods[i][j].split('/')[-2]
        time_periods[i] = str(time_periods[i][0]) + '-' +  str(time_periods[i][-1])
        time_per_list.append(time_periods[i])
    return (time_per_list, all_path_chunked)

In [17]:
(time_per_list, all_path_chunked) = get_time_per_list(NUM_BINS)
(STOPWORDS, STARTWORDS) = get_start_stop()

In [18]:
all_path_chunked[0]

array(['/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1776/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1795/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1820/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1824/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1826/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1827/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1828/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1831/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1832/'],
      dtype='<U65')

In [30]:
# We need to split the document into sentences. 
# Then concatenate all the documents of a given year into one big array with all their sentences. 

# This functions takes a folder of files and returns one array with 
# all of the files processed sentences(which themselves are a list of words) as elements 
def get_proc_docs(year_path, STARTWORDS, STOPWORDS, max_papers=None, verbose=True, use_porter=False, useStopWords=True):
  file_paths = glob.glob(year_path + "*.docx")

  print("Number of files: {}".format(len(file_paths)))
  if len(file_paths) == 0:
    # raise Exception("Folder has no files - maybe drive was not mounted?")
    pass 
  ## -- Collecting Papers from Given Year -- ##
  proc_docs = [] 

  counter = 1
  length = len(file_paths)
  for f in file_paths:
    doc = get_docx(f)
    
    for sentence in doc:
      # don't think we need to remove stopwords and such if we're training embeddings 
      # do lemmatization here as well 

      proc_sentence = [] 
      if useStopWords:
        proc_sentence = [word for word in re.findall(r'\w+', sentence) if ((len(word) > 2) and (word not in STOPWORDS))]
      else:
        proc_sentence = [word for word in re.findall(r'\w+', sentence)]

      if use_porter:
        proc_sentence = do_stemming(proc_sentence) 
      else:
        proc_sentence = do_lemmatizing(proc_sentence) 
      proc_docs.append(proc_sentence)  

    if(verbose):
      print("\t{}/{}".format(counter, length))
    counter += 1

    if max_papers != None:
      if counter == max_papers+1:
        break 

  return proc_docs

def do_stemming(filtered):
	stemmed = []
	for f in filtered:
		stemmed.append(PorterStemmer().stem(f))
		#stemmed.append(LancasterStemmer().stem(f))
		#stemmed.append(SnowballStemmer('english').stem(f))
	return stemmed

# for lemmatization 
import spacy
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])

def do_lemmatizing(filtered):
  # convert list to string 
  spacy_parsed_text = nlp(" ".join(filtered)) 
  # Get the lemma for each token in the parsed text 
  
  # I wanted to keep pronouns so not taking lemma if it's a pronoun but if you want to remove pronouns use below commented line 
  # return " ".join([token.lemma_ for token in doc])

  # return as list of words again 
  return [token.lemma_ if token.lemma_ != '-PRON-' else token.lower_ for token in spacy_parsed_text]

In [31]:
# This functions takes a folder of files and returns one array with 
# all of the files processed sentences(which themselves are a list of words) as elements 
def get_proc_docs_glove(year_path, STARTWORDS, STOPWORDS, max_papers=None, verbose=True, use_porter=False, useStopWords=True):
  file_paths = glob.glob(year_path + "*.docx")


  print("Number of files: {}".format(len(file_paths)))
  if len(file_paths) == 0:
    # raise Exception("Folder has no files - maybe drive was not mounted?")
    pass 
  ## -- Collecting Papers from Given Year -- ##
  proc_docs = [] 

  counter = 1
  length = len(file_paths)
  for f in file_paths:
    doc = ' '.join(get_docx(f))
    # proc_doc = [word for word in re.findall(r'\w+', doc.lower()) if ((word in STARTWORDS) and (len(word) > 2) and (word not in STOPWORDS))]
    proc_doc = [] 
    if useStopWords:
      proc_doc = [word for word in re.findall(r'\w+', doc) if ((len(word) > 2) and (word not in STOPWORDS))]
    else:
      proc_doc = [word for word in re.findall(r'\w+', doc)]

    if use_porter:
      proc_doc = do_stemming(proc_doc)      
    else:
      proc_doc = do_lemmatizing(proc_doc)
    proc_docs.append(proc_doc)
    print("{}/{}".format(counter, length))
    counter += 1

    if max_papers != None:
      if counter == max_papers+1:
        break 

  return proc_docs

In [20]:
def train_glove(proc_docs):
  #Creating a corpus object
  corpus = Corpus() 

  #Training the corpus to generate the co occurence matrix which is used in GloVe
  corpus.fit(proc_docs, window=10)

  glove = Glove(no_components=5, learning_rate=0.05) 
  glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
  glove.add_dictionary(corpus.dictionary)
  # glove.save('glove.model')

  return glove 

In [32]:
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/WordEmbeddings/Models/GloVe/"

if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

In [44]:
### Train and Save Models for all time periods ##

counter = 0 
for time_period_paths in all_path_chunked:
  cur_time_period = time_per_list[counter] 
  print("Current Time Period: {}".format(cur_time_period))
  all_proc_docs_time_period = [] 

  for i, year_path in enumerate(time_period_paths):
    print("{}/{}".format(i+1, len(time_period_paths)))

    proc_doc_cur_year = get_proc_docs_glove(year_path, STARTWORDS, STOPWORDS, max_papers=None, verbose=True, use_porter=True, useStopWords=True) 

    all_proc_docs_time_period += all_proc_docs_time_period + proc_doc_cur_year 

  # Train the embeddings! 
  print("Training word embeddings for {}...".format(cur_time_period))
  model = train_glove(all_proc_docs_time_period) 

  try:
    print(model.most_similar("eye", number=10))
  except:
    print("Not in vocab")
  # Store just the words + their trained embeddings.
  # with open(MODEL_PATH + "GloVe_Stemmed/{}_Stemmed.txt".format(cur_time_period), "w") as f:
  #   for word in model.dictionary:
  #       f.write(word)
  #       f.write(" ")
  #       for i in range(0, 5):
  #           f.write(str(model.word_vectors[model.dictionary[word]][i]))
  #           f.write(" ")
  #       f.write("\n")

  # Manually create space 
  del all_proc_docs_time_period
  del model 

  counter += 1

Current Time Period: 1776-1832
1/9
Number of files: 1
1/1
2/9
Number of files: 1
1/1
3/9
Number of files: 1
1/1
4/9
Number of files: 2
1/2
2/2
5/9
Number of files: 1
1/1
6/9
Number of files: 1
1/1
7/9
Number of files: 2
1/2
2/2
8/9
Number of files: 1
1/1
9/9
Number of files: 2
1/2
2/2
Training word embeddings for 1776-1832...
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Not in vocab
Current Time Period: 1835-1846
1/9
Number of files: 2
1/2
2/2
2/9
Number of files: 1
1/1
3/9
Number of files: 0
4/9
Number of files: 1
1/1
5/9
Number of files: 1
1/1
6/9
Number of files: 1
1/1
7/9
Number of files: 1
1/1
8/9
Number of files: 1
1/1
9/9
Number of files: 2
1/2
2/2
Training word embeddings for 1835-1846...
Performing 30 training epochs with

KeyboardInterrupt: ignored

In [38]:
# Exceution of above function took: 19 mins and 42 seconds