# Training Gensim model on neuroscience papers

In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 14.6 MB/s 
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184508 sha256=1684792cd195253c2bc641c6bc19c91b6fd7819196a39e3740d0495861db6966
  Stored in directory: /root/.cache/pip/wheels/f6/6f/b9/d798122a8b55b74ad30b5f52b01482169b445fbb84a11797a6
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [2]:
!pip install glove_python-binary

Collecting glove_python-binary
  Downloading glove_python_binary-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (948 kB)
[?25l[K     |▍                               | 10 kB 23.1 MB/s eta 0:00:01[K     |▊                               | 20 kB 28.3 MB/s eta 0:00:01[K     |█                               | 30 kB 31.4 MB/s eta 0:00:01[K     |█▍                              | 40 kB 33.3 MB/s eta 0:00:01[K     |█▊                              | 51 kB 21.1 MB/s eta 0:00:01[K     |██                              | 61 kB 15.7 MB/s eta 0:00:01[K     |██▍                             | 71 kB 14.5 MB/s eta 0:00:01[K     |██▊                             | 81 kB 16.0 MB/s eta 0:00:01[K     |███                             | 92 kB 17.5 MB/s eta 0:00:01[K     |███▌                            | 102 kB 13.6 MB/s eta 0:00:01[K     |███▉                            | 112 kB 13.6 MB/s eta 0:00:01[K     |████▏                           | 122 kB 13.6 MB/s eta 0:00:01[K     |████▌           

In [3]:
from docx import Document
import nltk
nltk.download('punkt')
import re
from nltk import sent_tokenize
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')
import pickle
import numpy as np
import glob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from nltk.stem import WordNetLemmatizer
import nltk 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
!git clone 'https://github.com/igorbrigadir/stopwords.git'

Cloning into 'stopwords'...
remote: Enumerating objects: 149, done.[K
remote: Total 149 (delta 0), reused 0 (delta 0), pack-reused 149[K
Receiving objects: 100% (149/149), 85.27 KiB | 5.02 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [6]:
alir3z4_data = '/content/stopwords/en/alir3z4.txt'

more_stops = pd.read_csv('/content/stopwords/en/alir3z4.txt')
new_stops = list(more_stops["'ll"])

In [7]:
DOMAIN_STOPS = {'pubmed', 'et', 'al', 'page'}
STOPWORDS =  set(stopwords.words('english') + stopwords.words('german') +  stopwords.words('dutch') + stopwords.words('french') +  stopwords.words('spanish')  + new_stops) | DOMAIN_STOPS
STOPWORDS = set(STOPWORDS)

In [8]:
len(STOPWORDS)

2011

In [9]:
ROOT = "/content/drive/MyDrive/regen_x"

In [10]:
def get_docx(file_path):
    doc = []
    for para in Document(file_path).paragraphs:
        if para.text == "":
            continue
        doc += (sent_tokenize(para.text.lower())) # we lower text here
    return doc


def get_start_stop():
    domain_stops = {'pubmed', 'et', 'al', 'page'}
    with open('/content/stopwords/en/alir3z4.txt', 'r') as fn:
        new_stops = [line.strip() for line in fn.readlines()]
    STOPWORDS =  set(stopwords.words('english') + stopwords.words('german') +  stopwords.words('dutch') + stopwords.words('french') +  stopwords.words('spanish')  + new_stops) | domain_stops

    fn = glob.glob(ROOT + '/data/start-words/*')
    ALL_STARTS = [pickle.load(open(f , 'rb')) for f in fn]
    STARTWORDS = {}
    for f in ALL_STARTS:
      STARTWORDS.update(f)
    STARTWORDS = set(STARTWORDS.keys())

    assert(type(STOPWORDS)==set and type(STARTWORDS)==set)
    return (STARTWORDS, STOPWORDS)

In [11]:
STARTWORDS, STOPWORDS = get_start_stop()

# Training Gensim Model

In [12]:
# Splitting into array of sentences 
# Doing lemmatization concurrently to save time and space 

In [13]:
from gensim.models import Word2Vec

In [14]:
# For gensim we need to split the document into sentences. 
# Then concatenate all the documents of a given year into one big array with all their sentences. 

# This functions takes a folder of files and returns one array with 
# all of the files processed sentences(which themselves are a list of words) as elements 
def get_proc_docs(training_paper_year, STARTWORDS, STOPWORDS, max_papers=None, verbose=True):
  global_path = "/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/"
  folder_path = global_path + "{}/".format(training_paper_year)
  print(folder_path) 
  file_paths = glob.glob(folder_path + "*.docx")

  print("Number of files: {}".format(len(file_paths)))
  if len(file_paths) == 0:
    raise Exception("Folder has no files - maybe drive was not mounted?")
  ## -- Collecting Papers from Given Year -- ##
  proc_docs = [] 

  counter = 1
  length = len(file_paths)
  for f in file_paths:
    # gives it to us in sentences! 
    doc = get_docx(f)
    # proc_doc = [word for word in re.findall(r'\w+', doc.lower()) if ((word in STARTWORDS) and (len(word) > 2) and (word not in STOPWORDS))]

    sentence = [] 
    for sentence in doc:
      # don't think we need to remove stopwords and such if we're training embeddings 
      # do lemmatization here as well 
      proc_sentence = [WordNetLemmatizer().lemmatize(word) for word in re.findall(r'\w+', sentence)]
      proc_docs.append(proc_sentence) 

    if(verbose):
      print("{}/{}".format(counter, length))
    counter += 1

    if max_papers != None:
      if counter == max_papers+1:
        break 

  return proc_docs

In [15]:
# -- Helper Function -- # 
def get_num_files_in_year(training_paper_year):
  global_path = "/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/"
  folder_path = global_path + "{}/".format(training_paper_year)
  print(folder_path) 
  file_paths = glob.glob(folder_path + "*.docx")
  print(len(file_paths))

def get_num_files_in_path(folder_path):
  file_paths = glob.glob(folder_path + "*.docx")
  print(len(file_paths))

In [18]:
proc_docs = get_proc_docs(1997, STARTWORDS, STOPWORDS)

/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1997/
Number of files: 8
1/8
2/8
3/8
4/8
5/8
6/8
7/8
8/8


In [None]:
# Train the embeddings! 
model = Word2Vec(sentences=proc_docs) 

In [None]:
model.most_similar("nerve")

  """Entry point for launching an IPython kernel.


[('retina', 0.9995856285095215),
 ('cell', 0.9994325637817383),
 ('optic', 0.9993767738342285),
 ('is', 0.999322235584259),
 ('axon', 0.9993131756782532),
 ('regeneration', 0.9992615580558777),
 ('to', 0.9991578459739685),
 ('epithelium', 0.9991327524185181),
 ('ganglion', 0.999085009098053),
 ('retinal', 0.9990763068199158)]

# Optimizing Training Function

You **don't** want to do incremental training for the reasons given in [this answer](https://stackoverflow.com/questions/42746007/incremental-word2vec-model-training-in-gensim)

# Time Period Binning

In [None]:
from natsort import natsorted
import os

In [None]:
ROOT = "/content/drive/MyDrive/regen_x"
NUM_BINS = 16

In [None]:
def get_time_per_list(NUM_BINS):
    all_paths = natsorted(glob.glob(ROOT + '/data/ocr_paper_COMPREHENSIVE/*/'))
    all_path_chunked = np.array_split(all_paths , NUM_BINS)
    all_paths = np.array_split(all_paths , NUM_BINS)
    time_periods = {}
    time_per_list = []
    for i , file_chunk in enumerate(all_paths):
        time_periods[i] = file_chunk
        for j in range(len(time_periods[i])):
            time_periods[i][j] = time_periods[i][j].split('/')[-2]
        time_periods[i] = str(time_periods[i][0]) + '-' +  str(time_periods[i][-1])
        time_per_list.append(time_periods[i])
    return (time_per_list, all_path_chunked)

In [None]:
# For gensim we need to split the document into sentences. 
# Then concatenate all the documents of a given year into one big array with all their sentences. 

# This functions takes a folder of files and returns one array with 
# all of the files processed sentences(which themselves are a list of words) as elements 
def get_proc_docs(year_path, STARTWORDS, STOPWORDS, max_papers=None, verbose=True):
  file_paths = glob.glob(year_path + "*.docx")

  print("Number of files: {}".format(len(file_paths)))
  if len(file_paths) == 0:
    # raise Exception("Folder has no files - maybe drive was not mounted?")
    pass 
  ## -- Collecting Papers from Given Year -- ##
  proc_docs = [] 

  counter = 1
  length = len(file_paths)
  for f in file_paths:
    # gives it to us in sentences! 
    doc = get_docx(f)
    # proc_doc = [word for word in re.findall(r'\w+', doc.lower()) if ((word in STARTWORDS) and (len(word) > 2) and (word not in STOPWORDS))]

    sentence = [] 
    for sentence in doc:
      # don't think we need to remove stopwords and such if we're training embeddings 
      # do lemmatization here as well 
      proc_sentence = [WordNetLemmatizer().lemmatize(word) for word in re.findall(r'\w+', sentence)]
      proc_docs.append(proc_sentence) 

    if(verbose):
      print("\t{}/{}".format(counter, length))
    counter += 1

    if max_papers != None:
      if counter == max_papers+1:
        break 

  return proc_docs

In [None]:
(time_per_list, all_path_chunked) = get_time_per_list(NUM_BINS)
(STOPWORDS, STARTWORDS) = get_start_stop()

MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP - Lab/Models/"

if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

In [None]:
all_path_chunked[0]

array(['/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1776/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1795/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1820/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1824/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1826/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1827/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1828/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1831/',
       '/content/drive/MyDrive/regen_x/data/ocr_paper_COMPREHENSIVE/1832/'],
      dtype='<U65')

In [None]:
### Train and Save Models for all time periods ##

counter = 0 
for time_period_paths in all_path_chunked:
  cur_time_period = time_per_list[counter] 
  print("Current Time Period: {}".format(cur_time_period))
  all_proc_docs_time_period = [] 

  for i, year_path in enumerate(time_period_paths):
    print("{}/{}".format(i+1, len(time_period_paths)))

    proc_doc_cur_year = get_proc_docs(year_path, STARTWORDS, STOPWORDS) 

    all_proc_docs_time_period += all_proc_docs_time_period + proc_doc_cur_year 

  # Train the embeddings! 
  print("Training word embeddings for {}...".format(cur_time_period))
  model = Word2Vec(sentences=all_proc_docs_time_period) 

  # Store just the words + their trained embeddings.
  word_vectors = model.wv
  word_vectors.save(MODEL_PATH + "{}.wordvectors".format(cur_time_period))

  # Manually create space 
  del all_proc_docs_time_period
  del word_vectors 
  del model 

  counter += 1

Current Time Period: 1776-1832
1/9
Number of files: 1
	1/1
2/9
Number of files: 1
	1/1
3/9
Number of files: 1
	1/1
4/9
Number of files: 2
	1/2
	2/2
5/9
Number of files: 1
	1/1
6/9
Number of files: 1
	1/1
7/9
Number of files: 2
	1/2
	2/2
8/9
Number of files: 1
	1/1
9/9
Number of files: 2
	1/2
	2/2
Training word embeddings for 1776-1832...
Current Time Period: 1835-1846
1/9
Number of files: 2
	1/2
	2/2
2/9
Number of files: 1
	1/1
3/9
Number of files: 0
4/9
Number of files: 1
	1/1
5/9
Number of files: 1
	1/1
6/9
Number of files: 1
	1/1
7/9
Number of files: 1
	1/1
8/9
Number of files: 1
	1/1
9/9
Number of files: 2
	1/2
	2/2
Training word embeddings for 1835-1846...
Current Time Period: 1847-1859
1/8
Number of files: 0
2/8
Number of files: 3
	1/3
	2/3
	3/3
3/8
Number of files: 1
	1/1
4/8
Number of files: 1
	1/1
5/8
Number of files: 1
	1/1
6/8
Number of files: 2
	1/2
	2/2
7/8
Number of files: 2
	1/2
	2/2
8/8
Number of files: 3
	1/3
	2/3
	3/3
Training word embeddings for 1847-1859...
Current 

In [None]:
# Exceution of above function took: 42 minutes and 8 seconds