### Word2Vec using gensim
#### Here I use the gensim library to pretrain my own word embeddings with Word2Vec. To prevent information leakage, the notes belonging to the test set have been removed from the corpus. Two models are trained, one with a window size of 3 and one with a window size of 5, both producing 100-dimensional vectors.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

from nltk import word_tokenize
# from nltk.stem import PorterStemmer
# from nltk.stem import WordNetLemmatizer
from nltk.stem import *
from nltk.util import ngrams
import string
from nltk.corpus import stopwords
import re


from time import time


# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GlobalMaxPooling1D, Add, Concatenate, concatenate, Input
from keras.layers.embeddings import Embedding
from keras.callbacks import Callback
from keras.optimizers import Adam, Adadelta
# SKLearn
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from keras.callbacks import TensorBoard
import nltk
from gensim.models import Word2Vec

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Read only the patient/admission identifiers and the free-text note column.
# NOTE(review): the path is absolute and machine-specific; adjust it for your environment.
use_cols = ['SUBJECT_ID', 'HADM_ID', 'TEXT']
Notes = pd.read_csv(
    "/home/mimicuser/mimic3/NOTEEVENTS_NoTest.csv",
    usecols=use_cols,
    low_memory=False,
    engine="c",
)

In [4]:
def preprocess_text(df):
    """Blank out missing notes and flatten line breaks in the ``TEXT`` column.

    Missing values become a single space, and every newline ('\\n') and
    carriage return ('\\r') is replaced by a space.

    The frame is modified in place and the same object is returned.
    """
    notes = df.TEXT.fillna(' ')
    for line_break in ('\n', '\r'):
        notes = notes.str.replace(line_break, ' ')
    df.TEXT = notes
    return df

In [5]:
# Inspect the first note as loaded: the raw text still contains '\n' line breaks.
Notes.TEXT[0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [6]:
# Fill missing notes and replace line breaks with spaces (preprocess_text returns the frame it was given).
Notes = preprocess_text(Notes)

In [7]:
# Same note after preprocess_text: the line breaks have been replaced by spaces.
Notes.TEXT[0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]   Service: ADDENDUM:  RADIOLOGIC STUDIES:  Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis.  This also moderate-sized left pleural effusion.  HEAD CT:  Head CT showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history.  ABDOMINAL CT:  Abdominal CT showed lesions of T10 and sacrum most likely secondary to osteoporosis. These can be followed by repeat imaging as an outpatient.                                [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]  Dictated By:[**Hospital 1807**] MEDQUIST36  D:  [**2151-8-5**]  12:11 T:  [**2151-8-5**]  12:21 JOB#:  [**Job Number 1808**] '

In [8]:
# Built on first use and then shared by every clean_text call. Rebuilding the
# translation table and re-reading the stop-word list for each note is pure
# overhead when the function is mapped over the whole corpus.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (translation table, stop-word set) pair used by clean_text, building it once."""
    if not _CLEAN_TEXT_RESOURCES:
        punc_list = string.punctuation + '0123456789'
        _CLEAN_TEXT_RESOURCES['table'] = str.maketrans(dict.fromkeys(punc_list, " "))
        _CLEAN_TEXT_RESOURCES['stops'] = frozenset(stopwords.words("english"))
    return _CLEAN_TEXT_RESOURCES['table'], _CLEAN_TEXT_RESOURCES['stops']


def clean_text(text):
    """Normalise one note into a space-separated string of lower-case words.

    Steps:
      1. Lower-case the text and turn every ASCII punctuation character and
         digit into a space.
      2. Split on whitespace and drop English stop words and tokens shorter
         than three characters.
      3. Replace any remaining character outside the allowed class (in
         practice, non-ASCII and control characters) with a space.
      4. Apply a few abbreviation rewrites and collapse repeated whitespace.

    Parameters
    ----------
    text : str
        The raw note text.

    Returns
    -------
    str
        The cleaned text.
    """
    table, stops = _clean_text_resources()
    words = [w for w in text.lower().translate(table).split()
             if w not in stops and len(w) >= 3]
    text = " ".join(words)

    # After step 1 the text holds no ASCII punctuation or digits, so this only
    # removes non-ASCII/control characters. It can split a token into shorter
    # pieces, which are not re-filtered.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    # The original recipe also rewrote contractions ("what's", "n't", ...),
    # spaced out individual punctuation marks, expanded "<digits>k" and fixed
    # "9 11" / "e - mail". Every one of those patterns needs an apostrophe,
    # punctuation mark, digit or NUL character, none of which can be present
    # here, so they never matched and have been removed. Only the letter-only
    # rewrites below can still fire.
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): this pattern has no word boundaries, so it also joins a word
    # ending in "j" to a following word starting with "k" - confirm that is intended.
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [9]:
# Clean every note; clean_text takes a single string, so it can be passed to map directly.
Notes['TEXT'] = Notes['TEXT'].map(clean_text)

In [10]:
# Same note after clean_text: lower-cased, with punctuation, digits, stop words and short tokens removed.
Notes.TEXT[0]

'admission date discharge date service addendum radiologic studies radiologic studies also included chest confirmed cavitary lesions left lung apex consistent infectious process tuberculosis also moderate sized left pleural effusion head head showed intracranial hemorrhage mass effect old infarction consistent past medical history abdominal abdominal showed lesions sacrum likely secondary osteoporosis followed repeat imaging outpatient first name namepattern first name namepattern last name namepattern number dictated hospital medquist job job number'

In [11]:
# Tokenise each cleaned note into a list of words: one "sentence" per note for Word2Vec.
# - Iterate over the column values instead of `Notes.TEXT[i]`. That form is a
#   label lookup per row, which is slow for millions of notes and fails (or picks
#   the wrong rows) when the index is not exactly 0..n-1.
# - `split()` with no argument never produces empty-string tokens. `split(" ")`
#   returns [''] for a note that cleaned down to nothing and adds '' around any
#   leading or trailing space, which would put '' into the vocabulary.
Corpus = [note.split() for note in Notes.TEXT]

In [None]:
# Upper bound on vocabulary size, passed to the Keras Tokenizer as `num_words`.
vocabulary_size = 400000
tokenizer = Tokenizer(num_words= vocabulary_size)
# Build the tokenizer's word index from the cleaned notes.
tokenizer.fit_on_texts(Notes['TEXT'])
# Encode every note with the fitted tokenizer, one encoded sequence per note.
# NOTE(review): these are presumably integer word ids rather than word strings - confirm
# before passing them to anything that expects tokens.
sequences = tokenizer.texts_to_sequences(Notes['TEXT'])

In [24]:
# Number of encoded notes (one sequence per note).
len(sequences)

2083180

In [12]:
# Number of tokenised notes (one token list per row of Notes).
len(Corpus)

2083180

In [None]:
# Train the window-5 model.
# It is trained on `Corpus` (lists of word strings), the same input as the window-3
# model, so the two embeddings differ only in window size and are both keyed by words.
# Training on the Keras `sequences` instead would key the model by tokenizer ids and
# silently cap the vocabulary at the tokenizer's `num_words`.
# `window=5` is spelled out even though it matches the gensim default.
model_vec_5 = Word2Vec(Corpus, min_count=1, window=5)

In [20]:
# Persist the window-5 model in the current working directory.
# NOTE(review): despite the .bin extension this is presumably gensim's own save format
# (reload with Word2Vec.load), not the word2vec C binary format - confirm.
model_vec_5.save("Word2Vec_5.bin")

In [27]:
# Train the window-3 model on the tokenised notes in Corpus.
# min_count=1 keeps every word, including those that occur only once.
# The vector size is left at the library default.
model_vec_3 = Word2Vec(Corpus, min_count=1, window= 3)

In [28]:
# Persist the window-3 model in the current working directory.
# NOTE(review): despite the .bin extension this is presumably gensim's own save format
# (reload with Word2Vec.load), not the word2vec C binary format - confirm.
model_vec_3.save("Word2Vec_3.bin")