In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import tensorflow as tf
import os
import sys
import logging
import argparse
from smart_open import smart_open
import gensim
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook
import inflect 
from gensim.models.fasttext import FastText
from scipy.spatial.distance import cosine
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [None]:
# Preprocessing for fastText: shared resources used by the helper functions below.
# WordNet data is required by WordNetLemmatizer.
nltk.download("wordnet")
# Lemmatizer used by lemmetize().
lemmatizer = WordNetLemmatizer()
# Stopword lists are required by stopwords.words().
nltk.download('stopwords')
# inflect engine used by convert_number() to spell out digits.
p = inflect.engine() 
# English stopwords consulted by remove_stopwords().
stops = list(stopwords.words("english"))
# Individual characters stripped by remove_others().
extra = [']','[','(',')','{','@','}',';',':','#','$','^','+','-',',','=','.']
print(len(stops))

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    """
    # The notebook never imports ``string`` at the top, so import it here;
    # without it the original call failed with NameError.
    import string

    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears because ``str.split()`` without
    arguments never yields empty fields.
    """
    tokens = text.split()
    return " ".join(tokens)
def remove_others(text):
    """Return ``text`` without any of the characters listed in the global ``extra``."""
    kept = [ch for ch in text if ch not in extra]
    return "".join(kept)
def lemmetize(text):
    """Lemmatize each token of ``text`` (split on single spaces).

    Every lemma is followed by one space, so the returned string always ends
    with a trailing space.
    """
    pieces = ["{} ".format(lemmatizer.lemmatize(token)) for token in text.split(' ')]
    return "".join(pieces)
def convert_number(text):
    """Spell out the all-digit tokens of ``text`` as English words.

    Tokens are obtained with ``str.split()``; those for which ``isdigit()`` is
    true are passed through the global inflect engine ``p``; all others are
    kept unchanged. Tokens are re-joined with single spaces.
    """
    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)

def remove_stopwords(text):
    """Return ``text`` without the tokens found in the global ``stops`` list.

    Tokens are split on single spaces and the survivors are joined with single
    spaces. The original only suppressed the trailing separator when the last
    token was kept, so a sentence ending in a stopword came back with a
    dangling space; joining the kept tokens avoids that.
    """
    kept = [token for token in text.split(' ') if token not in stops]
    return " ".join(kept)

In [None]:
from sklearn.manifold import TSNE
def display_closestwords_tsnescatterplot(model, word, size):
    """Scatter-plot ``word`` and its nearest neighbours after a 2-D t-SNE projection.

    Parameters
    ----------
    model : object supporting ``model.similar_by_word(word)`` and ``model[word]``.
    word : the query word; it is plotted together with its neighbours.
    size : expected dimensionality of the word vectors.

    Raises
    ------
    ValueError
        If the vectors returned by ``model`` do not have ``size`` components.
    """
    # Query word first, then each neighbour returned as a (word, score) pair.
    word_labels = [word] + [neighbour for neighbour, _score in model.similar_by_word(word)]

    # Build the matrix in one go instead of re-allocating with np.append per word.
    arr = np.array([model[label] for label in word_labels], dtype='f')
    if arr.ndim != 2 or arr.shape[1] != size:
        raise ValueError(
            "expected word vectors of dimension {}, got array of shape {}".format(size, arr.shape))

    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    plt.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad both sides: the lower bound must move *down*, otherwise the
    # smallest point falls outside the axes.
    margin = 0.0005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()

In [None]:
logger = logging.getLogger(__name__)


def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
    """Export a word2vec-format model as two TSV files.

    Writes ``<tensor_filename>_tensor.tsv`` (one tab-separated vector per
    line) and ``<tensor_filename>_metadata.tsv`` (one word per line, in the
    same order), both UTF-8 encoded.

    Parameters
    ----------
    word2vec_model_path : path of the model in word2vec format.
    tensor_filename : prefix for the two output files.
    binary : passed to ``load_word2vec_format``; True for the binary format.
    """
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'
    model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_model_path, binary=binary, unicode_errors='ignore')

    newline = gensim.utils.to_utf8('\n')
    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
        for token in model.index2word:
            file_metadata.write(gensim.utils.to_utf8(token) + newline)
            components = [str(value) for value in model[token]]
            file_vector.write(gensim.utils.to_utf8('\t'.join(components)) + newline)

    logger.info("2D tensor file saved to %s", outfiletsv)
    logger.info("Tensor metadata file saved to %s", outfiletsvmeta)

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
class SentenceIterator:
    """Re-iterable stream of tokenised lines from a text file.

    Each iteration reopens ``filepath`` and yields one list of
    whitespace-separated tokens per line, so the object can be consumed
    several times.
    """

    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        # Use a context manager so the handle is closed when the pass ends
        # (the original left one open file per iteration).
        with open(self.filepath) as handle:
            for line in handle:
                yield line.split()

In [None]:
!unzip gdrive/My\ Drive/pubmed-rct-master.zip

In [None]:
!7z e pubmed-rct-master/PubMed_200k_RCT/train.7z

In [None]:
!python -m nltk.downloader stopwords

In [None]:
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Build a lower-cased, stopword-free copy of train.txt in lowercasetext.txt.
stop_words = set(stopwords.words('english'))

# Read the whole corpus; the context manager closes the input file.
with open("train.txt") as file1:
    line = file1.read()
words = line.split()

# Open the output once instead of once per word. Append mode is kept from
# the original, so re-running this cell adds to an existing file.
with open('lowercasetext.txt', 'a') as appendFile:
    for r in tqdm_notebook(words):
        lowered = r.lower()
        # Compare the lower-cased token: the output is lower-cased, so
        # capitalised stopwords ("The") must be filtered out as well.
        if lowered not in stop_words:
            appendFile.write(" " + lowered)

In [None]:
%cd ../../

In [None]:
sentences = SentenceIterator('train.txt')

# Default Word2Vec training (CBOW, sg=0). This object must stay bound to
# `model`, because a later cell saves `model.wv` under the name 'Cbow_default'.
model = Word2Vec(sentences)

# Skip-gram variant with explicit window size and minimum word count. It gets
# its own name so it no longer silently replaces the CBOW model above.
model_sg_tuned = Word2Vec(sentences, min_count=5, workers=3, window=5, sg=1)

In [None]:
model1 = KeyedVectors.load_word2vec_format('Cbow_default')
model2 = KeyedVectors.load_word2vec_format('Skipgram_default')

In [None]:
model1.most_similar('cardiovascular',topn = 15)
model.most_similar('skin',topn = 15)

In [None]:
model.wv.save_word2vec_format('Cbow_default')

In [None]:
!cp Cbow_default gdrive/My\ Drive/

In [None]:
!cp GoogleNews-vectors-negative300.bin.gz gdrive/My\ Drive/

In [None]:
!cp gdrive/My\ Drive/lowercasetext.txt ./

In [None]:
model.wv.cosine_similarities(model['illness'], [model['disease']])[0]

In [None]:
# Label every corpus token with the anchor it is most similar to:
# 1 = 'disease', 2 = 'symptom', 3 = 'drug'.
# Anchor vectors are fetched once instead of three lookups per token.
anchor_vectors = np.array([model.wv[name] for name in ('disease', 'symptom', 'drug')])

# The context manager closes the file even if the loop fails.
with open('words_classes.txt', 'a+') as txt_file:
    for k in tqdm_notebook(words):
        # Tokens below min_count are not in the vocabulary; skip them
        # instead of aborting the loop with KeyError.
        if k not in model.wv:
            continue
        similarities = model.wv.cosine_similarities(model.wv[k], anchor_vectors)
        # argmax returns the first maximum, matching the original
        # disease > symptom > drug tie-break order.
        label = int(np.argmax(similarities)) + 1
        txt_file.write(k + " " + str(label) + "\n")

In [None]:
result = model.most_similar(positive=['neuro', 'heart'], negative=['cardiovascular'], topn=1)
print(result)

In [None]:
sentences2 = SentenceIterator('train.txt')
model2 = Word2Vec(sentences2, sg=1)

In [None]:
model2.most_similar('skin', topn=15)

In [None]:
model2.wv.save_word2vec_format('Skipgram_default')
word2vec2tensor('Skipgram_default', 'Skipgram_default', binary=False)

In [None]:
!cp Skipgram_default_* gdrive/My\ Drive/
!cp Skipgram_default gdrive/My\ Drive/

In [None]:
# Ask for 15 neighbours directly: slicing the default result with [0:15]
# can never return more than the default topn of 10.
model2.most_similar('cardiovascular', topn=15)

In [None]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove_file = datapath("/content/glove.6B.100d.txt")
tmp_file = get_tmpfile("glove_embeddings_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
model.save_word2vec_format("pretrained_glove")

In [None]:
!cp pretrained_glove gdrive/My\ Drive/

In [None]:
model_2 = Word2Vec(size=100, min_count=5)
model_2.build_vocab(sentences)
total_examples = model_2.corpus_count

In [None]:
print(total_examples)

In [None]:
model.save("pretrained_glove_model")

In [None]:
new_model = Word2Vec.load("pretrained_glove_model")

In [None]:
# Fine-tune the trainable Word2Vec model (`model_2`) starting from the GloVe
# vectors. `model` holds the read-only GloVe KeyedVectors, which have no
# build_vocab / intersect_word2vec_format / train methods.
# 1. Add the GloVe vocabulary to model_2's vocabulary.
model_2.build_vocab([list(model.vocab.keys())], update=True)
# 2. Copy the pretrained weights for shared words. This needs the
#    word2vec-format file written by glove2word2vec (`tmp_file`); the raw
#    glove.6B.100d.txt lacks the header line that format requires.
model_2.intersect_word2vec_format(tmp_file, binary=False, lockf=1.0)
# 3. Continue training on the corpus.
model_2.train(sentences, total_examples=total_examples, epochs=model_2.iter)

In [None]:
!pip install glove_python

In [None]:
from glove import Corpus, Glove

# Train 100-dimensional GloVe embeddings with a 15-word co-occurrence window.
# Creating a corpus object
corpus = Corpus() 

# Fitting the corpus generates the co-occurrence matrix which is used by GloVe
corpus.fit(sentences, window=15)
glove = Glove(no_components=100, learning_rate=0.05) 
# NOTE(review): training runs 50 epochs but the saved filename says "30e" — confirm which is intended.
glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=True)
# Attach the word -> index dictionary so words can be looked up in the saved model.
glove.add_dictionary(corpus.dictionary)
glove.save('glove_30e_15w.model')
!cp glove_30e_15w.model gdrive/My\ Drive/


In [None]:
# Same GloVe training as the 15-word-window run, but with a 5-word co-occurrence window.
corpus = Corpus() 
corpus.fit(sentences, window=5)
glove = Glove(no_components=100, learning_rate=0.05) 
# NOTE(review): training runs 50 epochs but the saved filename says "30e" — confirm which is intended.
glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=True)
# Attach the word -> index dictionary so words can be looked up in the saved model.
glove.add_dictionary(corpus.dictionary)
glove.save('glove_30e_5w.model')
!cp glove_30e_5w.model gdrive/My\ Drive/

In [None]:
from glove import Corpus, Glove
glove2 = Glove.load('glove.model')
glove2.dictionary.keys()

In [None]:
KeyedVectors.cosine_similarities(glove2.word_vectors[glove2.dictionary['cardio']], [glove2.word_vectors[glove2.dictionary['and']]])

In [None]:
glove2.word_vectors[glove2.dictionary['cardio']]

In [None]:
# Export the loaded GloVe model (`glove2`) as two TSV files:
# glove_tensor.tsv (one tab-separated vector per line) and
# glove_metadata.tsv (the matching word per line).
tensor_filename = 'glove'
outfiletsv = tensor_filename + '_tensor.tsv'
outfiletsvmeta = tensor_filename + '_metadata.tsv'

with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
    # Both files are written in the iteration order of glove2.dictionary, so line N of
    # each file refers to the same word.
    for word in glove2.dictionary.keys():
        file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
        # glove2.dictionary maps a word to its row index in glove2.word_vectors.
        vector_row = '\t'.join(str(x) for x in glove2.word_vectors[glove2.dictionary[word]])
        file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))

In [None]:
from scipy.spatial.distance import cosine
def getAverage(x,word):
    """Mean cosine distance between ``word`` and each word in ``x``.

    Vectors are looked up in the global ``model``. ``scipy``'s ``cosine`` is a
    distance (1 - cosine similarity), so smaller values mean "closer".

    Raises ZeroDivisionError when ``x`` is empty.
    """
    avg = 0.0
    for i in x:
        # A URL pasted at the end of this line made the cell a syntax error.
        avg += cosine(model[i],model[word])
    return avg/len(x)
# Anchor words for the three classes: 0 = disease, 1 = symptom, 2 = drug.
# ('dieseases' in the original list was a typo for 'diseases'.)
# NOTE(review): every anchor must exist in `model`'s vocabulary, otherwise
# getAverage raises KeyError — confirm for the capitalised/hyphenated forms.
words = [['disease','diseases','infectious-diseases','Disease'],['symptom','sign','symptoms','Symptom'], ['drug','drugs','medication']]
final = []
# gensim vectors expose their vocabulary as `index2word` (as used in
# word2vec2tensor); they have no `.words` attribute.
for word in model.index2word:
    # Use a dedicated name for the running minimum: the original reused `p`,
    # which clobbered the inflect engine needed by convert_number().
    best_distance = 100
    cur = None
    for i in range(len(words)):
        temp = getAverage(words[i], word)
        if best_distance > temp:
            cur = i
            best_distance = temp
    if cur is None:
        # No class was closer than the sentinel; report and stop.
        print(temp, i, word)
        break
    final.append([word, cur])


In [None]:
eval_list = [['tetanus', 'spasms', 'Penicillin'], ['Rabies', 'hypersalivation', 'globulin'], ['Measles', 'rash', 'Analgesic'],
             ['Asthma', 'cough', 'bronchodilator'], ['typhoid', 'fever', 'Penicillin'], ['Malaria', 'fever', 'chloroquine'],
             ['Dengue', 'fever', 'Analgesic'], ['Flu', 'cough', 'decongestant'], ['Cardiomyopathy', 'breathlessness', 'Anticoagulant'],
             ['achalasia', 'heartburn', 'Antianginal'], ['chickenpox', 'rash', 'Analgesic'], ['Cholera', 'diarrhoea', 'IV'],
             ['Tuberculosis', 'cough', 'antibiotics']]

In [None]:
# Look up the embedding of every evaluation word in each model. The result
# mirrors the nested shape of `eval_list`, with vectors in place of words.
# NOTE(review): each word must exist in the model's vocabulary with this exact
# casing, otherwise the lookup raises KeyError.

# CBOW vectors.
eval_list_cbow = [[model1[w] for w in triple] for triple in eval_list]

# Skip-gram vectors.
eval_list_skipgram = [[model2[w] for w in triple] for triple in eval_list]

# GloVe vectors: `dictionary` maps a word to its row in `word_vectors`.
# The original looked up the constant 'cardio' for every word, so every GloVe
# entry was the same vector.
eval_list_glove = [
    [glove2.word_vectors[glove2.dictionary[w]] for w in triple]
    for triple in eval_list
]

In [None]:
glove2.word_vectors[glove2.dictionary['infectious-diseases']]

In [None]:
words = [['disease','diseases','Disease'],['symptom','sign','symptoms','Symptom'], ['drug','drugs','medication']]

In [None]:
from scipy.spatial.distance import cosine


def mean_cosine_distance(vectors, word):
    """Mean cosine distance between the vector ``word`` and each of ``vectors``.

    ``scipy``'s ``cosine`` is a distance (1 - cosine similarity), so smaller
    values mean "closer". Raises ZeroDivisionError when ``vectors`` is empty.
    """
    total = 0.0
    count = 0
    for vector in vectors:
        total += cosine(vector, word)
        count += 1
    return total / count


def getAverage_glove(x,word):
    """Mean cosine distance from vector ``word`` to the GloVe vectors of the words in ``x``."""
    return mean_cosine_distance(
        [glove2.word_vectors[glove2.dictionary[i]] for i in x], word)


def getAverage_cbow(x,word):
    """Mean cosine distance from vector ``word`` to the CBOW (``model1``) vectors of the words in ``x``."""
    return mean_cosine_distance([model1[i] for i in x], word)


def getAverage_skipgram(x,word):
    """Mean cosine distance from vector ``word`` to the skip-gram (``model2``) vectors of the words in ``x``."""
    return mean_cosine_distance([model2[i] for i in x], word)

In [None]:
#evaluation metrics
def metric_p1(eval_list_emb):
    """Accumulate pairwise cosine distances between entries of ``eval_list_emb``.

    For every entry the distances of all index pairs ``i < j`` are summed, and
    the total is divided by the number of entries.

    NOTE(review): the outer loop variable is never used and ``cosine`` needs
    1-D vectors, so this only works on a flat list of vectors. The intended
    metric may be the pairwise distance *within* each triple — confirm.
    """
    mp1 = 0
    for w in eval_list_emb:
        for i in range(len(eval_list_emb)):
            for j in range(i+1,len(eval_list_emb)):
                mp1+=cosine(eval_list_emb[i],eval_list_emb[j])
    # `return mp1/=...` is not valid Python; return the quotient instead.
    return mp1/len(eval_list_emb)
  
def metric_p2(eval_list_emb, get_average=None, class_words=None):
    """Classification accuracy over (disease, symptom, drug) embedding triples.

    The vector at position ``k`` of a triple counts as a hit when class ``k``
    has the smallest average distance among all classes.

    Parameters
    ----------
    eval_list_emb : list of triples of word vectors.
    get_average : callable ``(anchor_words, vector) -> distance``, e.g.
        ``getAverage_glove``. The original stub had an empty loop body and
        could not run; without a distance function there is nothing to compute.
    class_words : anchor-word lists, one per class; defaults to the global
        ``words``.

    Returns the number of hits divided by ``3 * len(eval_list_emb)``.

    Raises
    ------
    NotImplementedError
        If ``get_average`` is not supplied.
    """
    if get_average is None:
        raise NotImplementedError(
            "metric_p2 needs a model-specific distance function; "
            "use metric_p2_glove, metric_p2_cbow or metric_p2_skipgram")
    if class_words is None:
        class_words = words
    mp2 = 0
    for triple in eval_list_emb:
        for expected, vector in enumerate(triple):
            distances = [get_average(group, vector) for group in class_words]
            # add one to mp2 for a correct classification
            if distances[expected] == min(distances):
                mp2 += 1
    return mp2/(3*len(eval_list_emb))

In [None]:
def p2_accuracy(eval_list_emb, get_average, class_words=None):
    """Shared implementation of the three ``metric_p2_*`` functions.

    Each entry of ``eval_list_emb`` is a (disease, symptom, drug) triple of
    vectors. The vector at position ``k`` is a hit when class ``k`` has the
    smallest average distance. ``get_average`` returns a cosine *distance*,
    so the best class is the minimum; the original compared against ``max``
    and therefore rewarded the farthest class.

    ``class_words`` defaults to the global ``words``. Returns hits divided by
    ``3 * len(eval_list_emb)``; raises ZeroDivisionError for an empty list.
    """
    if class_words is None:
        class_words = words
    mp2 = 0
    for l in eval_list_emb:
        for expected in range(3):
            distances = [get_average(class_words[c], l[expected]) for c in range(3)]
            # add one to mp2 for a correct classification (ties count as hits)
            if distances[expected] == min(distances):
                mp2 += 1
    return mp2/(3*len(eval_list_emb))


def metric_p2_glove(eval_list_emb):
    """Classification accuracy of the GloVe embeddings (see ``p2_accuracy``)."""
    return p2_accuracy(eval_list_emb, getAverage_glove)


def metric_p2_cbow(eval_list_emb):
    """Classification accuracy of the CBOW embeddings (see ``p2_accuracy``)."""
    return p2_accuracy(eval_list_emb, getAverage_cbow)


def metric_p2_skipgram(eval_list_emb):
    """Classification accuracy of the skip-gram embeddings (see ``p2_accuracy``)."""
    return p2_accuracy(eval_list_emb, getAverage_skipgram)

In [None]:
#NER for diseases

import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import unicodedata
 
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, Dropout, Bidirectional
 
# Defining Constants
 
# Maximum length of text sentences
MAXLEN = 180
# Number of LSTM units
LSTM_N = 150
# batch size
BS=48

In [None]:
data = pd.read_csv("train.csv", encoding="latin1")
test_data = pd.read_csv("test.csv", encoding="latin1")

In [None]:
# Vocabulary over the train and test sets. It is kept in `ner_vocab` rather
# than `words`, because `words` holds the class anchor lists that the
# metric_p2_* functions read when they are called further down.
# pd.concat replaces Series.append, which newer pandas releases no longer provide.
ner_vocab = list(set(pd.concat([data["Word"], test_data["Word"]]).values))
# Padding token; it is last, so its index is n_words - 1.
ner_vocab.append("ENDPAD")

# Converting accented characters to ASCII, e.g. 'naïve café' to 'naive cafe'.
# The result of .encode() is bytes, so the vocabulary keys are bytes objects.
ner_vocab = [unicodedata.normalize('NFKD', str(w)).encode('ascii','ignore') for w in ner_vocab]
n_words = len(ner_vocab)
print("\nLength of vocabulary = ",n_words)

tags = list(set(data["tag"].values))
n_tags = len(tags)
print("\nnumber of tags = ",n_tags)

# Creating words to indices dictionary.
word2idx = {w: i for i, w in enumerate(ner_vocab)}
# Creating tags to indices dictionary.
tag2idx = {t: i for i, t in enumerate(tags)}

In [None]:
def get_tagged_sentences(data):
    '''
    Objective: To get list of sentences along with labelled tags.
    Returns a list of lists of (word, tag) tuples, one inner list per
    "Sent_ID" group, ordered by "Sent_ID".
    Each inner list contains the words of a sentence along with their tags.
    '''
    # The docstring must be indented with the body: at column 0 it made the
    # whole definition an IndentationError.
    return [
        list(zip(group["Word"].values.tolist(), group["tag"].values.tolist()))
        for _, group in data.groupby("Sent_ID")
    ]
 
def get_test_sentences(data):
    '''
    Objective: To get list of sentences.
    Returns a list of lists of words, one inner list per "Sent_ID" group,
    ordered by "Sent_ID".
    Each inner list contains the words of a sentence.
    '''
    # The docstring must be indented with the body: at column 0 it made the
    # whole definition an IndentationError.
    return [
        group["Word"].values.tolist()
        for _, group in data.groupby("Sent_ID")
    ]
# Getting training sentences in a list
sentences = get_tagged_sentences(data)
# Getting test sentences in a list; the next cell reads `test_sentences`,
# which was never defined anywhere in the notebook.
test_sentences = get_test_sentences(test_data)

In [None]:
# Converting words to indices for training sentences (Features)
# Each item of a training sentence is a (word, tag) tuple, hence w[0].
# Converting accented characters to ASCII in train set eg. 'naïve café' to 'naive cafe'
X = [[word2idx[unicodedata.normalize('NFKD', str(w[0])).
encode('ascii','ignore')] for w in s] for s in sentences]
 
# Converting words to indices for test sentences (Features)
# Converting accented characters to ASCII in test-set eg. 'naïve café' to 'naive cafe'
X_test = [[word2idx[unicodedata.normalize('NFKD', str(w)).
encode('ascii','ignore')] for w in s] for s in test_sentences]
 
'''
Padding train and test sentences to 180 words.
Sentences of length greater than 180 words are truncated.
Sentences of length less than 180 words are padded with a high value.
'''
# The padding value n_words - 1 is the last vocabulary index.
X = pad_sequences(maxlen=MAXLEN, sequences=X, padding="post", value=n_words - 1)
X_test = pad_sequences(maxlen=MAXLEN, sequences=X_test, padding="post", value=n_words - 1)
 
# Converting tags to indices for training sentences (labels)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
# Padding tag labels to 180 words; padded positions get the index of tag "O".
y = pad_sequences(maxlen=MAXLEN, sequences=y, padding="post", value=tag2idx["O"])
 
# Making labels in one hot encoded form for DL model
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [None]:
# Build and train the BiLSTM sequence tagger.
# 180 dimensional word indices as input
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = Input(shape=(MAXLEN,))
 
# Embedding layer of same length output (180 dim embedding will be generated)
model = Embedding(input_dim=n_words, output_dim=MAXLEN, input_length=MAXLEN)(input)
 
# Adding dropout layer
model = Dropout(0.2)(model)
 
# Bidirectional LSTM to learn from both forward as well as backward context
model = Bidirectional(LSTM(units=LSTM_N, return_sequences=True, recurrent_dropout=0.1))(model)
 
# Adding a TimeDistributed Dense layer, to apply a Dense layer on each of the 180 timesteps
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model) # softmax output layer
# From here on `model` is the Keras Model, no longer an intermediate layer output.
model = Model(input, out)
 
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# 5% of the training data is held out for validation.
history = model.fit(X, np.array(y), batch_size=BS, epochs=2, validation_split=0.05, verbose=1)

In [None]:
pred = model.predict(X_test)
pred_index = np.argmax(pred, axis=-1)

In [None]:
print(metric_p2_glove(eval_list_glove))

In [None]:
print(metric_p2_cbow(eval_list_cbow))

In [None]:
print(metric_p2_skipgram(eval_list_skipgram))

Fast Text Based Model Embeddings

In [None]:
# fastText library: train unsupervised skip-gram embeddings on train_pre.txt
import fasttext
model_skip = fasttext.train_unsupervised("/content/pubmed-rct-master/PubMed_20k_RCT_numbers_replaced_with_at_sign/train_pre.txt",model='skipgram')

In [None]:
model_skip.get_nearest_neighbors("medicine",20)


In [None]:
from gensim.models.fasttext import FastText
# Gensim FastText model trained on the same corpus file.
# A stray positional `e` after the keyword argument made this call a syntax
# error; it is dropped so the model trains with gensim's default settings.
model = FastText(corpus_file="/content/pubmed-rct-master/PubMed_20k_RCT_numbers_replaced_with_at_sign/train_pre.txt")

In [None]:
# `get_nearest_neighbors` belongs to the fastText library's model, not to
# gensim's FastText; the gensim equivalent is `most_similar` on the vectors.
model.wv.most_similar('sinusitis', topn=20)

In [None]:
#Code for cosine Similarity Fast Text
from scipy.spatial.distance import cosine
def getAverage(x,word):
    """Mean cosine distance between ``word`` and each word of ``x``.

    Vectors are looked up in the global fastText model ``model_skip``.
    Raises ZeroDivisionError when ``x`` is empty.
    """
    total = 0.0
    for anchor in x:
        total = total + cosine(model_skip[anchor], model_skip[word])
    return total / len(x)
# Anchor words for the three classes: 0 = disease, 1 = drug, 2 = symptom.
words = [['disease','diseases','infectious-diseases'],['drug','medicine'],['symptom','sign']]
final = []
# Assign every vocabulary word of the fastText model to the class whose
# anchors are closest on average.
for word in model_skip.words:
    # Use a dedicated name for the running minimum: the original reused `p`,
    # which clobbered the inflect engine needed by convert_number().
    best_distance = 100
    cur = None
    for i in range(len(words)):
        temp = getAverage(words[i], word)
        if best_distance > temp:
            cur = i
            best_distance = temp
    if cur is None:
        # No class was closer than the sentinel; report and stop.
        print(temp, i, word)
        break
    final.append([word, cur])
def getD(model,a,b):
    """Cosine similarity between the vectors of words ``a`` and ``b`` in ``model``.

    ``model`` must support ``model[word]``. The original ignored this
    parameter and always read the global ``model_skip``.
    """
    return 1.0-cosine(model[a],model[b])
# Print the cosine similarity of a few probe words to each class name.
# Descriptive names replace `p` and `l`; `p` is the inflect engine that
# convert_number() depends on.
anchor_words = ["disease","drug","symptom"]
# NOTE(review): "insomania" looks like a misspelling of "insomnia" — confirm.
probe_words = ["aspirin","paracetamol","insomania","diabetes","thyroid","insulin"]
for k in probe_words:
    ans = ""
    for i in anchor_words:
        ans += "{} ".format(getD(model_skip,k,i))
    print(k,ans)

print(final)
# Google Drive is mounted at /content/gdrive earlier in the notebook, so
# /content/drive does not exist. The context manager closes the file even
# if a write fails.
with open("/content/gdrive/My Drive/labelsFastText.csv","w") as f:
    # First line: the anchor-word lists that define the class indices.
    f.write("{}\n".format(words))
    for x in final:
        f.write("{}, {}\n".format(x[0],x[1]))