In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import FastText
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load only the identifier, title and abstract columns of the PMC export.
df = pd.read_csv(
    '//Users/urmi/Documents/NLP/Assignment_1/pmc-data-all.csv',
    usecols=['Article_Id', 'Article_Title', 'Abstract'],
)
# Cell output: (rows, columns) of the frame that was just read.
df.shape
# Optional while experimenting: pass nrows=30000 to read_csv to load a sample.

In [None]:
#df.loc[len(df.index)] = [0,'justjustjust','A 21-year-old college student undergoes colonoscopy due to family history of multiple polyps in his older siblings. His brother underwent total proctocolectomy at age 22, and his sister underwent a total proctocolectomy at age 28, after both were found to have hundreds of colonic adenomas on colonoscopy. Both siblings are currently well without any findings of neoplasms. The patient undergoes sigmoidoscopy and is found to have dozens of small colonic polyps within rectosigmoid. Several of these are biopsied and are all benign adenomas.'] 

In [None]:
#display(df)

In [None]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna(axis=0, how='any')

In [None]:
# Renumber the rows 0..len(df)-1 and discard the old index.
df = df.reset_index(drop=True)

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
#df.loc[205935].Article_Id

In [None]:
# Article identifiers as a pandas Series (not a Python list, despite the name).
# Integer lookups such as id_list[k] go through the Series index.
id_list = df['Article_Id']

In [None]:
# Spot-check one identifier.
id_list[2]

In [None]:
# Plain Python list of the abstract texts, in row order, for easier processing.
abstract_list = df['Abstract'].tolist()

In [None]:
#Tokenizing with gensim's simple_preprocess
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted to ``str`` and passed through gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded per
    input item, in input order.
    """
    for raw_text in sentences:
        # simple_preprocess returns lowercase tokens and ignores tokens that
        # are too short or too long.
        tokens = simple_preprocess(str(raw_text), deacc=True)
        yield tokens

abstract_words = [tokens for tokens in sent_to_words(abstract_list)]

In [None]:
#Getting stopwords
stop_words = stopwords.words('english')
# Frozen copy used for the membership test: a hash lookup per token instead of
# a scan of the whole list. ``stop_words`` itself stays a list; rebuild this
# set if that list is edited afterwards.
_stop_word_set = frozenset(stop_words)

def remove_stopwords(sentence):
    """Return the tokens of ``sentence`` that are not English stop words.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens that are absent from the NLTK English stop-word list, in
        their original order.
    """
    return [word for word in sentence if word not in _stop_word_set]

abstract_questions = [remove_stopwords(question) for question in abstract_words]

In [None]:
#print(filtered_questions[2])

In [None]:
#from nltk.stem.porter import *
#stemmer = PorterStemmer()
#stem_questions = [stemmer.stem(item) for item in filtered_questions]
#stem_questions = [[stemmer.stem(word) for word in item] for item in filtered_questions]
#singles = [stemmer.stem(plural) for plural in plurals]

In [None]:
#len(filtered_questions)
#len(stem_questions)

In [None]:
#Instantiating the word2vec model
n = 300  # embedding dimensionality; also used by the FastText and centroid cells
# The model is created WITHOUT a corpus. Handing the sentences to the
# constructor makes gensim build the vocabulary and run a complete training
# pass right away, so the explicit train() call below would then train the
# same model a second time on top of that.
model = Word2Vec(size = n, window = 8)
model.build_vocab(abstract_questions)

#Training model (single, explicit pass of 10 epochs)
model.train(abstract_questions, total_examples=len(abstract_questions), epochs=10)

In [None]:
# Sanity check of the Word2Vec vectors: nearest vocabulary entries to "health".
word_vectors = model.wv
w2v_neighbours = word_vectors.most_similar(positive='health')
print('Words similar to "health" are: ', w2v_neighbours)


In [None]:
# FastText model (sg=1) with the same dimensionality and window as the
# Word2Vec model.
ft_model = FastText(
    abstract_questions,
    size=n,
    window=8,
    min_count=5,
    workers=2,
    sg=1,
)

In [None]:
# Same sanity check for the FastText vectors.
ft_neighbours = ft_model.wv.most_similar('health')
print('Words similar to "health" are: ', ft_neighbours)

In [None]:
# code for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default TF-IDF vectorizer on the raw abstract strings and print the
# shape of the resulting matrix.
# NOTE(review): X is not read again in the cells shown here; the weights used
# for the sentence embeddings come from get_tfidf() instead.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(abstract_list)
print(X.shape)

In [None]:
# scikit-learn's vectorizer expects one string per document, so the token lists
# are joined back together with single spaces.
merged_questions = [' '.join(tokens) for tokens in abstract_questions]
# One label per document ("Doc 0", "Doc 1", ...), in document order.
document_names = ['Doc {:d}'.format(doc_no) for doc_no, _ in enumerate(merged_questions)]

def get_tfidf(docs, ngram_range=(1,1), index=None):
    """Return a term-by-document table of TF-IDF weights.

    Parameters
    ----------
    docs : iterable of str
        One string per document.
    ngram_range : tuple of (int, int)
        Passed through to ``TfidfVectorizer``.
    index : sequence, optional
        Labels for the documents; they become the COLUMN labels of the result
        because the table is transposed before it is returned.

    Returns
    -------
    pandas.DataFrame
        Rows are vocabulary terms, columns are documents.

    Notes
    -----
    The sparse matrix is converted to a dense array, so memory grows with
    (number of documents) x (vocabulary size).
    """
    vect = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)
    # toarray() yields a plain ndarray (todense() yields the legacy np.matrix).
    tfidf = vect.fit_transform(docs).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back for older installations.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    return pd.DataFrame(tfidf, columns=feature_names, index=index).T

# Unigram TF-IDF table: one row per term, one "Doc i" column per abstract.
tfidf = get_tfidf(merged_questions, index=document_names)

In [None]:
#centroid function
def get_sent_embs(emb_model):
    """Build one TF-IDF-weighted centroid vector per tokenized abstract.

    For every document in the module-level ``abstract_questions`` the vectors
    of its tokens are summed, each scaled by the token's weight for that
    document in the module-level ``tfidf`` table, and the sum is divided by
    the accumulated weight.

    Parameters
    ----------
    emb_model : trained gensim model
        Must expose ``wv`` (vector lookup) and ``wv.vocab`` (trained words).

    Returns
    -------
    list of numpy.ndarray
        One flat vector of length ``n`` per document, in the order of
        ``abstract_questions``. A document without any token keeps the
        all-zero vector and its position is printed.
    """
    word_vecs = emb_model.wv
    sent_embs = []
    for desc, tokens in enumerate(abstract_questions):
        # Pull this document's column once instead of one .loc call per token.
        doc_weights = tfidf['Doc {:d}'.format(desc)]
        sent_emb = np.zeros((1, n))
        # Reset for every document so an empty one never reuses a stale divisor.
        div = 0
        for word in tokens:
            if word in word_vecs.vocab and word in doc_weights.index:
                weight = doc_weights[word]
                sent_emb = np.add(sent_emb, word_vecs[word] * weight)
                div += weight
            else:
                # Tiny weight for skipped tokens, so a document made only of
                # unknown words still ends up with a non-zero divisor.
                div += 1e-13
        if div == 0:
            # Nothing contributed: report the document and keep the zero
            # vector rather than dividing by zero (which would yield NaNs).
            print(desc)
        else:
            sent_emb = np.divide(sent_emb, div)
        sent_embs.append(sent_emb.flatten())
    return sent_embs

In [None]:
# Sentence embeddings for every abstract, built from the FastText vectors.
ft_sent = get_sent_embs(ft_model)

In [None]:
#to find similarity
def get_n_most_similar(interest_index, embeddings, n):
    """Return the nearest neighbours, by cosine distance, of one embedding.

    Parameters
    ----------
    interest_index : int
        Row position of the query document inside ``embeddings``.
    embeddings : sequence of equal-length vectors
        One vector per document.
    n : int
        Number of neighbours requested from the index. The query row itself is
        removed from the hits, so ``n - 1`` neighbours are returned.

    Returns
    -------
    (similar_indices, similar_distances)
        Two 1-D arrays ordered from nearest to farthest: row positions of the
        neighbours and their cosine distances to the query.
    """
    nbrs = NearestNeighbors(n_neighbors=n, metric='cosine').fit(embeddings)
    # Look up the query row only; asking for the neighbours of every row just
    # to read one of them does work proportional to the whole corpus squared.
    query = np.asarray(embeddings[interest_index]).reshape(1, -1)
    distances, indices = nbrs.kneighbors(query)
    distances, indices = distances[0], indices[0]

    # Remove the query by position rather than assuming it is the first hit:
    # exact duplicates or an all-zero vector can place another row ahead of it.
    keep = indices != interest_index
    if keep.all():
        # The query is not among the hits; drop the farthest one so the
        # result still holds n - 1 entries.
        keep[-1] = False
    similar_indices = indices[keep]
    similar_distances = distances[keep]
    return similar_indices, similar_distances

def print_similar(interest_index, embeddings, n, topic_no=30, run_name='file_30',
                  output_path='test_output.txt'):
    """Print the abstracts closest to one abstract and append them to a run file.

    Parameters
    ----------
    interest_index : int
        Row position of the query abstract.
    embeddings : sequence of vectors
        One embedding per abstract, in the same order as ``abstract_list``.
    n : int
        Passed to ``get_n_most_similar``.
    topic_no : int, optional
        Value written in the TOPIC_NO column.
    run_name : str, optional
        Value written in the RUN_NAME column.
    output_path : str, optional
        File the result lines are appended to.

    Each result line is ``TOPIC_NO 0 PMCID RANK SCORE RUN_NAME``. Results are
    written nearest first (rank 0 is the closest abstract) and SCORE is the
    cosine similarity ``1 - distance``, so a larger score means a closer match.
    """
    import os  # local import: only needed for the header check below

    closest_ind, closest_dist = get_n_most_similar(interest_index, embeddings, n)
    # Report the number of neighbours actually returned.
    print('Question : \t %s \n \n is most similar to these %s questions: \n' % (abstract_list[interest_index], len(closest_ind)))

    # The file is opened in append mode, so the header is written only when
    # the file is new or empty instead of once per call.
    needs_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    with open(output_path, 'a') as file:
        if needs_header:
            file.write('TOPIC_NO Q0 PMCID RANK SCORE RUN_NAME\n')
        for idx, (question, distance) in enumerate(zip(closest_ind, closest_dist)):
            score = 1.0 - distance
            # Neighbour indices are row positions, hence positional lookup.
            article_id = id_list.iloc[question]
            file.write(f'{topic_no} {0} {article_id} {idx} {score} {run_name}\n')
            print('Article_Id :', article_id,',','Score:' ,score,': ',abstract_list[question],'\n')

In [None]:
#test = ['58-year-old', 'african-american', 'woman', 'present', 'ER', 'episod', 'pressing/burn', 'anterior', 'chest', 'pain', 'began', 'two', 'day', 'earlier', 'first', 'time', 'life', 'pain', 'start', 'walk', 'radiat', 'back', 'accompani', 'nausea', 'diaphoresi', 'mild', 'dyspnea', 'increas', 'inspir', 'latest', 'episod', 'pain', 'end', 'half', 'hour', 'prior', 'arriv', 'known', 'hypertens', 'obes', 'deni', 'smoke', 'diabet', 'hypercholesterolemia', 'famili', 'histori', 'heart', 'diseas', 'current', 'take', 'medic', 'physic', 'examin', 'normal', 'ekg', 'show', 'nonspecif', 'chang']
# Look up the neighbours of the abstract at row 216864 using the FastText
# sentence embeddings (n=1000); results are printed and appended to
# test_output.txt.
print_similar(216864, ft_sent, 1000)
