In [17]:
import pandas as pd
import numpy as np
import string
from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from gensim.models.fasttext import FastText as ft
from numpy import genfromtxt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [20]:
# English stop words that top_words() leaves out of its counts.
# NOTE: this assignment rebinds the name `stopwords`, replacing the object
# imported from nltk.corpus at the top of the notebook.
stopwords = (
    "i me my myself we our ours ourselves you your yours yourself "
    "yourselves he him his himself she her hers herself it its itself "
    "they them their theirs themselves what which who whom this that these "
    "those am is are was were be been being have has had having do "
    "does did doing a an the and but if or because as until while "
    "of at by for with about against between into through during before "
    "after above below to from up down in out on off over under again "
    "further then once here there when where why how all any both each "
    "few more most other some such no nor not only own same so than "
    "too very s t can will just don should now"
).split()


def load_embeddings(words, model_path='drug-reviews/wiki/wiki.en.bin'):
    """Look up a fastText vector for every entry of ``words``.

    Parameters
    ----------
    words : iterable of sequences
        Each item is indexed with ``[0]`` to obtain the lookup key, which
        fits the ``(token, count)`` pairs returned by
        ``Counter.most_common``.
    model_path : str, optional
        Location of the fastText binary model. Defaults to the path that
        was previously hard-coded in this function.

    Returns
    -------
    numpy.ndarray
        A single flat float64 array with all vectors laid out back to
        back in input order; shape ``(0,)`` when ``words`` is empty.
        Callers that need a matrix must reshape it themselves.
    """
    en_model = ft.load_fasttext_format(model_path)
    vectors = [np.ravel(en_model[word[0]]) for word in words]
    # Concatenate once instead of calling np.append per word: np.append
    # copies the whole accumulated buffer on every call. The leading empty
    # float64 array keeps the result dtype and the empty-input result the
    # same as the np.append-based accumulation produced.
    return np.concatenate([np.array([])] + vectors)


def top_words(lines, num_words, n_grams):
    """Count the n-grams found in ``lines`` and return the most frequent.

    Parameters
    ----------
    lines : iterable
        Text lines. Items that are not ``str`` are skipped.
    num_words : int
        How many entries to return.
    n_grams : int
        Size of the n-grams requested from ``tokenize``.

    Returns
    -------
    list of (str, int)
        ``(n_gram, count)`` pairs as produced by ``Counter.most_common``.

    Notes
    -----
    An n-gram is dropped when every one of its words is a stop word
    (compared case-insensitively). For ``n_grams == 1`` this is the usual
    stop-word filter. Comparing the whole space-joined n-gram against the
    single-word stop list can never match once ``n_grams > 1``, so that
    form of the check would let pairs such as "of the" through.
    """
    stop_set = set(stopwords)  # O(1) membership instead of scanning the list
    counts = Counter()
    for line in lines:
        if not isinstance(line, str):
            # e.g. the float NaN pandas uses for an empty CSV cell
            continue
        for gram in tokenize(line, n_grams):
            # tokenize() joins the words of an n-gram with single spaces.
            if all(word.lower() in stop_set for word in gram.split(' ')):
                continue
            counts[gram] += 1
    return counts.most_common(num_words)


# Module-level English Snowball stemmer instance.
stemmer = SnowballStemmer('english')

def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text, n_grams, stem=False):
    """Split ``text`` into word n-grams.

    Characters listed in ``string.punctuation`` are removed, the remainder
    is tokenized with ``word_tokenize``, and every run of ``n_grams``
    consecutive tokens is joined with a single space.

    Parameters
    ----------
    text : str
        The text to tokenize.
    n_grams : int
        Number of tokens per n-gram.
    stem : bool, optional
        When True, tokens are passed through ``stem_tokens`` with the
        module-level ``stemmer`` before the n-grams are built. The default
        (False) builds n-grams from the unstemmed tokens, which is what
        this function has always returned; stemming used to run on every
        call with its result discarded, so it is now opt-in.

    Returns
    -------
    list of str
        The n-grams in order of appearance.
    """
    # One C-level pass that deletes every punctuation character.
    cleaned = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(cleaned)
    if stem:
        tokens = stem_tokens(tokens, stemmer)
    return [' '.join(gram) for gram in ngrams(tokens, n_grams)]
  


def ngrams(tokens, n):
    """Return each run of ``n`` consecutive items of ``tokens`` as a slice.

    The slices are listed left to right; the result is empty when
    ``tokens`` holds fewer than ``n`` items.
    """
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]


def plot_with_labels(low_dim_embs, labels, filename='tsne-bi.png'):
    """Scatter-plot one point per label, annotate it, and save the figure.

    The first ``len(labels)`` rows of ``low_dim_embs`` are used; each row
    is unpacked into an ``(x, y)`` pair. The figure is written to
    ``filename`` with ``plt.savefig``.

    Raises AssertionError when there are more labels than embedding rows.
    """
    row_count = low_dim_embs.shape[0]
    assert len(labels) <= row_count, "More labels than embeddings"

    # Text placement shared by every annotation.
    annotation_style = {
        'xytext': (5, 2),
        'textcoords': 'offset points',
        'ha': 'right',
        'va': 'bottom',
    }

    plt.figure(figsize=(18, 18))  # in inches
    # zip stops once the labels run out, so surplus embedding rows are ignored.
    for caption, point in zip(labels, low_dim_embs):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(caption, xy=(x, y), **annotation_style)
    plt.savefig(filename)



In [None]:
# main
# read raw data
raw_data = pd.read_csv('full_merge.csv')

# extract review lines; rows whose review cell is empty (NaN) are dropped
reviews = raw_data['Reviews'].dropna()

# process and extract top n-grams
# The result is bound to `top_bigrams` rather than `top_words`, so the
# top_words() function is not overwritten and this cell can be re-run.
top_bigrams = top_words(reviews, 100, 2)
print(top_bigrams)

# Each artifact has a single path that is used for both the write and the
# read-back below, so the two can never drift apart.
labels_path = 'drug-reviews/top_words_n-grams.txt'
embeddings_path = 'drug-reviews/bigram-embeddings'

with open(labels_path, mode='w', encoding='utf-8') as myfile:
    myfile.write('\n'.join([w[0] for w in top_bigrams]))

# for words in our vocabulary we load the pre-trained embeddings from wikipedia data provided by fasttext
embeddings = load_embeddings(top_bigrams)
np.savetxt(embeddings_path, embeddings, delimiter=',')

# read the labels and embeddings back from disk
with open(labels_path, mode='r', encoding='utf-8') as myfile:
    # strip the line terminator so it does not end up in the plot annotations
    labels = [line.rstrip('\n') for line in myfile]
embeddings = genfromtxt(embeddings_path)
# one row per label; the vector width is inferred instead of hard-coding 100 x 300
embeddings = embeddings.reshape(len(labels), -1)
print(np.shape(embeddings))

# apply t-SNE (seeded so repeated runs give the same layout)
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne = TSNE(perplexity=10.0, n_components=2, init='pca', n_iter=5000, random_state=42)
low_dim_embedding = tsne.fit_transform(embeddings)

#plot
plot_with_labels(low_dim_embedding, labels)