Ref Notebook: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

Data: https://www.kaggle.com/c/quora-question-pairs/data

About t-SNE: https://www.datacamp.com/community/tutorials/introduction-t-sne

In [1]:
# Core Libraries and API's
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import tweepy
import re
import nltk
from gensim.models import word2vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
pd.options.mode.chained_assignment = None 

#Plot helpers
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import pylab
#Enable matplotlib to be interactive (zoom etc)
%matplotlib notebook

In [2]:
# TensorBoard SummaryWriter shared by later cells (used below to log the word embeddings).
SW = SummaryWriter()

In [3]:
# Load the training CSV and keep a random sample of 50,000 rows;
# random_state is fixed so the sample is seeded the same way on every run.
data = pd.read_csv('train.csv').sample(50000, random_state=23)

In [4]:
# Run nltk.download() first if a LookupError arises.
# Stored as a frozenset: clean_sentence tests every token against this
# collection, and set membership is O(1) where a list is a linear scan.
# NOTE(review): words() is called without a language, which appears to pull the
# stop words of every language NLTK ships, not only English -- confirm that is intended.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words())

In [5]:
def clean_sentence(val):
    '''Strip punctuation, lowercase, and drop stop words from one sentence.

    Args:
        val: raw sentence text (str).

    Returns:
        The cleaned sentence as a single space-joined string. Tokens come from
        splitting on single spaces, so runs of spaces in the input survive as
        empty tokens (extra spaces in the result).
    '''
    # Raw string: the backslash classes are invalid escape sequences in a plain literal.
    # Removes every character that is neither whitespace nor a word character,
    # and additionally removes underscores.
    stripped = re.sub(r'([^\s\w]|_)+', '', val).lower()

    # Keep the tokens that are not stop words; original order is preserved.
    kept = [word for word in stripped.split(" ") if word not in STOP_WORDS]
    return " ".join(kept)

In [6]:
def clean_dataframe(data, columns=('question1', 'question2')):
    '''Drop rows with missing values, then clean the question text columns.

    Args:
        data: DataFrame holding the text columns named in ``columns``.
        columns: names of the columns to pass through ``clean_sentence``;
            defaults to the two question columns.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    '''
    # Explicit copy so the column assignments below write to a frame we own,
    # rather than relying on the chained-assignment warning being silenced.
    cleaned = data.dropna(how="any").copy()

    for col in columns:
        cleaned[col] = cleaned[col].apply(clean_sentence)

    return cleaned

In [7]:
# Rebinds `data` to the cleaned frame (rows with NaN dropped, both question
# columns passed through clean_sentence) and previews the first five rows.
data = clean_dataframe(data)
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
237921,237921,9732,79801,sex necessary relationship,sex important good relationship,1
181001,181001,277377,277378,inspiring start stories,inspirational stories ever,0
294691,294691,150129,93109,best way digital marketing,best unique ways digital marketing,1
104145,104145,171986,171987,best way grow facebook fan page,get followers facebook page,1
357893,357893,487310,487311,suppose host sends two tcp segments back back ...,suppose host sends two tcp segments back back ...,1


In [8]:
def build_corpus(data):
    """Creates a list of lists containing words from each sentence.

    Args:
        data: DataFrame with string columns 'question1' and 'question2'.

    Returns:
        (corpus, all_words): ``corpus`` has one token list per sentence, all of
        'question1' first and then all of 'question2'; ``all_words`` is the
        same tokens flattened into one list in the same order.
    """
    all_words = []
    corpus = []
    for col in ['question1', 'question2']:
        # Iterate the Series values directly: the index is not needed, and
        # Series.iteritems() no longer exists in pandas 2.x.
        for sentence in data[col]:
            word_list = sentence.split(" ")
            all_words.extend(word_list)
            corpus.append(word_list)

    return corpus, all_words

In [9]:
# corpus: one token list per question; vocab: every token flattened into one list
# (this `vocab` is rebound from the trained model's vocabulary in a later cell).
corpus, vocab = build_corpus(data)        
# corpus[0:5]

In [10]:
# Train word2vec on the tokenised questions.
# NOTE(review): `size=` here and `wv.vocab` in later cells look like the pre-4.0
# gensim spellings -- confirm the pinned gensim version before upgrading.
model1 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)

In [14]:
# Collect the trained vocabulary and its vectors for the TensorBoard projector.
# The empty-string token (produced when a cleaned sentence is split on single
# spaces and contains consecutive spaces or is empty) is filtered out; unlike
# list.remove(''), this does not raise ValueError when '' is not in the vocabulary.
vocab = [word for word in model1.wv.vocab.keys() if word != '']
vecs = np.array([model1.wv[word] for word in vocab])
meta = list(vocab)

In [15]:
# Log the word vectors to TensorBoard, with `meta` (the vocabulary words)
# passed alongside as the second argument.
SW.add_embedding(vecs, meta)

In [None]:
# NOTE(review): called with no query word, and on the model rather than `model1.wv`;
# presumably this needs a `positive=[...]` argument to return anything -- confirm.
model1.most_similar()

In [None]:
def tsne_plot_3D(model):
    """Creates a t-SNE model of the word vectors and plots it on 3D axes.

    Every vocabulary word's vector is projected to two t-SNE components. Each
    word is drawn at (index, component 0, component 1), where index is the
    word's position in the vocabulary iteration order, and is labelled with
    the word itself.

    Args:
        model: trained word2vec model exposing ``wv.vocab`` and ``wv[word]``.
    """
    # Read vectors through model.wv, as the embedding-export cell does,
    # instead of indexing the model object directly.
    labels = list(model.wv.vocab)
    tokens = np.array([model.wv[word] for word in labels])

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(20, 20))
    ax = plt.axes(projection="3d")

    # NOTE(review): the first plotted axis is just the vocabulary index, not a
    # third t-SNE component -- confirm that is intended.
    # One scatter call per word, so points are still drawn one at a time.
    for idx, (label, value) in enumerate(zip(labels, new_values)):
        ax.scatter3D(idx, value[0], value[1])
        ax.text(idx, value[0], value[1], '%s' % str(label), size=10, zorder=1, color='k')
    plt.show()