Ref Notebook: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

Data: https://www.kaggle.com/c/quora-question-pairs/data

About t-SNE: https://www.datacamp.com/community/tutorials/introduction-t-sne

In [2]:
# Core Libraries and API's
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import tweepy
import re
import nltk
from gensim.models import word2vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
pd.options.mode.chained_assignment = None 

#Plot helpers
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import pylab
#Enable matplotlib to be interactive (zoom etc)
%matplotlib notebook

In [3]:
SW = SummaryWriter()

In [4]:
data = pd.read_csv('train.csv').sample(50000, random_state=23)

In [5]:
#nltk.download() if LookupError arises
STOP_WORDS = nltk.corpus.stopwords.words()

In [6]:
def clean_sentence(val):
    '''remove chars that are not letters or numbers, downcase, then remove stop words'''
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
            
    sentence = " ".join(sentence)
    return sentence

In [7]:
def clean_dataframe(data):
    '''drop nans, then apply 'clean_sentence' function to question1 and 2'''
    data = data.dropna(how="any")
    
    for col in ['question1', 'question2']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

In [8]:
data = clean_dataframe(data)
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
237921,237921,9732,79801,sex necessary relationship,sex important good relationship,1
181001,181001,277377,277378,inspiring start stories,inspirational stories ever,0
294691,294691,150129,93109,best way digital marketing,best unique ways digital marketing,1
104145,104145,171986,171987,best way grow facebook fan page,get followers facebook page,1
357893,357893,487310,487311,suppose host sends two tcp segments back back ...,suppose host sends two tcp segments back back ...,1


In [9]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for col in ['question1', 'question2']:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)
            
    return corpus

In [10]:
corpus = build_corpus(data)        
corpus[0:2]

[['sex', 'necessary', 'relationship'], ['inspiring', 'start', 'stories']]

In [11]:
model1 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)

In [25]:
model1.vocabulary.

<gensim.models.word2vec.Word2VecVocab at 0x7f6dd3334358>

In [14]:
vocab = []
for c in corpus:
    for v in c:
        vocab.append(v)

vocab = list(set(vocab))

In [20]:
# vecs, meta = [], []
vecs = [model1.wv[x] for x in vocab]
meta = [x for x in vocab]

KeyError: "word 'radiohead' not in vocabulary"

In [26]:
model1.wv['radiohead']

KeyError: "word 'radiohead' not in vocabulary"

In [None]:
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(10, 10)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
tsne_plot(model1)

In [None]:
# A more selective model
model2 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=500, workers=4)
tsne_plot(model2)

In [None]:
# A less selective model
model3 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=100, workers=4)
tsne_plot(model3)

In [None]:
model1.most_similar('interview'),model3.most_similar('interview')

In [None]:
def tsne_plot_3D(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    z = []
    for value,idx in zip(new_values,range(len(new_values))):
        x.append(value[0])
        y.append(value[1])
        z.append(idx)
        
    plt.figure(figsize=(20, 20)) 
    ax = plt.axes(projection="3d")
    
    for i in range(len(x)):
        ax.scatter3D(z[i],x[i],y[i])
        ax.text(z[i],x[i],y[i], '%s' % str(labels[i]),size=10, zorder=1,  color='k')
        '''pylab.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(2, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')'''
    plt.show()

In [None]:
tsne_plot_3D(model1)

In [None]:
tsne_plot_3D(model2)