Ref Notebook: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

Data: https://www.kaggle.com/c/quora-question-pairs/data

About t-SNE: https://www.datacamp.com/community/tutorials/introduction-t-sne

In [1]:
# Core Libraries and API's
import tweepy
import re
import nltk
from gensim.models import word2vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
pd.options.mode.chained_assignment = None 

#Plot helpers
import matplotlib
import matplotlib.pyplot as plt
#Enable matplotlib to be interactive (zoom etc)
%matplotlib notebook

In [2]:
# Load the training CSV (see the "Data" link at the top of the notebook) and keep a
# reproducible random sample of 50,000 rows; random_state pins which rows are drawn.
data = pd.read_csv('train.csv').sample(50000, random_state=23)

In [3]:
# Run nltk.download('stopwords') first if a LookupError arises.
# words() is called without a language, so the stop words of every language shipped
# with the corpus are included.
# Stored as a frozenset: cleaning tests every word of every question for membership,
# which is a constant-time lookup on a set but a full scan on a list.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words())

In [4]:
# Runs of characters that are neither whitespace nor word characters, plus underscores.
# Raw string so that \s and \w reach the regex engine without invalid-escape warnings.
_NON_WORD_RE = re.compile(r'([^\s\w]|_)+')


def clean_sentence(val, stop_words=None):
    '''Strip punctuation, lowercase, and drop stop words from one sentence.

    Parameters
    ----------
    val : str
        Raw sentence text.
    stop_words : container of str, optional
        Words to discard. Defaults to the notebook-level STOP_WORDS. Any
        container supporting ``in`` works; a set/frozenset is fastest.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces (an empty
        string when nothing is left).
    '''
    if stop_words is None:
        stop_words = STOP_WORDS

    # split() without an argument collapses runs of whitespace, so removing
    # punctuation such as " - " cannot leave empty-string "words" behind.
    words = _NON_WORD_RE.sub('', val).lower().split()

    # Single filtering pass instead of repeated list.remove() calls (quadratic).
    return " ".join(word for word in words if word not in stop_words)

In [5]:
def clean_dataframe(data):
    '''Return a NaN-free frame whose question columns have been cleaned.

    Rows containing a NaN in any column are dropped first; afterwards
    'question1' and 'question2' are each passed value-by-value through
    clean_sentence. The frame passed in is left untouched.
    '''
    complete_rows = data.dropna(how="any")
    cleaned_columns = {
        name: complete_rows[name].apply(clean_sentence)
        for name in ('question1', 'question2')
    }
    # assign() swaps the two columns in place (same position) on a new frame.
    return complete_rows.assign(**cleaned_columns)

In [6]:
# Rebind `data` to its cleaned version (NaN rows dropped, questions normalised),
# then preview the first five rows.
data = clean_dataframe(data)
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
237921,237921,9732,79801,sex necessary relationship,sex important good relationship,1
181001,181001,277377,277378,inspiring start stories,inspirational stories ever,0
294691,294691,150129,93109,best way digital marketing,best unique ways digital marketing,1
104145,104145,171986,171987,best way grow facebook fan page,get followers facebook page,1
357893,357893,487310,487311,suppose host sends two tcp segments back back ...,suppose host sends two tcp segments back back ...,1


In [7]:
def build_corpus(data):
    """Create a list of word lists, one per question, for word2vec training.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string columns 'question1' and 'question2'.

    Returns
    -------
    list of list of str
        Tokens of every 'question1' value followed by those of every
        'question2' value, each in row order. A question with no words
        yields an empty list.
    """
    # Iterate the Series values directly: Series.iteritems() was removed in
    # pandas 2.0. split() without an argument discards empty-string tokens
    # caused by repeated spaces, so they never enter the vocabulary.
    return [
        sentence.split()
        for col in ('question1', 'question2')
        for sentence in data[col]
    ]

In [8]:
# Tokenise both question columns into one list of word lists, then show the first two.
corpus = build_corpus(data)        
corpus[0:2]

[['sex', 'necessary', 'relationship'], ['inspiring', 'start', 'stories']]

In [9]:
# Train 100-dimensional word2vec embeddings with a 20-word context window on 4 worker
# threads; min_count=200 keeps only words that occur at least 200 times in the corpus.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it `vector_size`) —
# confirm the gensim version this notebook is pinned to.
model1 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)

In [10]:
# Inspect the learned embedding vector for a single word.
model1.wv['interview']

array([ 3.90317857e-01,  3.40679944e-01, -4.91676442e-02, -6.76604360e-02,
        2.00811140e-02,  1.10612118e+00,  2.91094840e-01,  7.53072977e-01,
        2.26995260e-01,  3.86490166e-01,  4.73979145e-01,  7.01414049e-02,
        1.19569421e-01,  8.23097587e-01, -1.34485078e+00,  5.36491096e-01,
       -8.08108822e-02,  4.30619955e-01, -5.31930447e-01,  4.77621257e-01,
       -3.22931141e-01, -5.80204360e-04,  2.35237226e-01,  1.12806082e-01,
       -1.76668203e+00,  1.21446478e+00,  2.56464958e-01, -6.38456941e-02,
        1.84952453e-01, -4.37417805e-01,  1.26765490e-01,  5.13811827e-01,
        6.51450872e-01,  7.62969255e-01, -1.24197155e-01, -2.85319984e-01,
        6.07117534e-01,  2.99065560e-01,  1.86426312e-01,  2.66335607e-01,
       -1.21987605e+00, -4.88827735e-01,  4.28728759e-01, -2.07771778e-01,
       -2.59910256e-01, -5.77127188e-02, -1.98839188e-01,  3.40476662e-01,
       -3.64186540e-02,  1.19228385e-01,  1.89657137e-01, -6.79841876e-01,
       -2.65309960e-01, -

In [11]:
def tsne_plot(model):
    """Project a word2vec model's vocabulary to 2-D with t-SNE and plot it.

    Every word in the model's vocabulary becomes one labelled point on a
    10x10 inch scatter plot.

    Parameters
    ----------
    model : gensim.models.word2vec.Word2Vec
        Trained model; only its ``wv`` keyed vectors are read.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    keyed_vectors = model.wv

    # gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3.x as `vocab`.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab
    labels = list(vocabulary)

    # Index through `.wv` (indexing the model object itself is deprecated) with the
    # whole word list at once: this returns one stacked 2-D array, which TSNE needs
    # because recent scikit-learn versions reject a plain Python list of vectors.
    tokens = keyed_vectors[labels]

    # NOTE(review): scikit-learn >= 1.5 renames `n_iter` to `max_iter` — confirm
    # against the installed version before upgrading.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(10, 10))
    # One scatter call per word keeps matplotlib's colour cycling across points.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [12]:
# Plot the 2-D t-SNE projection of model1's vocabulary.
tsne_plot(model1)

  import sys


<IPython.core.display.Javascript object>

In [16]:
# A more selective model: min_count is raised from 200 to 500, so only words that
# occur at least 500 times are kept and fewer points are plotted.
model2 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=500, workers=4)
tsne_plot(model2)

  import sys


<IPython.core.display.Javascript object>

In [17]:
# A less selective model: min_count is lowered from 200 to 100, so words that occur
# at least 100 times are kept and more points are plotted.
model3 = word2vec.Word2Vec(corpus, size=100, window=20, min_count=100, workers=4)
tsne_plot(model3)

  import sys


<IPython.core.display.Javascript object>

In [18]:
# Compare the nearest neighbours of 'interview' in the default model (min_count=200)
# and the less selective one (min_count=100).
# Query through `.wv`: calling most_similar on the model object itself is deprecated.
model1.wv.most_similar('interview'), model3.wv.most_similar('interview')

  """Entry point for launching an IPython kernel.


([('tips', 0.8946440815925598),
  ('process', 0.8723848462104797),
  ('job', 0.8216250538825989),
  ('asked', 0.8036422729492188),
  ('making', 0.7582869529724121),
  ('answer', 0.6971772313117981),
  ('questions', 0.6931705474853516),
  ('answers', 0.6881176233291626),
  ('question', 0.659483790397644),
  ('post', 0.632351279258728)],
 [('tips', 0.8800058364868164),
  ('process', 0.8792097568511963),
  ('job', 0.8250908851623535),
  ('internship', 0.8083852529525757),
  ('asked', 0.7498068809509277),
  ('hire', 0.7488759756088257),
  ('offer', 0.7478092908859253),
  ('kind', 0.7474936246871948),
  ('expect', 0.7128331661224365),
  ('making', 0.7024226784706116)])

In [None]:
def tsne_plot_3D(model):
    """Project a word2vec model's vocabulary to 3-D with t-SNE and plot it.

    Every word in the model's vocabulary becomes one labelled point in a
    3-D scatter plot on a 10x10 inch figure.

    Parameters
    ----------
    model : gensim.models.word2vec.Word2Vec
        Trained model; only its ``wv`` keyed vectors are read.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Importing Axes3D registers the '3d' projection on older matplotlib
    # releases; newer releases register it automatically.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    keyed_vectors = model.wv

    # gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3.x as `vocab`.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab
    labels = list(vocabulary)

    # Index through `.wv` with the whole word list at once to get one stacked
    # 2-D array, which recent scikit-learn versions require for fit_transform.
    tokens = keyed_vectors[labels]

    # Three components, so each word gets a real z coordinate.
    # NOTE(review): scikit-learn >= 1.5 renames `n_iter` to `max_iter` — confirm
    # against the installed version before upgrading.
    tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    for label, (x, y, z) in zip(labels, new_values):
        ax.scatter(x, y, z)
        # annotate() is 2-D only; text() places the label at the 3-D point.
        ax.text(x, y, z, label, ha='right', va='bottom')
    plt.show()