# Word2Vec tutorial

## Importing libraries

In [58]:
import re # For pre-processing
import pandas as pd# For data handling
from time import time # To time operations
from collections import defaultdict # For word frequency
import spacy # For pre-processing
import logging  # To monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.naive_bayes import ComplementNB

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit

import numpy as np

In [10]:
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [5]:
df.head(10)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,Martin Prince,I don't think there's anything left to say.
6,Edna Krabappel-Flanders,Bart?
7,Bart Simpson,Victory party under the slide!
8,,
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!


In [6]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

## Difference between reset_index() and reset_index(drop=True)

In [11]:
df = df.dropna().reset_index()
df.head(10)

Unnamed: 0,index,raw_character_text,spoken_words
0,0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,1,Lisa Simpson,Where's Mr. Bergstrom?
2,2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,3,Lisa Simpson,That life is worth living.
4,4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,5,Martin Prince,I don't think there's anything left to say.
6,6,Edna Krabappel-Flanders,Bart?
7,7,Bart Simpson,Victory party under the slide!
8,9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
9,10,Landlady,"Hey, hey, he Moved out this morning. He must h..."


In [15]:
df = pd.read_csv('simpsons_dataset.csv')

df = df.dropna().reset_index(drop=True)
df.head(10)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,Martin Prince,I don't think there's anything left to say.
6,Edna Krabappel-Flanders,Bart?
7,Bart Simpson,Victory party under the slide!
8,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
9,Landlady,"Hey, hey, he Moved out this morning. He must h..."


In [16]:
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [18]:
nl = spacy.load('en',disable=['ner','parser'])

def cleaning(doc):
    text = [token.lemma_ for token in doc if not token.is_stop]
    if len(text) > 2:
        return ' '.join(text)

In [19]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

t = time()

text = [cleaning(doc) for doc in nl.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.41 mins


In [20]:
df_clean = pd.DataFrame({'clean':text})
df_clean.head(10)

Unnamed: 0,clean
0,actually little disease magazine news show be ...
1,be mr bergstrom
2,not know would sure like talk not touch lesson...
3,life worth live
4,poll open end recess case decide thought will ...
5,not think be leave
6,
7,victory party slide
8,mr bergstrom mr bergstrom
9,hey hey move morning new job take copernicus c...


In [21]:
df_clean.isnull().sum()

clean    38133
dtype: int64

In [23]:
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(92412, 1)

## The main reason to use bigrams is to catch words like "mr_burns" or "bart_simpson" !



In [29]:
from gensim.models.phrases import Phrases, Phraser

# As Phrases() takes a list of list of words as input:

sent = [row.split() for row in df_clean['clean']]

INFO - 20:55:53: 'pattern' package not found; tag filters are not available for English


In [30]:
sent[0]

['actually',
 'little',
 'disease',
 'magazine',
 'news',
 'show',
 'be',
 'natural',
 'think']

In [32]:
phrases = Phrases(sent, min_count=30, progress_per=10000)


INFO - 21:01:07: collecting all words and their counts
INFO - 21:01:07: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 21:01:07: PROGRESS: at sentence #10000, processed 67396 words and 50551 word types
INFO - 21:01:07: PROGRESS: at sentence #20000, processed 140465 words and 95808 word types
INFO - 21:01:07: PROGRESS: at sentence #30000, processed 207950 words and 132011 word types
INFO - 21:01:08: PROGRESS: at sentence #40000, processed 270207 words and 164407 word types
INFO - 21:01:08: PROGRESS: at sentence #50000, processed 334085 words and 196195 word types
INFO - 21:01:08: PROGRESS: at sentence #60000, processed 400877 words and 228659 word types
INFO - 21:01:08: PROGRESS: at sentence #70000, processed 467802 words and 260712 word types
INFO - 21:01:08: PROGRESS: at sentence #80000, processed 534361 words and 292095 word types
INFO - 21:01:08: PROGRESS: at sentence #90000, processed 602037 words and 321944 word types
INFO - 21:01:08: collected 328658 word typ

## The goal of Phraser() is to cut down memory consumption of Phrases(), by discarding model state not strictly needed for the bigram detection task:

In [33]:
bigram = Phraser(phrases)

INFO - 21:05:12: source_vocab length 328658
INFO - 21:05:15: Phraser built with 127 phrasegrams


## Transform the corpus based on the bigrams detected:


In [34]:
sentences = bigram[sent]

In [35]:
sentences[0]

['actually',
 'little',
 'disease',
 'magazine',
 'news',
 'show',
 'be',
 'natural',
 'think']

In [36]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

29673

In [37]:
word_freq

defaultdict(int,
            {'actually': 432,
             'little': 2130,
             'disease': 46,
             'magazine': 125,
             'news': 252,
             'show': 213,
             'be': 35473,
             'natural': 78,
             'think': 3765,
             'mr': 808,
             'bergstrom': 18,
             'not': 14093,
             'know': 4998,
             'would': 1208,
             'sure': 1250,
             'like': 5322,
             'talk': 987,
             'touch': 197,
             'lesson': 165,
             'plan': 309,
             'teach': 329,
             'life': 1244,
             'worth': 144,
             'live': 762,
             'poll': 20,
             'open': 432,
             'end': 469,
             'recess': 11,
             'case': 220,
             'decide': 136,
             'thought': 119,
             'will': 6281,
             'final': 108,
             'statement': 21,
             'martin': 121,
             'leave': 1061,
  

In [39]:
sorted(word_freq, key=word_freq.get)[:10]


['leland',
 'promontory',
 'parkville',
 'mopey',
 'doover',
 'mixology',
 'planter',
 'cassis',
 'ssslurr',
 'shpeech']

In [40]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]


['be', 'not', 'oh', 'will', 'like', "'s", 'know', 'think', 'hey', 'good']

## Training the word2vec

In [41]:
import multiprocessing
from gensim.models import Word2Vec

In [43]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

8


# The parameters:
## min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)
## window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the left of our target - (2, 10)
## size = int - Dimensionality of the feature vectors. - (50, 300)
## sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial. - (0, 1e-5)
## alpha = float - The initial learning rate - (0.01, 0.05)
## min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
## negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used. - (5, 20)
## workers = int - Use these many worker threads to train the model (=faster training with multicore machines)

In [44]:
w2v_model = Word2Vec(min_count=20, 
                     window=2, 
                     size=300, 
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [45]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 21:28:31: collecting all words and their counts
INFO - 21:28:31: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 21:28:31: PROGRESS: at sentence #10000, processed 65193 words, keeping 9096 word types
INFO - 21:28:32: PROGRESS: at sentence #20000, processed 136024 words, keeping 13916 word types
INFO - 21:28:32: PROGRESS: at sentence #30000, processed 201577 words, keeping 16865 word types
INFO - 21:28:32: PROGRESS: at sentence #40000, processed 262082 words, keeping 19506 word types
INFO - 21:28:32: PROGRESS: at sentence #50000, processed 324069 words, keeping 21758 word types
INFO - 21:28:32: PROGRESS: at sentence #60000, processed 388895 words, keeping 23910 word types
INFO - 21:28:33: PROGRESS: at sentence #70000, processed 454042 words, keeping 25876 word types
INFO - 21:28:33: PROGRESS: at sentence #80000, processed 518929 words, keeping 27769 word types
INFO - 21:28:33: PROGRESS: at sentence #90000, processed 584755 words, keeping 29345 word types


Time to build vocab: 0.05 mins


## Training of the model:

## Parameters of the training:

## total_examples = int - Count of sentences;
## epochs = int - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [46]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 21:29:40: training model with 7 workers on 3375 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 21:29:41: EPOCH 1 - PROGRESS: at 37.39% examples, 82785 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:42: EPOCH 1 - PROGRESS: at 66.81% examples, 71346 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:42: worker thread finished; awaiting finish of 6 more threads
INFO - 21:29:42: worker thread finished; awaiting finish of 5 more threads
INFO - 21:29:42: worker thread finished; awaiting finish of 4 more threads
INFO - 21:29:42: worker thread finished; awaiting finish of 3 more threads
INFO - 21:29:42: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:42: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:42: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:42: EPOCH - 1 : training on 601119 raw words (219268 effective words) took 2.8s, 76947 effective words/s
INFO - 21:29:43: EPOCH 2 - PROG

INFO - 21:30:09: EPOCH 10 - PROGRESS: at 68.49% examples, 72815 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:10: worker thread finished; awaiting finish of 6 more threads
INFO - 21:30:10: worker thread finished; awaiting finish of 5 more threads
INFO - 21:30:10: worker thread finished; awaiting finish of 4 more threads
INFO - 21:30:10: worker thread finished; awaiting finish of 3 more threads
INFO - 21:30:10: worker thread finished; awaiting finish of 2 more threads
INFO - 21:30:10: worker thread finished; awaiting finish of 1 more threads
INFO - 21:30:10: worker thread finished; awaiting finish of 0 more threads
INFO - 21:30:10: EPOCH - 10 : training on 601119 raw words (219194 effective words) took 2.8s, 79422 effective words/s
INFO - 21:30:11: EPOCH 11 - PROGRESS: at 42.87% examples, 91338 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:12: EPOCH 11 - PROGRESS: at 86.72% examples, 92369 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:12: worker thread finished; awaiting finish of 6 mor

INFO - 21:30:35: EPOCH - 19 : training on 601119 raw words (219133 effective words) took 2.9s, 75832 effective words/s
INFO - 21:30:36: EPOCH 20 - PROGRESS: at 39.12% examples, 85505 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:37: EPOCH 20 - PROGRESS: at 80.01% examples, 85963 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:37: worker thread finished; awaiting finish of 6 more threads
INFO - 21:30:37: worker thread finished; awaiting finish of 5 more threads
INFO - 21:30:37: worker thread finished; awaiting finish of 4 more threads
INFO - 21:30:37: worker thread finished; awaiting finish of 3 more threads
INFO - 21:30:37: worker thread finished; awaiting finish of 2 more threads
INFO - 21:30:37: worker thread finished; awaiting finish of 1 more threads
INFO - 21:30:37: worker thread finished; awaiting finish of 0 more threads
INFO - 21:30:38: EPOCH - 20 : training on 601119 raw words (219339 effective words) took 2.5s, 86091 effective words/s
INFO - 21:30:39: EPOCH 21 - PROGRESS: at 39

INFO - 21:31:02: worker thread finished; awaiting finish of 1 more threads
INFO - 21:31:02: worker thread finished; awaiting finish of 0 more threads
INFO - 21:31:02: EPOCH - 29 : training on 601119 raw words (219058 effective words) took 2.8s, 77158 effective words/s
INFO - 21:31:03: EPOCH 30 - PROGRESS: at 35.62% examples, 78451 words/s, in_qsize 0, out_qsize 0
INFO - 21:31:04: EPOCH 30 - PROGRESS: at 63.43% examples, 68119 words/s, in_qsize 1, out_qsize 0
INFO - 21:31:05: worker thread finished; awaiting finish of 6 more threads
INFO - 21:31:05: worker thread finished; awaiting finish of 5 more threads
INFO - 21:31:05: worker thread finished; awaiting finish of 4 more threads
INFO - 21:31:05: worker thread finished; awaiting finish of 3 more threads
INFO - 21:31:05: worker thread finished; awaiting finish of 2 more threads
INFO - 21:31:05: worker thread finished; awaiting finish of 1 more threads
INFO - 21:31:05: worker thread finished; awaiting finish of 0 more threads
INFO - 21:31

Time to train the model: 1.43 mins


## As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient:

In [47]:
w2v_model.init_sims(replace=True)

INFO - 21:32:42: precomputing L2-norms of word weight vectors


# Word2Vec on Reddit comments (ML Course)

In [3]:
# IPython-related imports
from IPython.display import HTML

import nltk

import gensim


# Removing special charecters and tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/swetha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
train = pd.read_pickle('../../kaggle_compettition/ift3395-ift6390-reddit-comments/data_train.pkl')

In [5]:
from tqdm import tqdm, trange
import string
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmtizer = WordNetLemmatizer()

trset = []
for i in trange(len(train[0])):
    trset.append(train[0][i])
    trset[i] = trset[i].lower()
    trset[i] = tokenizer.tokenize(trset[i])
    trset[i] = [i for i in trset[i] if not i in stop_words]
    trset[i] = [lemmtizer.lemmatize(w) for w in trset[i]]
    
    trset[i] = " ".join(trset[i])

[nltk_data] Downloading package wordnet to /home/swetha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 70000/70000 [00:09<00:00, 7342.98it/s]


In [6]:
from gensim.models.phrases import Phrases, Phraser

# As Phrases() takes a list of list of words as input:

sent = [row.split() for row in trset]

In [22]:
sent[0]

['honestly',
 'buffalo',
 'correct',
 'answer',
 'remember',
 'people',
 'somewhat',
 'joking',
 'buffalo',
 'mantra',
 'starting',
 'goalie',
 'win',
 'game',
 'get',
 'traded',
 'think',
 'edmonton',
 'front',
 'office',
 'travesty',
 'better',
 'part',
 '10',
 'year',
 'buffalo',
 'systematic',
 'destruction',
 'term',
 'competitive',
 'much',
 'responsible',
 'change',
 'draft',
 'lottery']

In [7]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 11:19:42: collecting all words and their counts
INFO - 11:19:42: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 11:19:42: PROGRESS: at sentence #10000, processed 223948 words and 195676 word types
INFO - 11:19:42: PROGRESS: at sentence #20000, processed 454311 words and 359352 word types
INFO - 11:19:43: PROGRESS: at sentence #30000, processed 680045 words and 508758 word types
INFO - 11:19:43: PROGRESS: at sentence #40000, processed 912738 words and 653432 word types
INFO - 11:19:44: PROGRESS: at sentence #50000, processed 1143566 words and 789416 word types
INFO - 11:19:44: PROGRESS: at sentence #60000, processed 1369541 words and 919140 word types
INFO - 11:19:45: collected 1042724 word types from a corpus of 1593639 words (unigram + bigrams) and 70000 sentences
INFO - 11:19:45: using 1042724 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [8]:
bigram = Phraser(phrases)

INFO - 11:19:45: source_vocab length 1042724
INFO - 11:19:55: Phraser built with 873 phrasegrams


In [9]:
sentences = bigram[sent]

In [27]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

68560

In [28]:
word_freq

defaultdict(int,
            {'honestly': 695,
             'buffalo': 70,
             'correct': 369,
             'answer': 495,
             'remember': 1145,
             'people': 9163,
             'somewhat': 157,
             'joking': 62,
             'mantra': 7,
             'starting': 432,
             'goalie': 126,
             'win': 1442,
             'game': 5494,
             'get': 9494,
             'traded': 158,
             'think': 8040,
             'edmonton': 48,
             'front_office': 40,
             'travesty': 10,
             'better': 3379,
             'part': 1874,
             '10_year': 168,
             'systematic': 7,
             'destruction': 54,
             'term': 564,
             'competitive': 273,
             'much': 4072,
             'responsible': 123,
             'change': 1360,
             'draft': 214,
             'lottery': 45,
             'ah': 209,
             'yes': 1532,
             'way': 4439,
             'c

In [29]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['like',
 'one',
 'would',
 'get',
 'people',
 'think',
 'gt',
 'time',
 'really',
 'know']

In [30]:
import multiprocessing

from gensim.models import Word2Vec

## min_count=20,window=2,size=300,sample=6e-5,alpha=0.03,min_alpha=0.0007,             negative=20,workers=cores-1,epochs = 50

In [32]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [35]:
w2v_model.init_sims(replace=True)

INFO - 11:01:56: precomputing L2-norms of word weight vectors


In [36]:
w2v_model.wv.most_similar(positive=["honestly"])

[('really', 0.4021736681461334),
 ('think', 0.3312563896179199),
 ('much', 0.31276416778564453),
 ('impactful', 0.31093668937683105),
 ('actually', 0.30668026208877563),
 ('though', 0.29392924904823303),
 ('probably', 0.27342164516448975),
 ('anyone', 0.2730477452278137),
 ('would', 0.271810919046402),
 ('delusional', 0.2530396580696106)]

In [38]:
w2v_model.wv.doesnt_match(["honestly", "person", "people"])

'honestly'

In [39]:
w2v_model.wv.similarity("people", 'person')

0.50759155

In [40]:
w2v_model = Word2Vec(min_count=10,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=40, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 11:14:30: collecting all words and their counts
INFO - 11:14:30: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:14:31: PROGRESS: at sentence #10000, processed 215073 words, keeping 24562 word types
INFO - 11:14:32: PROGRESS: at sentence #20000, processed 435130 words, keeping 35421 word types
INFO - 11:14:32: PROGRESS: at sentence #30000, processed 651529 words, keeping 43839 word types
INFO - 11:14:33: PROGRESS: at sentence #40000, processed 874326 words, keeping 51146 word types
INFO - 11:14:34: PROGRESS: at sentence #50000, processed 1094880 words, keeping 57292 word types
INFO - 11:14:35: PROGRESS: at sentence #60000, processed 1311363 words, keeping 63296 word types
INFO - 11:14:36: collected 68560 word types from a corpus of 1525906 raw words and 70000 sentences
INFO - 11:14:36: Loading a fresh vocabulary
INFO - 11:14:36: effective_min_count=10 retains 12459 unique words (18% of original 68560, drops 56101)
INFO - 11:14:36: effective_min_count=

Time to build vocab: 0.16 mins


INFO - 11:14:41: EPOCH 1 - PROGRESS: at 12.60% examples, 107162 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:42: EPOCH 1 - PROGRESS: at 26.25% examples, 111446 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:43: EPOCH 1 - PROGRESS: at 39.86% examples, 111870 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:44: EPOCH 1 - PROGRESS: at 53.44% examples, 112697 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:45: EPOCH 1 - PROGRESS: at 66.17% examples, 112899 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:46: EPOCH 1 - PROGRESS: at 79.89% examples, 113266 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:47: EPOCH 1 - PROGRESS: at 94.08% examples, 113838 words/s, in_qsize 0, out_qsize 0
INFO - 11:14:47: worker thread finished; awaiting finish of 6 more threads
INFO - 11:14:47: worker thread finished; awaiting finish of 5 more threads
INFO - 11:14:47: worker thread finished; awaiting finish of 4 more threads
INFO - 11:14:47: worker thread finished; awaiting finish of 3 more threads
INFO - 11:14:47: work

INFO - 11:15:30: EPOCH 7 - PROGRESS: at 39.86% examples, 115226 words/s, in_qsize 0, out_qsize 0
INFO - 11:15:31: EPOCH 7 - PROGRESS: at 52.77% examples, 114985 words/s, in_qsize 0, out_qsize 0
INFO - 11:15:32: EPOCH 7 - PROGRESS: at 66.17% examples, 115310 words/s, in_qsize 0, out_qsize 0
INFO - 11:15:33: EPOCH 7 - PROGRESS: at 79.89% examples, 115460 words/s, in_qsize 0, out_qsize 0
INFO - 11:15:34: EPOCH 7 - PROGRESS: at 93.39% examples, 115292 words/s, in_qsize 0, out_qsize 0
INFO - 11:15:35: worker thread finished; awaiting finish of 6 more threads
INFO - 11:15:35: worker thread finished; awaiting finish of 5 more threads
INFO - 11:15:35: worker thread finished; awaiting finish of 4 more threads
INFO - 11:15:35: worker thread finished; awaiting finish of 3 more threads
INFO - 11:15:35: worker thread finished; awaiting finish of 2 more threads
INFO - 11:15:35: worker thread finished; awaiting finish of 1 more threads
INFO - 11:15:35: worker thread finished; awaiting finish of 0 mor

INFO - 11:16:19: EPOCH 13 - PROGRESS: at 79.30% examples, 112766 words/s, in_qsize 0, out_qsize 0
INFO - 11:16:20: EPOCH 13 - PROGRESS: at 92.75% examples, 112704 words/s, in_qsize 0, out_qsize 0
INFO - 11:16:21: worker thread finished; awaiting finish of 6 more threads
INFO - 11:16:21: worker thread finished; awaiting finish of 5 more threads
INFO - 11:16:21: worker thread finished; awaiting finish of 4 more threads
INFO - 11:16:21: worker thread finished; awaiting finish of 3 more threads
INFO - 11:16:21: worker thread finished; awaiting finish of 2 more threads
INFO - 11:16:21: worker thread finished; awaiting finish of 1 more threads
INFO - 11:16:21: worker thread finished; awaiting finish of 0 more threads
INFO - 11:16:21: EPOCH - 13 : training on 1525906 raw words (876150 effective words) took 7.7s, 113443 effective words/s
INFO - 11:16:22: EPOCH 14 - PROGRESS: at 7.88% examples, 65557 words/s, in_qsize 0, out_qsize 0
INFO - 11:16:23: EPOCH 14 - PROGRESS: at 17.10% examples, 7296

INFO - 11:17:08: worker thread finished; awaiting finish of 6 more threads
INFO - 11:17:08: worker thread finished; awaiting finish of 5 more threads
INFO - 11:17:08: worker thread finished; awaiting finish of 4 more threads
INFO - 11:17:08: worker thread finished; awaiting finish of 3 more threads
INFO - 11:17:08: worker thread finished; awaiting finish of 2 more threads
INFO - 11:17:08: worker thread finished; awaiting finish of 1 more threads
INFO - 11:17:08: EPOCH 19 - PROGRESS: at 100.00% examples, 93753 words/s, in_qsize 0, out_qsize 1
INFO - 11:17:08: worker thread finished; awaiting finish of 0 more threads
INFO - 11:17:08: EPOCH - 19 : training on 1525906 raw words (876701 effective words) took 9.4s, 93736 effective words/s
INFO - 11:17:09: EPOCH 20 - PROGRESS: at 7.30% examples, 60495 words/s, in_qsize 0, out_qsize 0
INFO - 11:17:10: EPOCH 20 - PROGRESS: at 19.67% examples, 83190 words/s, in_qsize 1, out_qsize 0
INFO - 11:17:11: EPOCH 20 - PROGRESS: at 33.34% examples, 94919 

INFO - 11:17:57: worker thread finished; awaiting finish of 0 more threads
INFO - 11:17:57: EPOCH - 24 : training on 1525906 raw words (876143 effective words) took 7.3s, 120485 effective words/s
INFO - 11:17:58: EPOCH 25 - PROGRESS: at 9.26% examples, 75711 words/s, in_qsize 0, out_qsize 0
INFO - 11:17:59: EPOCH 25 - PROGRESS: at 17.10% examples, 70217 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:00: EPOCH 25 - PROGRESS: at 25.56% examples, 70722 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:01: EPOCH 25 - PROGRESS: at 41.17% examples, 85227 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:02: EPOCH 25 - PROGRESS: at 52.04% examples, 87092 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:03: EPOCH 25 - PROGRESS: at 61.16% examples, 85616 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:04: EPOCH 25 - PROGRESS: at 72.10% examples, 86699 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:05: EPOCH 25 - PROGRESS: at 85.92% examples, 90170 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:06: EPOCH 25 - P

INFO - 11:18:47: worker thread finished; awaiting finish of 2 more threads
INFO - 11:18:47: worker thread finished; awaiting finish of 1 more threads
INFO - 11:18:47: worker thread finished; awaiting finish of 0 more threads
INFO - 11:18:47: EPOCH - 30 : training on 1525906 raw words (875961 effective words) took 6.3s, 138056 effective words/s
INFO - 11:18:48: EPOCH 31 - PROGRESS: at 16.36% examples, 142938 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:49: EPOCH 31 - PROGRESS: at 34.03% examples, 146766 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:50: EPOCH 31 - PROGRESS: at 50.74% examples, 146772 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:51: EPOCH 31 - PROGRESS: at 68.18% examples, 147878 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:52: EPOCH 31 - PROGRESS: at 85.92% examples, 148996 words/s, in_qsize 0, out_qsize 0
INFO - 11:18:52: worker thread finished; awaiting finish of 6 more threads
INFO - 11:18:52: worker thread finished; awaiting finish of 5 more threads
INFO - 11:18:5

INFO - 11:19:28: worker thread finished; awaiting finish of 0 more threads
INFO - 11:19:28: EPOCH - 37 : training on 1525906 raw words (875467 effective words) took 5.8s, 150263 effective words/s
INFO - 11:19:29: EPOCH 38 - PROGRESS: at 16.36% examples, 142193 words/s, in_qsize 0, out_qsize 0
INFO - 11:19:30: EPOCH 38 - PROGRESS: at 34.03% examples, 147151 words/s, in_qsize 0, out_qsize 0
INFO - 11:19:31: EPOCH 38 - PROGRESS: at 51.40% examples, 149093 words/s, in_qsize 0, out_qsize 0
INFO - 11:19:32: EPOCH 38 - PROGRESS: at 68.18% examples, 149163 words/s, in_qsize 0, out_qsize 0
INFO - 11:19:33: EPOCH 38 - PROGRESS: at 85.92% examples, 149700 words/s, in_qsize 0, out_qsize 0
INFO - 11:19:34: worker thread finished; awaiting finish of 6 more threads
INFO - 11:19:34: worker thread finished; awaiting finish of 5 more threads
INFO - 11:19:34: worker thread finished; awaiting finish of 4 more threads
INFO - 11:19:34: worker thread finished; awaiting finish of 3 more threads
INFO - 11:19:3

Time to train the model: 5.1 mins


In [41]:
w2v_model.wv.most_similar(positive=["honestly"])

INFO - 11:20:01: precomputing L2-norms of word weight vectors


[('think', 0.3799281120300293),
 ('really', 0.345345139503479),
 ('actually', 0.2863292694091797),
 ('impactful', 0.28042757511138916),
 ('would', 0.28009873628616333),
 ('passionate', 0.27521342039108276),
 ('though', 0.27376729249954224),
 ('happen', 0.2671111822128296),
 ('unbiased', 0.25786978006362915),
 ('moronic', 0.2577594220638275)]

In [42]:
w2v_model.wv.similarity("people", 'person')

0.40952381

# Text classification

In [44]:
# train word2vec on all the texts - both training and test set
# we're not using test labels, just texts so this is fine
from gensim.models import Word2Vec

model = Word2Vec(sentences, size=100, window=5, min_count=10, workers=2)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

t = time()

model.train(sentences, total_examples=model.corpus_count, epochs=20, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 18:20:41: collecting all words and their counts
INFO - 18:20:41: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:20:42: PROGRESS: at sentence #10000, processed 215073 words, keeping 24562 word types
INFO - 18:20:42: PROGRESS: at sentence #20000, processed 435130 words, keeping 35421 word types
INFO - 18:20:43: PROGRESS: at sentence #30000, processed 651529 words, keeping 43839 word types
INFO - 18:20:43: PROGRESS: at sentence #40000, processed 874326 words, keeping 51146 word types
INFO - 18:20:44: PROGRESS: at sentence #50000, processed 1094880 words, keeping 57292 word types
INFO - 18:20:45: PROGRESS: at sentence #60000, processed 1311363 words, keeping 63296 word types
INFO - 18:20:45: collected 68560 word types from a corpus of 1525906 raw words and 70000 sentences
INFO - 18:20:45: Loading a fresh vocabulary
INFO - 18:20:45: effective_min_count=10 retains 12459 unique words (18% of original 68560, drops 56101)
INFO - 18:20:45: effective_min_count=

INFO - 18:21:36: worker thread finished; awaiting finish of 0 more threads
INFO - 18:21:36: EPOCH - 3 : training on 1525906 raw words (1361196 effective words) took 5.4s, 251445 effective words/s
INFO - 18:21:37: EPOCH 4 - PROGRESS: at 18.31% examples, 245782 words/s, in_qsize 1, out_qsize 0
INFO - 18:21:38: EPOCH 4 - PROGRESS: at 37.28% examples, 249348 words/s, in_qsize 0, out_qsize 0
INFO - 18:21:39: EPOCH 4 - PROGRESS: at 55.87% examples, 250684 words/s, in_qsize 0, out_qsize 0
INFO - 18:21:40: EPOCH 4 - PROGRESS: at 74.82% examples, 251039 words/s, in_qsize 0, out_qsize 0
INFO - 18:21:41: EPOCH 4 - PROGRESS: at 93.39% examples, 250336 words/s, in_qsize 0, out_qsize 0
INFO - 18:21:41: worker thread finished; awaiting finish of 1 more threads
INFO - 18:21:41: worker thread finished; awaiting finish of 0 more threads
INFO - 18:21:41: EPOCH - 4 : training on 1525906 raw words (1361333 effective words) took 5.4s, 250916 effective words/s
INFO - 18:21:42: EPOCH 5 - PROGRESS: at 19.00% e

INFO - 18:22:35: worker thread finished; awaiting finish of 1 more threads
INFO - 18:22:35: worker thread finished; awaiting finish of 0 more threads
INFO - 18:22:35: EPOCH - 14 : training on 1525906 raw words (1361072 effective words) took 5.4s, 254140 effective words/s
INFO - 18:22:36: EPOCH 15 - PROGRESS: at 19.00% examples, 253904 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:37: EPOCH 15 - PROGRESS: at 37.97% examples, 251033 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:38: EPOCH 15 - PROGRESS: at 56.53% examples, 252426 words/s, in_qsize 1, out_qsize 0
INFO - 18:22:39: EPOCH 15 - PROGRESS: at 74.82% examples, 251139 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:40: EPOCH 15 - PROGRESS: at 94.08% examples, 252039 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:40: worker thread finished; awaiting finish of 1 more threads
INFO - 18:22:41: worker thread finished; awaiting finish of 0 more threads
INFO - 18:22:41: EPOCH - 15 : training on 1525906 raw words (1361292 effective words) to

Time to train the model: 1.83 mins


In [11]:
# start with the classics - naive bayes of the multinomial and bernoulli varieties
# with either pure counts or tfidf features
mult_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: sentences)), ("multinomial nb", MultinomialNB())])
bern_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: sentences)), ("bernoulli nb", BernoulliNB())])
mult_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: sentences)), ("multinomial nb", MultinomialNB())])
bern_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: sentences)), ("bernoulli nb", BernoulliNB())])
# SVM - which is supposed to be more or less state of the art 
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: sentences)), ("linear svc", SVC(kernel="linear"))])
svc_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: sentences)), ("linear svc", SVC(kernel="linear"))])

In [12]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [15]:
etree_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
# svc_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
#                         ("extra trees",svc])

In [16]:
cross_val_score(etree_w2v, sentences, train[1], cv=5)

array([0.26164286, 0.26035714, 0.25414286, 0.25835714, 0.256     ])

In [None]:
svc_w2v_tfidf.fit(sentences,train[1])

In [36]:
model.wv.most_similar(positive=["honestly"])

INFO - 18:18:25: precomputing L2-norms of word weight vectors


[('actually', 0.4714917540550232),
 ('really', 0.4584989547729492),
 ('tbh', 0.450907438993454),
 ('anyone', 0.4295053780078888),
 ('though', 0.429007887840271),
 ('dont', 0.4191727042198181),
 ('polarizing', 0.40613698959350586),
 ('lifespan', 0.39335620403289795),
 ('many_people', 0.38840723037719727),
 ('reason', 0.38497790694236755)]

In [22]:
lsvc = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("svc", LinearSVC())])

cross_val_score(lsvc, sentences, train[1], cv=5).mean()



0.299

In [23]:
lsvc = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("svc", LinearSVC(C=0.8))])

cross_val_score(lsvc, sentences, train[1], cv=3).mean()



0.2976285339995668

In [24]:
lsvc = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("svc", LinearSVC(C=0.8, max_iter=2500))])

cross_val_score(lsvc, sentences, train[1], cv=3).mean()



0.2975999627750624

In [27]:
mnb = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
                        ("mnb", MultinomialNB(alpha=1.2))])

cross_val_score(mnb, sentences, train[1], cv=5).mean()

ValueError: Negative values in data passed to MultinomialNB (input X)

ValueError: Negative values in data passed to MultinomialNB (input X)



KeyboardInterrupt: 

In [30]:
from sklearn.linear_model import LogisticRegression

lgr = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("lgr", LogisticRegression(multi_class='multinomial',max_iter=1000))])

cross_val_score(lgr, sentences, train[1], cv=5).mean()

0.29178571428571426

## min count = 5, epochs = 30

In [37]:
model.wv.most_similar(positive=["paris"])

[('australia', 0.5528700947761536),
 ('london', 0.5514118671417236),
 ('asia', 0.5451439023017883),
 ('portugal', 0.5186939239501953),
 ('france', 0.5154790878295898),
 ('moscow', 0.5140597820281982),
 ('rural', 0.503890872001648),
 ('berlin', 0.5007058382034302),
 ('apartheid', 0.5004202127456665),
 ('tokyo', 0.4965803623199463)]

In [38]:
model.wv.most_similar(positive=["london"])

[('virginia', 0.592710018157959),
 ('village', 0.5891340970993042),
 ('mayor', 0.5721107721328735),
 ('1946', 0.5619644522666931),
 ('moscow', 0.5570530295372009),
 ('1976', 0.5552209615707397),
 ('paris', 0.5514118671417236),
 ('tokyo', 0.5438507199287415),
 ('istanbul', 0.539239764213562),
 ('1950s', 0.5304075479507446)]

In [39]:
model.wv.most_similar(positive=["funny"])

[('hilarious', 0.6419342756271362),
 ('amusing', 0.5643250346183777),
 ('ironic', 0.5459393262863159),
 ('annoying', 0.5240662097930908),
 ('cringey', 0.5232990384101868),
 ('silly', 0.4757176637649536),
 ('entertaining', 0.45466816425323486),
 ('dumb', 0.45443469285964966),
 ('joke', 0.4518563747406006),
 ('funnier', 0.45147237181663513)]

In [40]:
model.wv.most_similar(positive=["sad"])

[('happy', 0.5672705769538879),
 ('bittersweet', 0.5211690664291382),
 ('depressing', 0.510392963886261),
 ('depressed', 0.5041153430938721),
 ('nostalgic', 0.4386507272720337),
 ('uneasy', 0.4362569749355316),
 ('cringey', 0.4345662593841553),
 ('chester', 0.43363049626350403),
 ('glad', 0.43283742666244507),
 ('heartbreaking', 0.42796841263771057)]

In [41]:
model.wv.most_similar(positive=["man"])

[('dude', 0.5487749576568604),
 ('lady', 0.4121003746986389),
 ('guy', 0.4017930030822754),
 ('holy_shit', 0.3794368505477905),
 ('girlfriend', 0.37439557909965515),
 ('granddaughter', 0.3686869144439697),
 ('15_year', 0.3599141836166382),
 ('slut', 0.35143059492111206),
 ('mother', 0.3511870503425598),
 ('ents', 0.3497922420501709)]

In [42]:
model.wv.most_similar(positive=["king"])

[('lich', 0.680612325668335),
 ('queen', 0.5602347254753113),
 ('poro', 0.5519959330558777),
 ('robb', 0.5336294174194336),
 ('monarch', 0.5306412577629089),
 ('baratheon', 0.4936992824077606),
 ('danny', 0.489499568939209),
 ('doran', 0.48945847153663635),
 ('kneel', 0.4888492822647095),
 ('heir', 0.48488175868988037)]

In [43]:
model.wv.most_similar(positive=["cheat"])

[('valve', 0.5266107320785522),
 ('csgo', 0.4711252450942993),
 ('prescription', 0.45646488666534424),
 ('opiate', 0.4392750859260559),
 ('cheating', 0.4268868863582611),
 ('confirm', 0.41841793060302734),
 ('toxic', 0.4131224453449249),
 ('unban', 0.40976160764694214),
 ('thrower', 0.40350669622421265),
 ('folder', 0.3997250497341156)]

## mincount = 10, epochs = 20

In [45]:
model.wv.most_similar(positive=["sad"])

INFO - 18:30:16: precomputing L2-norms of word weight vectors


[('happy', 0.5614525079727173),
 ('depressing', 0.5262471437454224),
 ('depressed', 0.49628227949142456),
 ('shame', 0.4946313798427582),
 ('disappointed', 0.4835307002067566),
 ('chester', 0.4776112735271454),
 ('nostalgic', 0.46819111704826355),
 ('ironic', 0.4626058340072632),
 ('nervous', 0.4575343430042267),
 ('disappointing', 0.4566504657268524)]

In [48]:
model.wv.most_similar(positive=["swede"])

[('swedish', 0.7197892069816589),
 ('finn', 0.7137970924377441),
 ('finland', 0.7105278372764587),
 ('anglo', 0.7088577747344971),
 ('estonia', 0.7073632478713989),
 ('malta', 0.69926917552948),
 ('culturally', 0.6989283561706543),
 ('heritage', 0.693010687828064),
 ('norwegian', 0.6929962635040283),
 ('authoritarian', 0.6672325134277344)]

In [47]:
model.wv.most_similar(positive=["norway"])

[('estonia', 0.7281621694564819),
 ('greek', 0.6562989354133606),
 ('finland', 0.6537322998046875),
 ('czech', 0.6498714089393616),
 ('bosnia', 0.6423696279525757),
 ('swede', 0.6395374536514282),
 ('republic', 0.6357571482658386),
 ('france', 0.6344835758209229),
 ('turk', 0.6317201852798462),
 ('malta', 0.6270574331283569)]

In [49]:
model.wv.most_similar(positive=["sweden"])

[('eastern_europe', 0.6872494220733643),
 ('finland', 0.6830415725708008),
 ('croatia', 0.6543526649475098),
 ('independence', 0.6518363356590271),
 ('france', 0.6482505798339844),
 ('hungary', 0.6477479934692383),
 ('turkey', 0.6448993682861328),
 ('turkish', 0.6414752006530762),
 ('ussr', 0.6368622183799744),
 ('iran', 0.6278806924819946)]

In [50]:
model.wv.most_similar(positive=["king"])

[('lich', 0.7301069498062134),
 ('robb', 0.6086158752441406),
 ('queen', 0.584738552570343),
 ('aegon', 0.5654856562614441),
 ('prince', 0.5352966785430908),
 ('danny', 0.5238744020462036),
 ('monarch', 0.5233479738235474),
 ('baratheon', 0.5145795345306396),
 ('rightful', 0.51219242811203),
 ('aemon', 0.5119877457618713)]

In [51]:
model.wv.most_similar(positive=["queen"])

[('monarch', 0.6225389242172241),
 ('king', 0.5847386121749878),
 ('heir', 0.5782822370529175),
 ('bastard', 0.5569009780883789),
 ('baratheon', 0.5562009811401367),
 ('aegon', 0.5404238700866699),
 ('elizabeth', 0.5367737412452698),
 ('wed', 0.5315330028533936),
 ('kingsguard', 0.5231883525848389),
 ('throne', 0.5116215944290161)]

In [52]:
model.wv.most_similar(positive=["honestly"])

[('genuinely', 0.498552143573761),
 ('tbh', 0.49609270691871643),
 ('actually', 0.48080170154571533),
 ('really', 0.47539612650871277),
 ('wholeheartedly', 0.4495782256126404),
 ('anyone', 0.44637858867645264),
 ('definitely', 0.43503817915916443),
 ('personally', 0.42949724197387695),
 ('realistically', 0.42518749833106995),
 ('dont', 0.40180718898773193)]

In [53]:
model.wv.most_similar(positive=["happy"])

[('sad', 0.5614525079727173),
 ('frustrated', 0.45697569847106934),
 ('nice', 0.4280584752559662),
 ('happier', 0.4143102467060089),
 ('glad', 0.4074043929576874),
 ('wanting', 0.40049436688423157),
 ('hope', 0.40004801750183105),
 ('want', 0.3929707407951355),
 ('proud', 0.3877381384372711),
 ('upset', 0.3861738443374634)]

In [56]:
cross_val_score(lgr, sentences, train[1], cv=5).mean()

0.29178571428571426

In [55]:
model.wv.most_similar(positive=["war"])

[('iraq', 0.5791922211647034),
 ('gulf', 0.5764289498329163),
 ('afghanistan', 0.5482458472251892),
 ('casualty', 0.5424610376358032),
 ('syria', 0.5380897521972656),
 ('wwii', 0.5250870585441589),
 ('military', 0.5203325748443604),
 ('syrian', 0.5167602300643921),
 ('invading', 0.5087327361106873),
 ('naval', 0.5010626912117004)]

In [59]:
tfidf = TfidfVectorizer(smooth_idf=False, sublinear_tf=True, max_df=0.5, use_idf=False)
clf = ComplementNB(alpha=0.25)
pipe = Pipeline([('vect', tfidf), ('clf', clf )])

cross_val_score(pipe,sentences,train[1]).mean()

AttributeError: 'list' object has no attribute 'lower'



KeyboardInterrupt: 