In [None]:
## Image credit [4]

In [None]:
# !pip install tensorflow_datasets

# Load Libraries and Data

In [1]:
import tensorflow_datasets as tfds

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
ds_train = tfds.load(
    'yelp_polarity_reviews',
    split='train',
    shuffle_files=True,
)
ds_train = tfds.as_dataframe(ds_train)

In [5]:
ds_train.head()

Unnamed: 0,label,text
0,0,b'wow. used to so much like the ones by us in...
1,1,"b""Went for dinner last night and was very impr..."
2,0,"b'Cheap food, every time I have gone they alwa..."
3,1,b'The salesperson there was very gracious and ...
4,1,"b'When I decided to buy a scooter, I had no id..."


In [6]:
ds_train.shape

(560000, 2)

In [7]:
# how much data to keep
ds_train = ds_train.sample(frac = 0.75, random_state=42)

In [8]:
ds_train.shape

(420000, 2)

In [9]:
ds_train.iloc[0]["text"]

b"First time: 5 stars\\n+ I seriously left dreaming about this place and couldn't wait to return\\n+ Really fresh fish\\n+ AYCE sushi that's affordable\\n+ AYCE includes desserts (I ordered 3 -  yes, I have a sweet tooth)\\n+ Special nigiri items can be ordered as well\\n- Long wait (but it was totally worth it)\\n- No sashimi\\n\\nSecond time: 4 stars\\n+ Wait wasn't as long\\n- Fish wasn't as fresh \\n\\nOverall would definitely return to this off-the-strip AYCE sushi joint.  It's such a good deal and the fresh is relatively fresh.  If you're not into sushi they have other options as well.  Till next time Goyemon!"

In [10]:
ds_train.iloc[0]["label"]

1

### Step 1 - Text Pre-Processing

In [4]:
from gensim import utils

In [12]:
# lowercase
# ignore too short and too long tokens
utils.simple_preprocess(ds_train.iloc[0]["text"])[:20]

['first',
 'time',
 'stars',
 'seriously',
 'left',
 'dreaming',
 'about',
 'this',
 'place',
 'and',
 'couldn',
 'wait',
 'to',
 'return',
 'really',
 'fresh',
 'fish',
 'ayce',
 'sushi',
 'that']

In [13]:
from tqdm.notebook import tqdm

In [14]:
# tokenize each review and store as list of words
tokenized_reviews = []
for text in tqdm(ds_train["text"]): 
    tokenized_reviews.append(utils.simple_preprocess(text))

  0%|          | 0/420000 [00:00<?, ?it/s]

In [5]:
# We will detect bigrams
from gensim.models import Phrases

In [16]:
tokenized_reviews[:1]

[['first',
  'time',
  'stars',
  'seriously',
  'left',
  'dreaming',
  'about',
  'this',
  'place',
  'and',
  'couldn',
  'wait',
  'to',
  'return',
  'really',
  'fresh',
  'fish',
  'ayce',
  'sushi',
  'that',
  'affordable',
  'ayce',
  'includes',
  'desserts',
  'ordered',
  'yes',
  'have',
  'sweet',
  'tooth',
  'special',
  'nigiri',
  'items',
  'can',
  'be',
  'ordered',
  'as',
  'well',
  'long',
  'wait',
  'but',
  'it',
  'was',
  'totally',
  'worth',
  'it',
  'no',
  'sashimi',
  'nsecond',
  'time',
  'stars',
  'wait',
  'wasn',
  'as',
  'long',
  'fish',
  'wasn',
  'as',
  'fresh',
  'noverall',
  'would',
  'definitely',
  'return',
  'to',
  'this',
  'off',
  'the',
  'strip',
  'ayce',
  'sushi',
  'joint',
  'it',
  'such',
  'good',
  'deal',
  'and',
  'the',
  'fresh',
  'is',
  'relatively',
  'fresh',
  'if',
  'you',
  're',
  'not',
  'into',
  'sushi',
  'they',
  'have',
  'other',
  'options',
  'as',
  'well',
  'till',
  'next',
  'time',

In [17]:
# fir the gensim bigram model
bigram = Phrases(tokenized_reviews)

In [18]:
# pass the list of tokenized reviews to the bigram model object
bigram_tokenized_reviews = bigram[tokenized_reviews]

In [19]:
bigram_tokenized_reviews[0]

['first',
 'time',
 'stars',
 'seriously',
 'left',
 'dreaming_about',
 'this',
 'place',
 'and',
 'couldn',
 'wait',
 'to',
 'return',
 'really',
 'fresh',
 'fish',
 'ayce_sushi',
 'that',
 'affordable',
 'ayce',
 'includes',
 'desserts',
 'ordered',
 'yes',
 'have',
 'sweet_tooth',
 'special',
 'nigiri',
 'items',
 'can',
 'be',
 'ordered',
 'as',
 'well',
 'long',
 'wait',
 'but',
 'it',
 'was',
 'totally_worth',
 'it',
 'no',
 'sashimi',
 'nsecond',
 'time',
 'stars',
 'wait',
 'wasn',
 'as',
 'long',
 'fish',
 'wasn',
 'as',
 'fresh',
 'noverall',
 'would',
 'definitely',
 'return',
 'to',
 'this',
 'off',
 'the',
 'strip',
 'ayce_sushi',
 'joint',
 'it',
 'such',
 'good',
 'deal',
 'and',
 'the',
 'fresh',
 'is',
 'relatively',
 'fresh',
 'if',
 'you',
 're',
 'not',
 'into',
 'sushi',
 'they',
 'have',
 'other',
 'options',
 'as',
 'well',
 'till',
 'next',
 'time',
 'goyemon']

In [20]:
bigram.save("bigram_model.pkl")

In [6]:
bigram = Phrases.load("bigram_model.pkl")

# Train Word2Vec Model

In [8]:
import gensim

In [None]:
# fitting the word2vec model
# sg = 0 is for CBOW 1 for skip-gram
# model = gensim.models.Word2Vec(sentences=bigram_tokenized_reviews, vector_size = 100, 
#                               window = 5, min_count = 5, max_vocab_size = None, sg = 0, workers = 6)

In [None]:
# save the model so you dont have to train again
# model.save("yelp_word2vec")

In [9]:
# load the model
model = gensim.models.Word2Vec.load("yelp_word2vec")

In [24]:
# how many words
len(model.wv.key_to_index)

89073

In [25]:
# get a particular word vector using the word itself
model.wv.get_vector("mcdonalds")

array([ 0.5954207 ,  0.9215558 ,  0.5002837 , -1.0792872 ,  0.15311298,
        0.38278088,  1.1268    ,  0.5279211 ,  0.5314047 , -0.5242518 ,
       -0.63606566,  0.65701336, -0.665191  ,  0.1737934 ,  0.17205867,
        0.50990766, -0.73550326,  1.5150176 , -1.3236736 , -0.9226875 ,
        0.815984  ,  0.17829499, -1.3470298 ,  1.3521905 ,  1.5117137 ,
        1.0000699 ,  1.5407293 , -0.18895806, -1.1945325 , -1.7376685 ,
       -2.8940902 ,  0.11137597,  1.0917991 , -2.7263606 ,  0.02620541,
        1.5034566 ,  1.2069572 , -0.26808172, -0.72729355,  0.92445034,
       -0.6396061 , -0.03437746, -1.7500983 , -0.60100937, -0.0658905 ,
       -0.6145809 ,  0.2629476 , -1.104312  , -0.945986  , -0.03267215,
       -1.6544546 ,  0.5292398 ,  0.4113373 ,  0.6807149 ,  3.0900521 ,
        0.48221853,  2.9971917 ,  1.7336515 ,  2.395294  ,  1.1309831 ,
        0.39629656, -0.2856317 ,  1.6745335 , -1.0451968 , -2.7337499 ,
        2.686315  , -1.8451014 ,  2.4041522 , -2.5629377 ,  0.81

In [26]:
# check most similar words
model.wv.most_similar('mcdonalds')

[('mcdonald', 0.9432393908500671),
 ('burger_king', 0.9023249745368958),
 ('mcd', 0.8458371758460999),
 ('denny', 0.8444104194641113),
 ('subway', 0.8436354994773865),
 ('taco_bell', 0.8319615721702576),
 ('wendy', 0.8295611143112183),
 ('dennys', 0.8171852827072144),
 ('waffle_house', 0.809360921382904),
 ('panda_express', 0.8076522946357727)]

In [27]:
# final word embedding matrix
model.wv.vectors.shape

(89073, 100)

# Can we visualize Word Vectors?

In [None]:
all_sims = model.wv.most_similar('mcdonalds', topn=model.wv.vectors.shape[0])
top_10 = list(all_sims[:10])

In [None]:
top_10

In [None]:
# all_sims = model.wv.most_similar('mcdonalds', topn=model.wv.vectors.shape[0])
not_top_10 = list(all_sims[190:200])

In [None]:
not_top_10

In [None]:
top_10_vectors = []
for word in top_10:
    cur_word = word[0]
    cur_word_vec =  model.wv.get_vector(cur_word)
    top_10_vectors.append(cur_word_vec)

In [None]:
top_10_vectors = np.array(top_10_vectors)

In [None]:
not_top_10_vectors = []
for word in not_top_10:
    cur_word = word[0]
    cur_word_vec =  model.wv.get_vector(cur_word)
    not_top_10_vectors.append(cur_word_vec)

In [None]:
not_top_10_vectors = np.array(not_top_10_vectors)

In [None]:
top_10_vectors_df = pd.DataFrame(top_10_vectors)

In [None]:
top_10_vectors_df["word"] = [item[0] for item in top_10]

In [None]:
not_top_10_vectors_df = pd.DataFrame(not_top_10_vectors)

In [None]:
not_top_10_vectors_df["word"] = [item[0] for item in not_top_10]

In [None]:
top_not_top_10 = pd.concat([top_10_vectors_df, not_top_10_vectors_df], ignore_index = True)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components = 3)

In [None]:
pca.fit(top_not_top_10.iloc[:,:-1])

In [None]:
projected_vectors = pca.transform(top_not_top_10.iloc[:,:-1])

In [None]:
projected_vectors_df = pd.concat([pd.DataFrame(projected_vectors), pd.DataFrame(top_not_top_10["word"])],1)

In [None]:
projected_vectors_df.columns = ["x", "y", "z", "word"]

In [None]:
projected_vectors_df

In [None]:
import plotly.express as px

fig = px.scatter_3d(projected_vectors_df, x="x", y="y", z="z",
              text='word')
fig.write_html("word_cloud_yelp_reviews.html")
fig.show()

In [None]:
# see the word to numeric index mapping
model.wv.key_to_index

In [None]:
# confirm the word vectors for index 0 is the same if you use indexing or name

# using indexing
model.wv.vectors[0]

In [None]:
list(model.wv.key_to_index)[0]

In [None]:
# confirm the word vectors for index 0 is the same
# using name
model.wv.get_vector(list(model.wv.key_to_index)[0])

# Is Word2Vec Smart?

### Vector Algebra

In [None]:
## reference for the analogies [5]

## these analogies might not work if you reduce the data too much

## in that case some of the words below might not have been in the training corpus
## and therefore we wouldnt have any word vectors for them

#### breakfast + lunch = ?

In [None]:
model.wv.most_similar(positive=["breakfast", "lunch"])

#### lunch - day + night = ?

In [None]:
model.wv.most_similar(positive=[u'lunch', u'night'], negative=[u'day'])

#### coffee - drink + snack = ?

In [None]:
model.wv.most_similar(positive=[u'coffee', u'snack'], negative=[u'drink'])

#### Burger King + fine dining = ?

In [None]:
model.wv.most_similar(positive=[u'burger_king', u'fine_dining'])

#### Denny's + fine dining = ?

In [None]:
model.wv.most_similar(positive=[u'dennys', u'fine_dining'])

#### Applebee's + italian = ?

In [None]:
model.wv.most_similar(positive=[u"applebee", u'italian'])

#### Applebee's + pancakes = ?

In [None]:
model.wv.most_similar(positive=[u"applebee", u'pancakes'])

#### Applebee's + pizza = ?

In [None]:
model.wv.most_similar(positive=[u"applebee", u'pizza'])

In [None]:
## most similar is only one of the utility functions
## but gensim offers many different utility functions

In [None]:
model.wv.similarity("breakfast", "lunch")

In [None]:
model.wv.similarity("breakfast", "car")

In [None]:
model.wv.doesnt_match(["breakfast", "car", "lunch", "dinner"])

# What if we get some new data?

In [28]:
# load a new sample of data
ds_test = tfds.load(
    'yelp_polarity_reviews',
    split='test',
    shuffle_files=True,
)
ds_test = tfds.as_dataframe(ds_test)

In [29]:
ds_test.shape

(38000, 2)

In [30]:
ds_test.head()

Unnamed: 0,label,text
0,0,"b'Was not impressed, and will not return.'"
1,0,b'I went in to purchase overalls and was treat...
2,0,b'This place really is horrible... Every time ...
3,1,b'First time visit..... enjoyed their little ...
4,0,"b'I\'ll start with the good - Price, Location..."


In [31]:
# tokenize each review and store as list of words
tokenized_test_reviews = []
for text in tqdm(ds_test["text"]): 
    tokenized_test_reviews.append(utils.simple_preprocess(text))


# pass the list of tokenized reviews to the bigram model object
bigram_tokenized_test_reviews = bigram[tokenized_test_reviews]

  0%|          | 0/38000 [00:00<?, ?it/s]

In [32]:
############ update the word2vec model with the test data as well? 
model.build_vocab(bigram_tokenized_test_reviews, update=True)
model.train(bigram_tokenized_test_reviews, total_examples=model.corpus_count, epochs=model.epochs)




(17169765, 22901205)

In [33]:
len(model.wv.key_to_index)

89170

In [34]:
# final word embedding matrix
model.wv.vectors.shape

(89170, 100)

In [None]:
# save the model so you dont have to train again
model.save("yelp_word2vec_updated_test")

In [10]:
# load the model
model = gensim.models.Word2Vec.load("yelp_word2vec_updated_test")

In [36]:
vocab = set(model.wv.index_to_key)

# Using Word2Vec for Feature Extraction

In [37]:
# function to average the word vectors
# ignores the words that are not in the vocab
def average_word_vectors(idx):
    text = bigram_tokenized_test_reviews[idx]
    word_vectors = []
    for word in text: 
        if len(vocab.intersection([word])) == 1:
            word_vectors.append(model.wv.get_vector(word))
    
    if len(word_vectors) > 0:
        return np.array(word_vectors).mean(0)
    
    else:
        return(np.zeros((1, model.wv.vectors.shape[1])))
    
    

In [38]:
# use the abive function and store in a list
average_word_vectors_test = np.zeros((ds_test.shape[0], model.wv.vectors.shape[1]))
counter = 0
for i in  tqdm(range(ds_test.shape[0])):
    average_word_vectors_test[counter,:] = average_word_vectors(i)
    counter = counter + 1

  0%|          | 0/38000 [00:00<?, ?it/s]

In [39]:
import pandas as pd

In [40]:
# convert to a dataframe
test_data_with_average_vectors = pd.concat([pd.DataFrame(average_word_vectors_test), ds_test["label"]],1)

  test_data_with_average_vectors = pd.concat([pd.DataFrame(average_word_vectors_test), ds_test["label"]],1)


In [41]:
test_data_with_average_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
0,1.424024,-0.700056,0.097169,0.674064,0.527084,0.598445,1.202447,-0.257022,0.130901,0.215007,...,0.851286,-0.990031,-0.121236,0.194301,-0.6312,-0.251535,0.629992,0.226676,-0.343596,0
1,0.206348,-0.315422,0.428004,-0.36989,-0.950479,0.258945,0.689689,-0.33247,0.36471,0.304101,...,0.772579,-0.985501,-0.914002,-0.458235,-0.302389,-0.637496,0.138051,0.265036,-0.097515,0
2,0.023439,0.170056,-0.174807,-0.183004,-0.139097,0.381339,0.068949,0.15284,0.173853,-0.070727,...,0.145002,0.40671,-0.49565,-0.089761,0.30798,0.157156,0.397281,0.349354,-0.363138,0
3,-0.193409,0.103001,-0.529377,0.278799,-0.232966,-0.209779,-0.482386,-0.122429,0.195486,0.237129,...,-0.041765,-0.069597,0.607927,0.489913,0.857186,-0.490995,0.034571,-0.342732,-0.36546,1
4,0.25276,0.036868,-0.016586,0.287304,-0.672313,-0.076965,-0.03788,0.347945,0.015328,-0.583083,...,-0.037533,-0.410264,-0.253524,-0.14389,0.07243,-0.443154,0.348752,0.31902,-0.476614,0


In [42]:
# drop the na row
test_data_with_average_vectors = test_data_with_average_vectors.dropna()

In [43]:
# random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [44]:
rf = RandomForestClassifier(n_jobs = 6)

In [45]:
# grid search
from sklearn.model_selection import GridSearchCV

In [46]:
# pass the parameters here as a dictionary
param_grid = {"n_estimators": [100]}

In [47]:
# instantiate the grid search object
gs = GridSearchCV(rf, param_grid, cv = 5, verbose = 3)

In [48]:
# fit the gridsearch object
gs.fit(test_data_with_average_vectors.iloc[:,:-1], test_data_with_average_vectors.iloc[:,-1])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................n_estimators=100;, score=0.866 total time=  11.5s
[CV 2/5] END ..................n_estimators=100;, score=0.872 total time=   6.6s
[CV 3/5] END ..................n_estimators=100;, score=0.869 total time=   6.5s
[CV 4/5] END ..................n_estimators=100;, score=0.873 total time=   6.7s
[CV 5/5] END ..................n_estimators=100;, score=0.868 total time=   6.6s


In [49]:
# 5 fold averaged score
gs.best_score_

0.8697368421052631

# Bag of Words

In [50]:
# countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
# pipeline to put different steps together
from sklearn.pipeline import Pipeline

In [None]:
# from sklearn.preprocessing import FunctionTransformer

In [None]:
# count_vec = CountVectorizer(min_df=15, lowercase=True, ngram_range=(1,2))

In [None]:
# count_vec.fit(ds_test["text"].astype(str))

In [None]:
# text_count_vec = count_vec.transform(ds_test["text"].astype(str))

In [None]:
# text_count_vec.shape

In [None]:
# text_labels = ds_test["label"].values.astype(float)

In [None]:
# text_count_vec = text_count_vec.toarray()

In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
# first step count vectorize
# next step classification
pipeline_bow = Pipeline(
    [
        ("vect", CountVectorizer(min_df=5, lowercase=True, ngram_range=(1,2))),
#         ("transform",FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
        ("clf", LogisticRegression(penalty = "elasticnet", solver  ="saga")),
    ]
)


In [54]:
# pass the parameters by writing the name you used for the steps and then "__"
param_grid_bow = {"clf__l1_ratio": [0.5], 
                 "vect__max_features": [100, 500, 1000, 2000]}

In [57]:
# instantiate the grid search object
lr_bow_gs = GridSearchCV(pipeline_bow, param_grid_bow, cv = 5, verbose = 3, error_score='raise')

In [58]:
# fit by directly passing the text data
lr_bow_gs.fit(ds_test["text"].astype(str), ds_test["label"].astype(str))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__l1_ratio=0.5, vect__max_features=100;, score=0.761 total time=  13.6s
[CV 2/5] END clf__l1_ratio=0.5, vect__max_features=100;, score=0.770 total time=  14.2s
[CV 3/5] END clf__l1_ratio=0.5, vect__max_features=100;, score=0.760 total time=  13.1s
[CV 4/5] END clf__l1_ratio=0.5, vect__max_features=100;, score=0.769 total time=  13.8s
[CV 5/5] END clf__l1_ratio=0.5, vect__max_features=100;, score=0.774 total time=  12.9s




[CV 1/5] END clf__l1_ratio=0.5, vect__max_features=500;, score=0.868 total time=  23.1s




[CV 2/5] END clf__l1_ratio=0.5, vect__max_features=500;, score=0.866 total time=  20.5s




[CV 3/5] END clf__l1_ratio=0.5, vect__max_features=500;, score=0.858 total time=  20.2s




[CV 4/5] END clf__l1_ratio=0.5, vect__max_features=500;, score=0.868 total time=  19.9s




[CV 5/5] END clf__l1_ratio=0.5, vect__max_features=500;, score=0.868 total time=  20.2s




[CV 1/5] END clf__l1_ratio=0.5, vect__max_features=1000;, score=0.889 total time=  26.0s




[CV 2/5] END clf__l1_ratio=0.5, vect__max_features=1000;, score=0.889 total time=  26.8s




[CV 3/5] END clf__l1_ratio=0.5, vect__max_features=1000;, score=0.892 total time=  29.7s




[CV 4/5] END clf__l1_ratio=0.5, vect__max_features=1000;, score=0.894 total time=  26.9s




[CV 5/5] END clf__l1_ratio=0.5, vect__max_features=1000;, score=0.891 total time=  26.6s




[CV 1/5] END clf__l1_ratio=0.5, vect__max_features=2000;, score=0.905 total time=  37.4s




[CV 2/5] END clf__l1_ratio=0.5, vect__max_features=2000;, score=0.909 total time=  38.9s




[CV 3/5] END clf__l1_ratio=0.5, vect__max_features=2000;, score=0.908 total time=  38.1s




[CV 4/5] END clf__l1_ratio=0.5, vect__max_features=2000;, score=0.911 total time=  38.1s




[CV 5/5] END clf__l1_ratio=0.5, vect__max_features=2000;, score=0.912 total time=  38.0s




In [59]:
# 5 fold averaged score
lr_bow_gs.best_score_

0.9091052631578946

#### Doc2Vec Training

In [12]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [61]:
bigram_tokenized_reviews[0]

['first',
 'time',
 'stars',
 'seriously',
 'left',
 'dreaming_about',
 'this',
 'place',
 'and',
 'couldn',
 'wait',
 'to',
 'return',
 'really',
 'fresh',
 'fish',
 'ayce_sushi',
 'that',
 'affordable',
 'ayce',
 'includes',
 'desserts',
 'ordered',
 'yes',
 'have',
 'sweet_tooth',
 'special',
 'nigiri',
 'items',
 'can',
 'be',
 'ordered',
 'as',
 'well',
 'long',
 'wait',
 'but',
 'it',
 'was',
 'totally_worth',
 'it',
 'no',
 'sashimi',
 'nsecond',
 'time',
 'stars',
 'wait',
 'wasn',
 'as',
 'long',
 'fish',
 'wasn',
 'as',
 'fresh',
 'noverall',
 'would',
 'definitely',
 'return',
 'to',
 'this',
 'off',
 'the',
 'strip',
 'ayce_sushi',
 'joint',
 'it',
 'such',
 'good',
 'deal',
 'and',
 'the',
 'fresh',
 'is',
 'relatively',
 'fresh',
 'if',
 'you',
 're',
 'not',
 'into',
 'sushi',
 'they',
 'have',
 'other',
 'options',
 'as',
 'well',
 'till',
 'next',
 'time',
 'goyemon']

In [62]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(bigram_tokenized_reviews)]


KeyboardInterrupt



In [None]:
## dm = 0 (wont train word vectors unless dbow_words = 1) 
## is for PV-DBOW and dm = 1 for PV-DM (this will also train wordvectors and will be slower)

## PV DM trains word vecs and doc vecs jointly so they will be in the same latent space

## you can also combine dbow docvec training with skip-gram word2vec training

In [None]:
## this is dbow training with no word vectors training
model_doc2vec = Doc2Vec(documents, vector_size=100, window=5, min_count=5, 
                max_vocab_size = None, dm = 0, dbow_words = 0, workers = 6)

In [None]:
model_doc2vec.dv.vectors.shape[0]

In [None]:
model_doc2vec.wv.vectors.shape[0]

In [None]:
model_doc2vec.save("doc2vec_model")

In [13]:
model_doc2vec = Doc2Vec.load("doc2vec_model")

In [64]:
doc_vectors_test = np.zeros((ds_test.shape[0], model_doc2vec.dv.vectors.shape[1]))
counter = 0
for i in  tqdm(range(ds_test.shape[0])):
    doc_vectors_test[counter,:] = model_doc2vec.infer_vector(bigram_tokenized_test_reviews[i])
    counter = counter + 1

  0%|          | 0/38000 [00:00<?, ?it/s]

In [65]:
model_doc2vec.infer_vector(["breakfast", "lunch"])

array([ 0.12740333, -0.04252623, -0.12010309, -0.15111555, -0.02159969,
       -0.0466487 , -0.05456675, -0.10682402, -0.16092806, -0.00024953,
       -0.1980863 ,  0.00542467,  0.06942987, -0.00358879, -0.01333259,
        0.08755029,  0.11056387,  0.1593351 ,  0.09521074,  0.16908862,
       -0.10223883,  0.07661804, -0.00250234, -0.04443074,  0.02750483,
        0.02344697,  0.1514215 , -0.12096574, -0.08844744, -0.05042389,
       -0.04914619, -0.0687502 , -0.0707244 ,  0.03824665,  0.07084145,
       -0.07525028,  0.11513499,  0.02423349, -0.09514002,  0.00283078,
        0.05316712,  0.11461578,  0.0645297 , -0.05068608,  0.04649041,
       -0.06147944, -0.01180126, -0.00935863,  0.02278047,  0.08635486,
       -0.11586186, -0.02865634,  0.02269515, -0.08487106, -0.04284452,
        0.06117215,  0.07111584, -0.14696641, -0.224923  ,  0.02988753,
        0.08003271,  0.03400295,  0.05538803,  0.02615677,  0.11765283,
       -0.07268206, -0.03791004,  0.00281054,  0.01295131,  0.03

In [66]:
test_data_with_doc_vectors = pd.concat([pd.DataFrame(doc_vectors_test), ds_test["label"]],1)

  test_data_with_doc_vectors = pd.concat([pd.DataFrame(doc_vectors_test), ds_test["label"]],1)


In [67]:
test_data_with_doc_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
0,-0.010963,0.024556,0.033487,-0.030405,-0.0107,0.160846,0.0616,0.045562,-0.308516,-0.129442,...,0.046861,0.05025,-0.028404,0.047572,-0.109506,-0.010114,0.040207,-0.121606,0.043996,0
1,0.002518,0.067657,0.215513,0.113546,-0.178188,0.046497,0.033798,0.153311,-0.0576,-0.49499,...,0.023305,0.085243,-0.196187,-0.002344,-0.035031,-0.239109,0.044488,-0.386034,-0.026507,0
2,0.060576,-0.321535,0.124541,-0.589404,0.117055,-0.210162,-0.243618,0.080497,-0.386968,-0.189406,...,0.145396,-0.087417,-0.061523,0.195065,0.138643,-0.055118,0.164058,-0.636384,-0.225305,0
3,-0.1279,0.011965,-0.151027,-0.070436,0.431418,-0.276977,0.003214,-0.389544,-0.206026,-0.029095,...,0.198486,0.02489,0.259501,0.250445,0.182513,0.194714,0.020806,-0.252609,-0.014311,1
4,0.302764,0.530054,-0.029385,-0.454152,-0.136055,-0.090069,0.08384,-0.095688,0.072761,0.540542,...,0.363625,-0.173468,-0.195882,0.298731,-0.521504,-0.015584,1.108533,-0.179358,-0.375045,0


In [68]:
rf = RandomForestClassifier(n_jobs = 6)

In [69]:
param_grid = {"n_estimators": [100]}

In [70]:
gs_doc2vec = GridSearchCV(rf, param_grid, cv = 5, verbose = 3)

In [71]:
gs_doc2vec.fit(test_data_with_doc_vectors.iloc[:,:-1], test_data_with_doc_vectors.iloc[:,-1])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................n_estimators=100;, score=0.877 total time=   9.9s
[CV 2/5] END ..................n_estimators=100;, score=0.869 total time=   6.3s
[CV 3/5] END ..................n_estimators=100;, score=0.875 total time=   6.5s
[CV 4/5] END ..................n_estimators=100;, score=0.874 total time=   6.4s
[CV 5/5] END ..................n_estimators=100;, score=0.874 total time=   6.4s


In [72]:
gs_doc2vec.best_score_

0.873578947368421

### References for images used

[1] https://openclassrooms.com/en/courses/6532301-introduction-to-natural-language-processing/6980811-apply-a-simple-bag-of-words-approach

[2] https://swatimeena989.medium.com/training-word2vec-using-gensim-14433890e8e4

[3] http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

[4] https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

[5] https://github.com/pwharrison/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

[6] https://stackoverflow.com/questions/67697776/how-did-online-training-work-in-the-word2vec-model-using-genism

[7] https://stackoverflow.com/questions/53616003/doc2vec-online-training