In [4]:
import gensim
gensim.__version__
import numpy as np
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import  Word2Vec
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 12.0 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.6-py3-none-any.whl (97 kB)
     ---------------------------------------- 97.9/97.9 kB ? eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2023.6.3-cp311-cp311-win_amd64.whl (268 kB)
     -------------------------------------- 268.0/268.0 kB 8.3 MB/s eta 0:00:00
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.6 nltk-3.8.1 regex-2023.6.3



[notice] A new release of pip available: 22.3.1 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
dataset = pd.read_csv(r'C:\Users\Anonymous\Desktop\Machine Learning\Datasets\Restaurant_Reviews.tsv', delimiter = '\t')

dataset.head()




Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [8]:
def preprocess(text_col, size):
    corpus = []
    for i in range(0, size):
        review = re.sub('[^a-zA-Z]', ' ', text_col[i])
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stops)]
        review = ' '.join(review)
        review = review.split()
        if review == []:
          review = ["anything"]
        corpus.append(review)
    return corpus


In [9]:
nltk.download('stopwords')

stops = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anonymous\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:

corpus = preprocess(text_col=dataset['Review'],
                    size=dataset.shape[0])

len(corpus)



1000

In [18]:
corpus[:3]

"""CBOW: sg = 0
Skip-Gram: sg = 1
"""

model_r =  Word2Vec(corpus, min_count=1, vector_size=100, sg=0)

means = []
for sentence in corpus :
    sent = np.array([model_r.wv.get_vector(word) for word in sentence ])
    row_means = sent.mean(axis=0)
    means.append(row_means)
print(means)


[array([-2.4893647e-03,  4.8283180e-03,  6.0674925e-03, -2.8892758e-03,
       -2.5840201e-03, -9.9929133e-03, -3.4418378e-03,  9.0297982e-03,
       -4.0729488e-03, -6.0139853e-03,  2.8880090e-03, -3.8047396e-03,
       -1.8876993e-04,  1.3774122e-03,  4.8083267e-03,  4.6240599e-03,
        4.4361344e-03,  2.2477680e-03,  6.6754455e-04, -7.6338965e-03,
        4.3593836e-03, -3.5406344e-04,  7.3395371e-03, -7.6720840e-03,
        3.2821253e-03, -2.6154658e-04, -5.9525645e-03, -2.5874516e-03,
       -1.4902529e-03,  4.5433911e-03,  6.1504487e-03, -2.8558339e-03,
        2.8707684e-04,  9.7159296e-05,  3.8106602e-03,  5.6854761e-03,
        1.5974789e-03,  1.7824963e-04,  1.9636811e-03,  4.6534243e-04,
        1.9815788e-03, -1.0600599e-03, -5.1420392e-03, -3.3769969e-04,
        3.6829896e-04,  1.8138516e-04, -2.2946263e-03,  1.5433417e-03,
       -3.4970667e-03,  1.3495721e-03, -5.3157192e-04, -6.2151947e-03,
        2.3273051e-04,  2.9773663e-03,  2.3042001e-03,  8.4956951e-04,
     

In [19]:
means = np.array(means)
print(means)


[[-0.00248936  0.00482832  0.00606749 ... -0.0039599  -0.00132851
  -0.00285674]
 [ 0.00410618 -0.00112031 -0.00397767 ...  0.00081174  0.00304167
  -0.00740726]
 [-0.00521496  0.00181101 -0.00672057 ... -0.0001369   0.00624619
  -0.00342533]
 ...
 [-0.00335364  0.00013293 -0.00160126 ... -0.00363972  0.00139065
   0.00052057]
 [-0.00117236 -0.00082415  0.00021906 ... -0.0021179   0.00520433
  -0.00169304]
 [-0.00114351  0.00053284  0.00145375 ... -0.00363293 -0.002106
  -0.0034314 ]]


In [None]:

X = means
y = dataset.iloc[:, 1]

X.shape

y.shape

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.20,
                                                    random_state = 23,
                                                    stratify=y)
classifier = RandomForestClassifier(random_state=23)
classifier.fit(X_train, y_train)


In [13]:
"""**Model Evaluation**

"""

y_pred_prob = classifier.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))



0.7046000000000001
0.64


In [14]:
"""#### Grid Search CV"""

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
print(classifier.get_params())

params = {'max_features':[2,5,10,20,50]}
gcv = GridSearchCV(classifier, param_grid=params, cv=kfold, verbose=3)
gcv.fit(X, y)

print(gcv.best_params_)

print(gcv.best_score_)



{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 23, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ....................max_features=2;, score=0.650 total time=   0.3s
[CV 2/5] END ....................max_features=2;, score=0.610 total time=   0.2s
[CV 3/5] END ....................max_features=2;, score=0.675 total time=   0.2s
[CV 4/5] END ....................max_features=2;, score=0.685 total time=   0.2s
[CV 5/5] END ....................max_features=2;, score=0.660 total time=   0.2s
[CV 1/5] END ....................max_features=5;, score=0.645 total time=   0.4s
[CV 2/5] END ....................max_features=5;, score=0.650 t

In [15]:
test_corp = ['bad taste', 'horrible','love']
tst_corpus = preprocess(text_col=test_corp,
                    size=3)

print(tst_corpus)

test_means = []
for sentence in tst_corpus :
    word_vects = [model_r.wv.get_vector(word, norm=True) for word in sentence ]
    row_means = np.mean(word_vects,axis=0)
    test_means.append(row_means)
test_means = np.array( test_means )

y_pred = gcv.predict(test_means)
print(y_pred)

[['bad', 'tast'], ['horribl'], ['love']]
[0 0 1]
