## Loading Processed Reviews

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv("dataset\processed_reviews.csv")
df.head(5)

Unnamed: 0,Review_text,Rating
0,liked,1
1,bought phone amazon using samsung m30s couple ...,1
2,awesome book reasonable price must buy,1
3,good,1
4,book fine bad contains nice concepts nicely ex...,1


In [11]:
df.dropna(subset=["Review_text"], inplace=True)
df.isnull().sum()

Review_text    0
Rating         0
dtype: int64

## Word2Vec Implem

In [9]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [15]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # Download WordNet data if you haven't already
nltk.download('punkt_tab')
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nirma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nirma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [3]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [12]:
corpus = []

for index, row in df.iterrows():
    sentences = row["Review_text"]  # Split into sentences
    corpus.append(sentences)

In [19]:
## Let's see the first sentence in the corpus

corpus[10]

'good option'

In [16]:
words=[]
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [20]:
## The words of the corpus[10] is made into a sub-list
words [10]

['good', 'option']

In [24]:
## Training Word2Vec from scratch

w2v_model=gensim.models.Word2Vec(words, epochs=150)

In [38]:
## Running a similarity search for a word
w2v_model.wv.similar_by_word("good")

[('nice', 0.8603376746177673),
 ('awesome', 0.8299969434738159),
 ('excellent', 0.8229254484176636),
 ('great', 0.7919813394546509),
 ('decent', 0.7783781886100769),
 ('average', 0.7271357178688049),
 ('superb', 0.7168322205543518),
 ('ok', 0.701496422290802),
 ('awsome', 0.672161340713501),
 ('amazing', 0.6701365113258362)]

### Word2Vec has been implemented but we need to do AvgWord2Vec.

But Why?

For every word it is giving a 100 dimension vector, what we want is 100 dim per sentence, for better generalization 


In [28]:
def avg_word2vec(sentence):
    word_vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv.index_to_key]
    if word_vectors:  # Check if word_vectors is not empty
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)  # Return a zero vector if no words are found

In [29]:
from tqdm import tqdm

In [30]:
import numpy as np
## apply for all sentences in corpus
X=[]

for i in tqdm(range(len(words))): ## words is a list of sentences which has  mini list of words
  X.append(avg_word2vec(words[i]))


100%|██████████| 60654/60654 [00:11<00:00, 5320.45it/s]


In [31]:
## All sentences are converted to 100 length vectors
print(X[0].shape)
print(X[60653].shape)

(100,)
(100,)


#### Now our training data is the 100 feature length vector pointing to each sentence

In [39]:
## Independent Features
X_new = np.array(X)

In [40]:
## Dependent Feature
y = df["Rating"]

In [41]:
## Verify Length
print(X_new.shape)
print(y.shape)

(60654, 100)
(60654,)


In [42]:
processed_df = pd.DataFrame(X_new)


In [43]:
processed_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-2.009762,-2.108592,-0.223621,-1.393906,0.433622,0.835111,2.576652,-0.094758,0.126729,-1.42802,...,0.314802,-0.191911,1.493221,-0.645711,-2.028114,-2.393979,1.394179,0.546807,-1.051062,-2.832405
1,0.160922,-0.047205,-0.197426,0.100856,-0.378788,1.007363,0.24461,-0.215053,0.465857,-0.419049,...,-0.683699,0.509766,0.25554,-0.609829,-0.216296,-0.164153,-0.274602,-0.012934,-0.186139,0.180499
2,0.577347,0.329095,1.305352,-1.771341,0.04348,-1.115728,0.207343,0.308839,1.338823,-1.030806,...,-0.1871,-1.191622,0.21199,-0.347762,0.775164,-0.919379,0.319311,-0.179645,0.365293,0.608363
3,-1.597558,0.113899,0.260711,-0.19022,1.039261,2.134495,-1.837562,-0.867553,0.602202,-0.472719,...,0.138886,0.898425,0.720696,-0.648485,-1.058941,-0.225319,-0.877628,0.522586,-0.288065,0.515723
4,-1.238543,1.222089,1.032567,-0.37943,0.4243,-0.395015,-0.893469,0.445555,-0.220653,-1.365061,...,0.099366,0.509544,0.219448,-0.063983,0.45115,0.38579,-0.456647,-0.896907,0.773374,0.848653


In [45]:
## Train-Test Split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size=0.2,random_state=42)

In [46]:
len(X_train),len(X_test),len(y_train),len(y_test)

(48523, 12131, 48523, 12131)

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


classifier = RandomForestClassifier()

classifier.fit(X_train, y_train)



In [48]:
## Check Score on test
y_pred = classifier.predict(X_test)
print(f"Word2Vec Accuracy: {accuracy_score(y_test, y_pred)}")

Word2Vec Accuracy: 0.908334020278625


In [49]:
## Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.51      0.62      1779
           1       0.92      0.98      0.95     10352

    accuracy                           0.91     12131
   macro avg       0.85      0.74      0.78     12131
weighted avg       0.90      0.91      0.90     12131



In [51]:
## Pickle the model and the classifier
import pickle
pickle.dump(w2v_model, open("artifacts/w2v_model.pkl", "wb"))
pickle.dump(classifier, open("artifacts/classifier.pkl", "wb"))
