In [92]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec,KeyedVectors
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,classification_report
import warnings
warnings.filterwarnings('ignore')


# **Load The Data**

In [3]:
df = pd.read_csv(r'data\all_kindle_review .csv')

In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


# **Cleaning and Pre-processing of Data**

In [6]:
df = df[['reviewText','rating']]

In [7]:
df.isna().sum()

reviewText    0
rating        0
dtype: int64

In [10]:
df['rating'] = df['rating'].apply(lambda x: 0 if x<3 else 1)

In [12]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",1
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,1
3,Aggie is Angela Lansbury who carries pocketboo...,1
4,I did not expect this type of book to be in li...,1


In [14]:
df['reviewText'] = df['reviewText'].str.lower()

In [18]:
# Cleaning using Regular expression
def clean_text(text):
    # Removing Special Characters
    text = re.sub('[^a-zA-z0-9]+',' ',text)
    # Removing URL
    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)
    # Removing html tags
    text = BeautifulSoup(text,'lxml').get_text()
    return text

In [19]:
df['reviewText'] = df['reviewText'].apply(clean_text)

In [21]:
def pre_process_text(text):
    text = text.split()
    text = [word for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    return text

In [22]:
df['reviewText'] = df['reviewText'].apply(pre_process_text)

In [23]:
df['reviewText'][0]

'jace rankin may short nothing mess man hauled saloon undertaker knows famous bounty hunter oregon 1890s shot man saloon finished years long quest avenge sister murder trying figure next snotty nosed farm boy rescued gang bullies offers money kill man forced ranch reluctantly agrees bring man justice kill outright first needs tell sister widower news kyla kyle springer bailey riding trails sleeping ground past month trying find jace wants revenge man killed husband took ranch amongst crimes keen detour jace wants take realizes options hides behind boy persona best tries keep pace confrontation along way gets shot jace discovers kyle kyla come clean whole reason needs scoundrel dead hope still help book share touching moments slow blooming romance kyla find good reason fear men hide behind boy persona watching jace slowly pull shell help conquer fears endearing pain real deeply rooted disappear face sexiness neither understandable aversion marriage magically disappear round nookie would

In [25]:
a = list(map(len,df['reviewText']))
list(filter(lambda num: num<1,a))

[]

**Loading the Good Word2Vec Model for converting tokens to vectors**

In [26]:
# Loading the Google Word2Vec Model
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')
v_k_king = wv['king']

**Converting documents to the words/Tokens**

In [54]:
tokens = []

for sent in df['reviewText']:
    sent_token = sent_tokenize(sent)
    for word in sent_token:
        tokens.append(simple_preprocess(word))  # Tokenization

In [56]:
# Let's train this word2vec model from scratch
model = gensim.models.Word2Vec(tokens)

In [66]:
# to get all the vocabulary
len(model.wv.index_to_key)

9896

In [58]:
model.corpus_count

12000

In [59]:
model.epochs

5

In [60]:
model.wv.similar_by_word('positive')

[('rave', 0.9805091619491577),
 ('terrible', 0.9805081486701965),
 ('movie', 0.9792441725730896),
 ('stated', 0.9782194495201111),
 ('fair', 0.9780911207199097),
 ('alas', 0.9761949777603149),
 ('fault', 0.9739845991134644),
 ('compelled', 0.9735496044158936),
 ('suffice', 0.9715210795402527),
 ('meant', 0.9713084101676941)]

In [52]:
df['reviewText'][0]

0        jace rankin may short nothing mess man hauled ...
1        great short read want put read one sitting sex...
2        start saying first four books expecting 34 con...
3        aggie angela lansbury carries pocketbooks inst...
4        expect type book library pleased find price right
                               ...                        
11995    valentine cupid vampire jena ian another vampi...
11996    read seven books series apocalyptic adventure ...
11997    book really cuppa situation man capturing woma...
11998    tried use charge kindle even register charging...
11999    taking instruction look often hidden world sex...
Name: reviewText, Length: 12000, dtype: object

In [68]:
def avg_word2vec(doc):
    # Collect vectors only for words that are in the model's vocabulary
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    
    # If no word is in the model, return a zero vector (or you can return None based on your use case)
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector with the same dimensions as Word2Vec vectors (adjust size based on model)
        return np.zeros(model.vector_size)

In [69]:
from tqdm import tqdm

In [70]:
# Apply the Avg Word2Vec to every sentence
X = []

for i in tqdm(range(len(tokens))):
    X.append(avg_word2vec(tokens[i]))

100%|██████████| 12000/12000 [00:13<00:00, 908.98it/s]


In [71]:
X[0]

[array([-0.281378  ,  0.39000988,  0.14210576, -0.19574448,  0.08137444,
        -0.47618926,  0.16061227,  0.6914358 , -0.05361806, -0.21768424,
        -0.18114538, -0.54760295, -0.0577124 ,  0.1271986 ,  0.02348149,
        -0.15252715,  0.20524251, -0.42302534, -0.04175119, -0.51376444,
         0.07403649,  0.13088448,  0.16306257, -0.26346046, -0.12926766,
         0.07440615, -0.2868229 , -0.2991338 , -0.20941047,  0.00580248,
         0.3883097 ,  0.22968173, -0.04398709, -0.3215918 , -0.12310809,
         0.38173392, -0.12795874, -0.2911186 , -0.19228038, -0.5037918 ,
         0.1141332 , -0.18076341, -0.20424558, -0.13825502,  0.24893385,
        -0.1659545 , -0.12747934, -0.03371937,  0.21826749,  0.24944745,
         0.17416644, -0.2142686 , -0.00981412, -0.02411588, -0.17385857,
         0.07644884,  0.27605608,  0.01389365, -0.14131589,  0.03502082,
         0.08979297,  0.07307647, -0.0263056 , -0.07896022, -0.44625962,
         0.1862077 ,  0.11920673,  0.13756335, -0.4

In [72]:
X_new = np.array(X)

In [73]:
X_new.shape

(12000, 100)

In [74]:
df['reviewText'].shape

(12000,)

In [75]:
y = df['rating']

In [76]:
y

0        1
1        1
2        1
3        1
4        1
        ..
11995    1
11996    1
11997    1
11998    0
11999    1
Name: rating, Length: 12000, dtype: int64

In [77]:
new_df = pd.DataFrame(X_new)

In [79]:
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.281378,0.39001,0.142106,-0.195744,0.081374,-0.476189,0.160612,0.691436,-0.053618,-0.217684,...,0.294354,0.203319,0.200092,0.085489,0.464407,0.167736,0.152129,-0.074084,0.181339,0.083247
1,-0.386615,0.471767,0.087999,-0.143058,0.125888,-0.747925,0.216509,1.046431,-0.24423,-0.255803,...,0.677941,0.132873,0.34279,-0.393562,0.661509,0.211994,0.309745,-0.041328,0.238298,0.007169
2,-0.335573,0.431236,0.163691,0.042151,0.045633,-0.59775,0.172803,0.85485,-0.289718,-0.196259,...,0.418581,0.094287,0.164093,-0.075129,0.555014,0.208969,0.135871,-0.266357,0.096004,0.038944
3,-0.430456,0.390558,0.249022,0.388653,0.062283,-0.528311,0.190362,0.796267,-0.499843,-0.122947,...,0.42541,-0.004631,-0.025452,-0.036713,0.307946,0.341102,0.205326,-0.290011,0.043521,-0.080665
4,-0.089918,0.307439,-0.014399,-0.149726,0.175095,-0.516441,-0.053689,0.935147,-0.133568,-0.19717,...,0.721548,0.197374,0.238906,-0.237469,0.425457,0.322489,0.169577,-0.160468,0.229457,-0.199894


In [80]:
new_df['rating'] = y

In [81]:
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,rating
0,-0.281378,0.39001,0.142106,-0.195744,0.081374,-0.476189,0.160612,0.691436,-0.053618,-0.217684,...,0.203319,0.200092,0.085489,0.464407,0.167736,0.152129,-0.074084,0.181339,0.083247,1
1,-0.386615,0.471767,0.087999,-0.143058,0.125888,-0.747925,0.216509,1.046431,-0.24423,-0.255803,...,0.132873,0.34279,-0.393562,0.661509,0.211994,0.309745,-0.041328,0.238298,0.007169,1
2,-0.335573,0.431236,0.163691,0.042151,0.045633,-0.59775,0.172803,0.85485,-0.289718,-0.196259,...,0.094287,0.164093,-0.075129,0.555014,0.208969,0.135871,-0.266357,0.096004,0.038944,1
3,-0.430456,0.390558,0.249022,0.388653,0.062283,-0.528311,0.190362,0.796267,-0.499843,-0.122947,...,-0.004631,-0.025452,-0.036713,0.307946,0.341102,0.205326,-0.290011,0.043521,-0.080665,1
4,-0.089918,0.307439,-0.014399,-0.149726,0.175095,-0.516441,-0.053689,0.935147,-0.133568,-0.19717,...,0.197374,0.238906,-0.237469,0.425457,0.322489,0.169577,-0.160468,0.229457,-0.199894,1


In [83]:
X = new_df.drop(columns='rating')
y = new_df['rating']

In [86]:
new_df.shape

(12000, 101)

In [90]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [91]:
dt = DecisionTreeClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)

models = [dt,gb,lr,rf]

In [93]:
model_list = ["Decision Tree Model","Gradient Boosting Model", "Logistic Regression Model","Random Forest Model"]

for m,name in zip(models,model_list):
    m.fit(X_train,y_train)
    y_pred = m.predict(X_test)
    
    print(f'Accuracy of {name}: {accuracy_score(y_test,y_pred)}')
    print(f'Report of {name}:   {classification_report(y_test,y_pred)}')
    print('#'*30)
    print()

Accuracy of Decision Tree Model: 0.6654166666666667
Report of Decision Tree Model:                 precision    recall  f1-score   support

           0       0.50      0.54      0.52       803
           1       0.76      0.73      0.74      1597

    accuracy                           0.67      2400
   macro avg       0.63      0.63      0.63      2400
weighted avg       0.67      0.67      0.67      2400

##############################

Accuracy of Gradient Boosting Model: 0.7604166666666666
Report of Gradient Boosting Model:                 precision    recall  f1-score   support

           0       0.67      0.55      0.61       803
           1       0.79      0.87      0.83      1597

    accuracy                           0.76      2400
   macro avg       0.73      0.71      0.72      2400
weighted avg       0.75      0.76      0.75      2400

##############################

Accuracy of Logistic Regression Model: 0.765
Report of Logistic Regression Model:                 precis

# **Thankyou**