Libraries

In [35]:
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import gensim
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import numpy as np


Load Dataset

In [2]:
df=pd.read_csv("E:/Krish course materials/Gen AI/Spam Ham email Prediction/all_kindle_review.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


Choosing only the selected columns for analysis

In [3]:
df=df[["reviewText","rating"]]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [4]:
df.shape

(12000, 2)

In [5]:
df.dtypes

reviewText    object
rating         int64
dtype: object

Checking for Null values

In [6]:
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

Checking for Duplicated rows

In [7]:
df.duplicated().sum()

0

In [8]:
df["rating"].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [9]:
df["rating"].value_counts()

5    3000
4    3000
3    2000
2    2000
1    2000
Name: rating, dtype: int64

#### Preprocessing and cleaning

Converting the rating column to positive review or negative review 

In [10]:
df["rating"]=df["rating"].apply(lambda x: 0 if x<3 else 1)
df["rating"]

0        1
1        1
2        1
3        1
4        1
        ..
11995    1
11996    1
11997    1
11998    0
11999    1
Name: rating, Length: 12000, dtype: int64

In [11]:
df["rating"].unique()

array([1, 0], dtype=int64)

In [12]:
df["rating"].value_counts()

1    8000
0    4000
Name: rating, dtype: int64

Lowering the Text column 

In [13]:
df["reviewText"]=df["reviewText"].str.lower()
df["reviewText"].head()

0    jace rankin may be short, but he's nothing to ...
1    great short read.  i didn't want to put it dow...
2    i'll start by saying this is the first of four...
3    aggie is angela lansbury who carries pocketboo...
4    i did not expect this type of book to be in li...
Name: reviewText, dtype: object

Cleaning the text column

In [14]:
# Removing special characters
df["reviewText"]=df["reviewText"].apply(lambda x:re.sub('[^a-z A-Z 0-9]','',x))
# Removing url
df["reviewText"]=df["reviewText"].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','',str(x)))
# Removing html tags
df["reviewText"]=df["reviewText"].apply(lambda x:BeautifulSoup(x,'html.parser').get_text())
# Removing any additional spaces
df["reviewText"]=df["reviewText"].apply(lambda x:" ".join(x.split()))
df["reviewText"]

0        jace rankin may be short but hes nothing to me...
1        great short read i didnt want to put it down s...
2        ill start by saying this is the first of four ...
3        aggie is angela lansbury who carries pocketboo...
4        i did not expect this type of book to be in li...
                               ...                        
11995    valentine cupid is a vampire jena and ian anot...
11996    i have read all seven books in this series apo...
11997    this book really just wasnt my cuppa the situa...
11998    tried to use it to charge my kindle it didnt e...
11999    taking instruction is a look into the often hi...
Name: reviewText, Length: 12000, dtype: object

In [15]:

lemmatizer=WordNetLemmatizer()

In [16]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word,pos="v") for word in text.split()])

In [17]:
df["reviewText"]=df["reviewText"].apply(lambda x:lemmatize_words(x))

Tokenize reviewsText into words and train a Word2Vec model on the tokenized data using Gensim.

In [24]:
words=[]
for i in range(len(df["reviewText"])):
    sent_token=sent_tokenize(df["reviewText"][i])
    for sent in sent_token:
        word_token=word_tokenize(sent)
        words.append(word_token)


In [27]:
words


[['jace',
  'rankin',
  'may',
  'be',
  'short',
  'but',
  'hes',
  'nothing',
  'to',
  'mess',
  'with',
  'as',
  'the',
  'man',
  'who',
  'be',
  'just',
  'haul',
  'out',
  'of',
  'the',
  'saloon',
  'by',
  'the',
  'undertaker',
  'know',
  'now',
  'hes',
  'a',
  'famous',
  'bounty',
  'hunter',
  'in',
  'oregon',
  'in',
  'the',
  '1890s',
  'who',
  'when',
  'he',
  'shoot',
  'the',
  'man',
  'in',
  'the',
  'saloon',
  'just',
  'finish',
  'a',
  'years',
  'long',
  'quest',
  'to',
  'avenge',
  'his',
  'sisters',
  'murder',
  'and',
  'be',
  'now',
  'try',
  'to',
  'figure',
  'out',
  'what',
  'to',
  'do',
  'next',
  'when',
  'the',
  'snottynosed',
  'farm',
  'boy',
  'he',
  'just',
  'rescue',
  'from',
  'a',
  'gang',
  'of',
  'bully',
  'offer',
  'him',
  'money',
  'to',
  'kill',
  'a',
  'man',
  'who',
  'force',
  'him',
  'off',
  'his',
  'ranch',
  'he',
  'reluctantly',
  'agree',
  'to',
  'bring',
  'the',
  'man',
  'to',
  '

In [28]:
model=gensim.models.Word2Vec(words)

In [29]:
model.wv.index_to_key

['the',
 'be',
 'and',
 'a',
 'to',
 'i',
 'of',
 'it',
 'this',
 'in',
 'have',
 'that',
 'book',
 'for',
 'but',
 'read',
 'her',
 'with',
 'story',
 'not',
 'as',
 'she',
 'you',
 'he',
 'like',
 'on',
 'do',
 'his',
 'one',
 'character',
 'so',
 'get',
 'just',
 'love',
 'they',
 'more',
 'all',
 'me',
 'at',
 'my',
 'about',
 'good',
 'there',
 'from',
 'an',
 'if',
 'would',
 'what',
 'out',
 'really',
 'very',
 'or',
 'make',
 'by',
 'when',
 'time',
 'author',
 'go',
 'write',
 'who',
 'up',
 'think',
 'will',
 'find',
 'its',
 'some',
 'their',
 'want',
 'know',
 'no',
 'other',
 'them',
 'series',
 'end',
 'much',
 'enjoy',
 'first',
 'him',
 'into',
 'well',
 'can',
 'even',
 'because',
 'give',
 'how',
 'didnt',
 'only',
 'too',
 'short',
 'take',
 'could',
 'great',
 'sex',
 'little',
 'say',
 'than',
 'dont',
 'way',
 'interest',
 'then',
 'two',
 'see',
 'after',
 'feel',
 'come',
 'plot',
 'keep',
 'also',
 'which',
 'romance',
 'start',
 'look',
 'work',
 'seem',
 'im'

In [30]:
model.corpus_count

12000

In [31]:
model.epochs

5

In [32]:
model.wv.similar_by_word("happy")

[('satisfy', 0.6080904006958008),
 ('glad', 0.5584081411361694),
 ('sad', 0.5501219630241394),
 ('curious', 0.5409419536590576),
 ('impress', 0.5380830764770508),
 ('sweet', 0.527277410030365),
 ('abrupt', 0.5202017426490784),
 ('up', 0.5198305249214172),
 ('disappoint', 0.5191224217414856),
 ('excite', 0.515183687210083)]

In [39]:
vec=model.wv['happy']
vec

array([ 2.98025638e-01, -3.98638844e-01, -1.01382062e-01,  2.03538865e-01,
       -3.06154072e-01, -1.30380094e+00,  3.60691071e-01,  1.28109813e-01,
        8.44988525e-01, -2.94281930e-01,  1.81951538e-01, -5.56005776e-01,
        1.49840638e-01, -2.09350705e-01,  1.21656142e-03, -1.96890905e-01,
       -5.16332090e-01,  2.01961160e-01,  2.18087524e-01,  4.13577080e-01,
        6.67196393e-01,  3.54111671e-01, -3.05591226e-01,  6.80025637e-01,
       -1.01986730e+00,  2.96957672e-01,  1.26155531e+00, -2.07787335e-01,
        4.04800773e-01,  1.40571058e+00,  5.29373825e-01, -3.51638719e-02,
        5.56542814e-01,  1.83153138e-01, -5.49765646e-01,  1.09632027e+00,
        4.54469234e-01,  6.26811862e-01, -5.35343699e-02, -8.10221136e-01,
        3.01474810e-01, -7.91866109e-02,  1.77116141e-01, -2.22783640e-01,
        8.13657165e-01,  1.19561696e+00,  3.10500175e-01, -3.18184793e-02,
        1.34864718e-01, -4.29526754e-02,  9.10016239e-01, -2.84825504e-01,
        3.50529134e-01, -

In [34]:
model.wv['happy'].shape

(100,)

In [38]:
model.wv.similarity("charge","battery")

0.7665421

In [40]:
model.wv.most_similar([vec])

[('happy', 1.0),
 ('satisfy', 0.6080904603004456),
 ('glad', 0.5584081411361694),
 ('sad', 0.5501219630241394),
 ('curious', 0.5409419536590576),
 ('impress', 0.5380830764770508),
 ('sweet', 0.527277410030365),
 ('abrupt', 0.5202017426490784),
 ('up', 0.5198305249214172),
 ('disappoint', 0.5191223621368408)]

Converting word2vec to Avg word2vec to train the model

In [42]:
def avg_word2vec(text):
    return np.mean([model.wv[word] for word in text if word in model.wv.index_to_key],axis=0)

In [43]:
X=[]
for i in range(len(words)):
    X.append(avg_word2vec(words[i]))

In [47]:
X

[array([ 0.23221779,  0.20747186,  0.121171  , -0.19712222, -0.07409979,
        -0.25299698,  0.43993664,  0.30654058, -0.10672162, -0.13572633,
         0.01525741, -0.25621715,  0.13711134, -0.01233067,  0.07519311,
         0.1726185 ,  0.42277232,  0.3254359 ,  0.1577228 , -0.3981517 ,
         0.08622272, -0.1060781 , -0.00876775, -0.12282039, -0.60087657,
         0.42820615,  0.2578384 , -0.04529215, -0.0656691 ,  0.4577014 ,
         0.1603621 , -0.17476927,  0.18554175, -0.466598  , -0.00506418,
         0.29802036,  0.34836608, -0.11390495, -0.18450244, -0.33650726,
         0.1603419 , -0.19874346, -0.14764242,  0.27133837,  0.27895877,
         0.13028868, -0.26392868, -0.26176703,  0.23513924, -0.02441481,
         0.09015245, -0.42347854,  0.12273371,  0.00332884, -0.03025609,
         0.20611054, -0.00663763,  0.28378394, -0.19440661,  0.00758512,
        -0.3980538 ,  0.08820976, -0.05054988, -0.11426789,  0.19845808,
         0.3768643 ,  0.26193646,  0.10538924, -0.0

In [46]:
len(X)

12000

In [48]:
X_new=np.array(X)

In [49]:
X_new[0]

array([ 0.23221779,  0.20747186,  0.121171  , -0.19712222, -0.07409979,
       -0.25299698,  0.43993664,  0.30654058, -0.10672162, -0.13572633,
        0.01525741, -0.25621715,  0.13711134, -0.01233067,  0.07519311,
        0.1726185 ,  0.42277232,  0.3254359 ,  0.1577228 , -0.3981517 ,
        0.08622272, -0.1060781 , -0.00876775, -0.12282039, -0.60087657,
        0.42820615,  0.2578384 , -0.04529215, -0.0656691 ,  0.4577014 ,
        0.1603621 , -0.17476927,  0.18554175, -0.466598  , -0.00506418,
        0.29802036,  0.34836608, -0.11390495, -0.18450244, -0.33650726,
        0.1603419 , -0.19874346, -0.14764242,  0.27133837,  0.27895877,
        0.13028868, -0.26392868, -0.26176703,  0.23513924, -0.02441481,
        0.09015245, -0.42347854,  0.12273371,  0.00332884, -0.03025609,
        0.20611054, -0.00663763,  0.28378394, -0.19440661,  0.00758512,
       -0.3980538 ,  0.08820976, -0.05054988, -0.11426789,  0.19845808,
        0.3768643 ,  0.26193646,  0.10538924, -0.0982123 ,  0.28

In [51]:
X_new.shape

(12000, 100)

In [None]:
# Output feature
y=df["rating"]
y

0        1
1        1
2        1
3        1
4        1
        ..
11995    1
11996    1
11997    1
11998    0
11999    1
Name: rating, Length: 12000, dtype: int64

In [55]:
y.shape

(12000,)

Creating a new DataFrame to train the model for X_new

In [None]:
# Input feature
X=pd.DataFrame(X_new)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.232218,0.207472,0.121171,-0.197122,-0.074100,-0.252997,0.439937,0.306541,-0.106722,-0.135726,...,0.372192,0.118740,-0.131865,-0.224679,0.202464,-0.117807,0.041836,-0.016122,0.173371,0.073152
1,0.157264,0.300210,0.026937,-0.071554,-0.439813,-0.854805,0.738013,0.704788,-0.077263,-0.521622,...,0.287862,0.253533,-0.155225,-0.542795,0.197625,-0.328209,0.077331,0.125909,0.972358,-0.132409
2,0.000881,0.299314,-0.037191,-0.069903,-0.226599,-0.461910,0.657529,0.589756,-0.094257,-0.405245,...,0.261706,0.204720,-0.067416,-0.295773,0.202688,-0.029077,0.128714,0.025313,0.414760,-0.061131
3,0.112574,0.084931,-0.048802,0.236457,-0.079486,-0.196823,0.627533,0.493576,-0.275368,-0.445682,...,0.180729,0.281820,0.362950,0.052613,0.311358,-0.042630,0.171721,-0.141846,0.286145,-0.249711
4,-0.047774,0.490813,-0.046589,0.006983,-0.175193,-0.535010,0.964719,0.750125,-0.290236,-0.531842,...,0.164786,0.327490,-0.275275,-0.544165,-0.030015,-0.386476,0.125393,0.301045,0.417657,-0.068255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,-0.124619,0.238546,0.094781,0.003148,0.073538,-0.225862,0.517390,0.490844,-0.048525,-0.608695,...,0.318957,-0.098717,0.195662,0.011033,0.140139,0.470851,0.011445,-0.102727,0.148267,0.280251
11996,0.095232,0.296147,-0.017649,0.077046,-0.242641,-0.612002,0.884482,0.724478,-0.125182,-0.661496,...,0.192912,0.322059,0.046952,-0.355289,0.092190,-0.335649,-0.113961,0.131279,0.590712,-0.123931
11997,0.203389,0.252874,0.056235,-0.248798,-0.254835,-0.416939,0.599763,0.527151,-0.105575,-0.388781,...,0.391608,0.145632,-0.134706,-0.389348,0.190454,0.004252,0.103991,-0.070696,0.459970,-0.056877
11998,0.100185,0.588961,-0.156284,-0.222995,-0.304730,-0.540805,0.484468,0.095218,-0.234797,0.066889,...,0.241382,0.224860,-0.507261,-0.627352,0.172357,-0.246253,-0.083979,0.251625,0.537948,0.017550


Train Test Split

In [57]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

Training the model with AdaBoostClassifier

In [60]:
from sklearn.ensemble import AdaBoostClassifier
model= AdaBoostClassifier().fit(X_train,y_train)



In [61]:
model

In [62]:
y_pred=model.predict(X_test)

In [63]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

Metric Scores

In [64]:
accuracy_score(y_test,y_pred)

0.75875

In [66]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.68      0.56      0.61       817
           1       0.79      0.86      0.82      1583

    accuracy                           0.76      2400
   macro avg       0.73      0.71      0.72      2400
weighted avg       0.75      0.76      0.75      2400



In [67]:
confusion_matrix(y_test,y_pred)

array([[ 457,  360],
       [ 219, 1364]], dtype=int64)