#### Best practices

1. preprocessing and cleaning
2. train test split
3. BOW, TFIDF, Word2Vec
4. Train ML Algo

In [1]:
### import dataset

import pandas as pd
df = pd.read_csv('data/all_kindle_review.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [2]:
## select only 'reviewText' and 'rating'

df = df[['reviewText', 'rating']]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [3]:
## shape of dataset

df.shape

(12000, 2)

In [4]:
### missing values
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [5]:
### unique rating
df['rating'].unique()

array([3, 5, 4, 2, 1])

In [6]:
## count unique ratings
df['rating'].value_counts()

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [7]:
### preprocessing and cleaning
## positive review 1 and negative review 0
df['rating']= df['rating'].apply(lambda x:0 if x <3 else 1)
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",1
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,1
3,Aggie is Angela Lansbury who carries pocketboo...,1
4,I did not expect this type of book to be in li...,1


In [8]:
### lower all the cases
df['reviewText']=df['reviewText'].str.lower()
df.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [11]:
### removing speacial characters
import re
df['reviewText'] = df['reviewText'].apply(lambda x:re.sub('[^a-z A-Z 0-9]+','',x))

## removing stopwords
import nltk
from nltk.corpus import stopwords
df['reviewText'] = df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))

## remove url
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','0',str(x)))

## remove html tags
from bs4 import BeautifulSoup
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

## remove any additional spaces
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.split()))

df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [12]:
### lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['reviewText'] = df['reviewText'].apply(lambda x:lemmatize_words(x))

In [21]:
### train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'],df['rating'], test_size=0.2, random_state=42)
print(X_train.shape)

(9600,)


In [23]:
### BOW

from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

print(X_train_bow.shape, y_train.shape)

(9600, 37363) (9600,)


In [25]:
### TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print(X_train_tfidf.shape)

(9600, 37363)


In [27]:
from gensim.models import Word2Vec
import numpy as np

# Step 1: Tokenize the text (X_train and X_test are already Series of text)
X_train_tokens = X_train.apply(lambda x: x.split())
X_test_tokens = X_test.apply(lambda x: x.split())

# Step 2: Train Word2Vec model
# Combine train and test tokens for vocabulary building
all_tokens = X_train_tokens.tolist() + X_test_tokens.tolist()

w2v_model = Word2Vec(
    sentences=all_tokens,
    vector_size=100,      
    window=5,             
    min_count=2,          
    workers=4,            
    sg=0,                 
    epochs=10
)

print(f"Word2Vec model trained. Vocabulary size: {len(w2v_model.wv)}")

# Step 3: Create document vectors by averaging word vectors
def document_vector(tokens, model, vector_size=100):
    """
    Create document embedding by averaging word vectors
    """
    # Filter tokens that exist in vocabulary
    valid_tokens = [token for token in tokens if token in model.wv]
    
    if len(valid_tokens) == 0:
        return np.zeros(vector_size)
    
    # Average all word vectors
    return np.mean([model.wv[token] for token in valid_tokens], axis=0)

# Step 4: Create feature vectors for train and test
X_train_w2v = np.array([document_vector(tokens, w2v_model) 
                        for tokens in X_train_tokens])
X_test_w2v = np.array([document_vector(tokens, w2v_model) 
                       for tokens in X_test_tokens])

print(f"Training set shape: {X_train_w2v.shape}")
print(f"Test set shape: {X_test_w2v.shape}")

# Step 5: Train a classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train_w2v, y_train)

# Predictions
y_pred = clf.predict(X_test_w2v)

# Evaluation
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Word2Vec model trained. Vocabulary size: 17718
Training set shape: (9600, 100)
Test set shape: (2400, 100)

Accuracy: 0.8017

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.61      0.67       803
           1       0.82      0.90      0.86      1597

    accuracy                           0.80      2400
   macro avg       0.79      0.75      0.77      2400
weighted avg       0.80      0.80      0.80      2400



In [29]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(X_train_bow.toarray(), y_train)
nb_model_tfidf = GaussianNB().fit(X_train_tfidf.toarray(), y_train)

MemoryError: Unable to allocate 2.67 GiB for an array with shape (9600, 37363) and data type int64

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
y_pred_bow = nb_model_bow.predict(X_test_bow)
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)

print("BOW performance metrics")
print("accuracy; ", accuracy_score(y_test, y_pred_bow))
print("confusion matrix; ", confusion_matrix(y_test, y_pred_bow))
print("classification report; ", classification_report(y_test, y_pred_bow))

print("TFIDF performance metrics")
print("accuracy; ", accuracy_score(y_test, y_pred_tfidf))
print("confusion matrix; ", confusion_matrix(y_test, y_pred_tfidf))
print("classification report; ", classification_report(y_test, y_pred_tfidf))




In [None]:
# Step 5: Train a classifier (example with Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_w2v, y_train)

# Predictions
y_pred = clf.predict(X_test_w2v)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))