In [279]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split


# **Appraoch**
        1. Cleaning and Pre-processing
        2. Train Test Split
        3. BOW, TF-IDF, Word2Vec
        4. Train ML Algorithms

In [280]:
df = pd.read_csv(r'C:\Users\tanuj\OneDrive\Desktop\ML Projects\NLP\NLP_practice\data\all_kindle_review .csv')

In [281]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


# **Cleaning and Pre-Processing**

In [282]:
df = df[['reviewText','rating']]

In [283]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [284]:
df.isna().sum() #Checking the Null Values

reviewText    0
rating        0
dtype: int64

In [285]:
# Checking the dublicate rows
df.duplicated().sum()

0

In [286]:
# checking the dtype of the data 
df.dtypes

reviewText    object
rating         int64
dtype: object

In [287]:
df['reviewText'] = df['reviewText'].str.lower() # Replace the uppercase strings to all lowercase 

In [288]:
df['reviewText'][0]

'jace rankin may be short, but he\'s nothing to mess with, as the man who was just hauled out of the saloon by the undertaker knows now. he\'s a famous bounty hunter in oregon in the 1890s who, when he shot the man in the saloon, just finished a years long quest to avenge his sister\'s murder and is now trying to figure out what to do next. when the snotty-nosed farm boy he just rescued from a gang of bullies offers him money to kill a man who forced him off his ranch, he reluctantly agrees to bring the man to justice, but not to kill him outright. but, first he needs to tell his sister\'s widower the news.kyla "kyle" springer bailey has been riding the trails and sleeping on the ground for the past month while trying to find jace. she wants revenge on the man who killed her husband and took her ranch, amongst other crimes, and she\'s not so keen on the detour jace wants to take. but she realizes she\'s out of options, so she hides behind her boy persona as best she can and tries to ke

In [289]:
# Cleaning using Regular expression
def clean_text(text):
    # Removing Special Characters
    text = re.sub('[^a-zA-z0-9]+',' ',text)
    # Removing URL
    text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text)
    # Removing html tags
    text = BeautifulSoup(text,'lxml').get_text()
    return text

In [290]:
df['reviewText'] = df['reviewText'].apply(clean_text)

In [291]:
# Converting words to its root word and removing stopwords
lemmatizer = WordNetLemmatizer()

In [292]:
def pre_process_text(text):
    text = text.split()
    text = [word for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    return text

In [293]:
df['reviewText'] = df['reviewText'].apply(pre_process_text)

In [294]:
df['reviewText'][0]

'jace rankin may short nothing mess man hauled saloon undertaker knows famous bounty hunter oregon 1890s shot man saloon finished years long quest avenge sister murder trying figure next snotty nosed farm boy rescued gang bullies offers money kill man forced ranch reluctantly agrees bring man justice kill outright first needs tell sister widower news kyla kyle springer bailey riding trails sleeping ground past month trying find jace wants revenge man killed husband took ranch amongst crimes keen detour jace wants take realizes options hides behind boy persona best tries keep pace confrontation along way gets shot jace discovers kyle kyla come clean whole reason needs scoundrel dead hope still help book share touching moments slow blooming romance kyla find good reason fear men hide behind boy persona watching jace slowly pull shell help conquer fears endearing pain real deeply rooted disappear face sexiness neither understandable aversion marriage magically disappear round nookie would

In [295]:
df.shape

(12000, 2)

In [296]:
df['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [297]:
df['rating'].value_counts()  # This is a Balance data set

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [298]:
# Positive review is 1 and Negative review is 0
df['rating'] = df['rating'].apply(lambda x: 0 if x<3 else 1)

In [299]:
df['rating'].unique()

array([1, 0], dtype=int64)

In [300]:
df['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [301]:
df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short nothing mess man hauled ...,1
1,great short read want put read one sitting sex...,1
2,start saying first four books expecting 34 con...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [302]:
# Train Test Split
X = df['reviewText']
y = df['rating']

In [303]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [304]:
print(X_train[:5])  # Print first 5 rows to check the content


9182     looking forward book came double space every p...
11091    already owned book spouse forgot already part ...
6428     cool forgot request rate came makes mine unrel...
288      short short story basically scenes party one n...
2626     secret service agents secrests even longer ser...
Name: reviewText, dtype: object


# **BOW ~ Bag of Words & TF-IDF ~ Term frequency and Inverse document frequency**

**BOW**

In [330]:
bow = CountVectorizer(max_features=5000)

In [332]:
X_train_bow = bow.fit_transform(X_train).toarray()
X_test_bow = bow.transform(X_test).toarray()

In [309]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [310]:
nb_model = MultinomialNB().fit(X_train_bow,y_train)

In [311]:
y_pred_bow = nb_model.predict(X_test_bow)

In [313]:
print(accuracy_score(y_test,y_pred_bow))
print(classification_report(y_test,y_pred_bow))

0.8379166666666666
              precision    recall  f1-score   support

           0       0.73      0.81      0.77       803
           1       0.90      0.85      0.87      1597

    accuracy                           0.84      2400
   macro avg       0.82      0.83      0.82      2400
weighted avg       0.84      0.84      0.84      2400



# **TF-IDF**

In [315]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [316]:
X_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [317]:
nb_model = MultinomialNB().fit(X_train_tfidf,y_train)

In [318]:
y_pred_tfidf = nb_model.predict(X_test_tfidf)

In [319]:
print(accuracy_score(y_test,y_pred_tfidf))
print(classification_report(y_test,y_pred_tfidf))

0.8133333333333334
              precision    recall  f1-score   support

           0       0.86      0.53      0.65       803
           1       0.80      0.96      0.87      1597

    accuracy                           0.81      2400
   macro avg       0.83      0.74      0.76      2400
weighted avg       0.82      0.81      0.80      2400



# **ThankYou**