### Importing Necessary Modules

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

# Fetch the NLTK corpora this notebook relies on. The captured output below shows that
# already-installed packages are only checked for freshness, so re-running the cell is cheap.
nltk.download("stopwords")  # English stop-word list read by lemmatize() further down
nltk.download("wordnet")  # presumably required by WordNetLemmatizer (imported above) — TODO confirm it is still used

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading the Dataset

In [4]:
# Load the raw Kindle review dump; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows "Unnamed: 0" and "Unnamed: 0.1" columns, which look like
# index columns written out by an earlier to_csv — confirm, and consider index_col/usecols.
df = pd.read_csv("../data/kindle_reviews.csv")

In [3]:
# Preview the raw frame; pandas elides the middle rows in the rendered output.
# NOTE(review): this cell's execution counter (3) is lower than the load cell's (4) above,
# i.e. the cells were run out of order — re-run top to bottom before sharing.
df

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200
...,...,...,...,...,...,...,...,...,...,...
982614,982614,B00M13FNSS,"[2, 2]",5,Yasss hunny! This is a great read. That Dre is...,"07 23, 2014",A2Y66HD4J5S7QZ,Candi,A Hot Read Indeed!!,1406073600
982615,982615,B00M13FNSS,"[0, 0]",5,I ENJOYED THIS BOOK FROM BEGINNING TO END NOW ...,"07 23, 2014",A17YHECC8H9NEY,Margie,VERY GOOD BOOK,1406073600
982616,982616,B00M13FNSS,"[1, 1]",5,Great book! Cherika was a fool. She let that m...,"07 23, 2014",A20KO0BPMNREJL,Nicki,Great Read,1406073600
982617,982617,B00M13FNSS,"[0, 0]",5,When I say this was an excellent book please b...,"07 23, 2014",A1BQO66R6OLCCW,Nikey,Wow!!,1406073600


***Note***
- For this analysis, only the review rating (`overall`) and the review text (`reviewText`) are used.

In [5]:
# Keep only the rating and the review body.
# The explicit .copy() makes `data` an independent frame: the column assignments further down
# then modify `data` itself, never a view of `df`, and cannot raise SettingWithCopyWarning.
data = df[["overall", "reviewText"]].copy()

In [8]:
# Preview the two-column working frame (rating + review text).
data

Unnamed: 0,overall,reviewText
0,5,I enjoy vintage books and movies so I enjoyed ...
1,4,This book is a reissue of an old one; the auth...
2,4,This was a fairly interesting read. It had ol...
3,5,I'd never read any of the Amy Brewster mysteri...
4,4,"If you like period pieces - clothing, lingo, y..."
...,...,...
982614,5,Yasss hunny! This is a great read. That Dre is...
982615,5,I ENJOYED THIS BOOK FROM BEGINNING TO END NOW ...
982616,5,Great book! Cherika was a fool. She let that m...
982617,5,When I say this was an excellent book please b...


In [9]:
# (rows, columns) of the working frame before any cleaning.
data.shape

(982619, 2)

### Text Preprocessing and Data Cleaning

In [13]:
# Count the missing entries in each column
data.isna().sum()

overall        0
reviewText    22
dtype: int64

In [6]:
# reviewText has missing entries, so discard every row that contains a missing value
data = data.dropna(axis="index", how="any")

In [16]:
# Confirm that no missing values remain
data.isna().sum()

overall       0
reviewText    0
dtype: int64

In [17]:
# Count rows that repeat an earlier (rating, text) pair
data.duplicated(keep="first").sum()

np.int64(278)

In [7]:
# Keep the first occurrence of each (rating, text) pair and drop the repeats
data = data.drop_duplicates(keep="first")

In [19]:
# Confirm that no duplicate rows remain
data.duplicated(keep="first").sum()

np.int64(0)

In [8]:
# Distribution of the raw star ratings, to gauge class imbalance
data["overall"].value_counts()

overall
5    575031
4    253959
3     96181
2     34130
1     23018
Name: count, dtype: int64

In [9]:
# Binarise the star rating into a sentiment label:
# ratings of 3 and above are positive (1), ratings below 3 are negative (0).
# The vectorised comparison replaces a per-row Python lambda over the whole column.
# NOTE: `overall` is overwritten in place, so run this cell exactly once per kernel —
# re-running it would compare the 0/1 labels against 3 and turn every label into 0.
data['overall'] = (data['overall'] >= 3).astype('int64')

In [None]:
# Label distribution after binarisation.
# The classes are heavily skewed towards positive (1); this imbalance still has to be addressed later on.
data["overall"].value_counts()

overall
1    925171
0     57148
Name: count, dtype: int64

In [10]:
# Normalise the review text: strip every character that is not an ASCII letter, a digit or
# whitespace, then lower-case the result.
# The previous pattern was [^a-zA-Z0-9\s+]; inside a character class "+" is a literal, so plus
# signs were silently kept even though all special characters were meant to go.
# Apostrophes are removed as well, so contractions collapse (e.g. "I'd" -> "id").
data['reviewText'] = (
    data['reviewText']
    .str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
    .str.lower()
)

In [11]:
# lemmatize() only reads token.lemma_, so the NER and dependency-parser components are disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Build the stop-word lookup once. stopwords.words() rebuilds the full list on every call, and the
# original code called it for each token, so every membership test was a rebuild plus a linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def lemmatize(sentence: str) -> str:
    """Lemmatise ``sentence`` with spaCy and drop English stop words.

    Args:
        sentence: Text to process.

    Returns:
        The lemmas of all tokens whose lower-cased text is not in NLTK's
        English stop-word list, joined by single spaces.
    """
    doc = nlp(sentence)
    result = [token.lemma_ for token in doc if token.text.lower() not in _ENGLISH_STOPWORDS]
    return " ".join(result)

In [None]:
# Run every cleaned review through lemmatize() and store the result in a new column
data['lemmatizedText'] = data['reviewText'].apply(lemmatize)