## Load Dataset

In [1]:
import pandas as pd

# Raw string: "\A" is not a valid escape sequence, so the plain literal raises a
# SyntaxWarning on recent Python versions. The raw literal yields the exact same path.
# NOTE(review): "\" is only a directory separator on Windows; elsewhere this names a
# file literally called `dataset\AmazonData.csv` -- confirm the intended layout before
# switching to "dataset/AmazonData.csv".
file_path = r"dataset\AmazonData.csv"
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,Unique_ID,Category,Review_Header,Review_text,Rating,Own_Rating
0,136040,smartTv,Nice one,I liked it,5,Positive
1,134236,mobile,Huge battery life with amazing display,I bought the phone on Amazon and been using my...,5,Positive
2,113945,books,Four Stars,"Awesome book at reasonable price, must buy ......",4,Positive
3,168076,smartTv,Nice quality,good,5,Positive
4,157302,books,Nice book,"The book is fine,not bad,contains nice concept...",3,Neutral


## Relevant Features

In [72]:
# Keep only the review text and its star rating.
# .copy() makes the subset an independent frame, so the column assignments made
# further down do not trigger pandas' SettingWithCopyWarning.
df = df[["Review_text", "Rating"]].copy()
df.head(5)

Unnamed: 0,Review_text,Rating
0,I liked it,5
1,I bought the phone on Amazon and been using my...,5
2,"Awesome book at reasonable price, must buy ......",4
3,good,5
4,"The book is fine,not bad,contains nice concept...",3


### Some EDA steps

In [73]:
## Number of entries: df.shape[0] is the row count of the current frame
print(f"Length of dataset is {df.shape[0]}, entries")

Length of dataset is 60889, entries


In [74]:
## Count of missing values in each column
df.isna().sum()

Unnamed: 0,0
Review_text,32
Rating,0


In [75]:
## Inspect the rows whose review text is missing; there is nothing to work on in
## them, so they are dropped in the following cell.
df.loc[df["Review_text"].isna()]


Unnamed: 0,Review_text,Rating
655,,5
2869,,5
9021,,5
11244,,5
14885,,5
16298,,4
16434,,5
17801,,5
18679,,5
19425,,2


In [76]:
# Drop reviews with no text. Rebinding the result (instead of inplace=True) keeps the
# cell safe to re-run and avoids mutating a frame that was derived from another one.
df = df.dropna(subset=["Review_text"])

In [77]:
## Missing-value count per column after dropping the empty reviews
df.isna().sum()

Unnamed: 0,0
Review_text,0
Rating,0


In [78]:
## Distinct star ratings present in the data, in order of first appearance
pd.unique(df["Rating"])

array([5, 4, 3, 2, 1])

In [79]:
## Class balance: number of reviews per star rating (shows whether the dataset is imbalanced)
df["Rating"].value_counts()

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
5,34439
4,12968
1,6979
3,4364
2,2107


## Preprocessing and Cleaning




**1. Label Encoding:**

   - Positive reviews (Rating >= 3) are assigned a label of `1`.
   - Negative reviews (Rating < 3) are assigned a label of `0`.

In [80]:
## Preprocessing and cleaning: turn the star rating into a binary sentiment label


def _to_sentiment_label(stars):
    """Return 0 (negative) for a rating below 3, otherwise 1 (positive)."""
    if stars < 3:
        return 0
    return 1


df["Rating"] = df["Rating"].apply(_to_sentiment_label)

In [81]:
# Preview the first five rows with the binary label in place
df.head()

Unnamed: 0,Review_text,Rating
0,I liked it,1
1,I bought the phone on Amazon and been using my...,1
2,"Awesome book at reasonable price, must buy ......",1
3,good,1
4,"The book is fine,not bad,contains nice concept...",1


**2. Lowercasing:**

   - Convert all text in the `Review_text` column to lowercase.


In [82]:
## Lowercasing: convert every review in `Review_text` to lowercase
df["Review_text"]=df["Review_text"].str.lower()

In [83]:
import re  # For regular expressions
from bs4 import BeautifulSoup  # For removing HTML tags
from nltk.corpus import stopwords  # For removing stop words

In [84]:
import nltk
# Fetch the NLTK stop-word corpus used by nltk.corpus.stopwords; NLTK reports
# "already up-to-date" and returns True when the package is already installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**3. Removing Special Characters:**

   - Remove all characters except letters (a-z, A-Z), digits (0-9), and spaces; every run of other characters is replaced by a single space.

**4. Removing Stop Words:**

   - Remove common English stop words (e.g., "the", "a", "is", "are") using the `stopwords` library.

**5. Removing URLs:**

   - Remove any URLs present in the text.

**6. Removing HTML Tags:**

   - Remove any HTML tags present in the text using the `BeautifulSoup` library.

**7. Removing Extra Spaces:**

   - Remove any extra spaces between words.



In [85]:
## WARNING: every review is parsed with BeautifulSoup, so this cell can take a while
## on the full dataset.

import re

## Build the stop-word lookup once. Evaluating stopwords.words("english") for every
## single token repeats the lookup needlessly and tests membership against a list.
ENGLISH_STOP_WORDS = set(stopwords.words("english"))

## Matches http / https / ftp / ssh URLs.
URL_PATTERN = re.compile(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/`+#-]*[\w@?^=%&/~+#-])?')

## Any run of characters that are not letters, digits or spaces.
## The class uses A-Z on purpose: the range A-z would also admit [ \ ] ^ _ and `.
NON_ALPHANUMERIC_PATTERN = re.compile(r'[^a-zA-Z0-9 ]+')


def clean_review_text(text):
    """Return a cleaned copy of one (already lowercased) review string.

    Steps, in the order they must run:
      1. Strip HTML tags   -- needs '<' and '>' to still be present.
      2. Strip URLs        -- needs '://' and '.' to still be present.
      3. Replace every remaining special character with a space.
      4. Drop English stop words.
      5. Collapse repeated whitespace (done by the split/join in step 4).

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned text; may be an empty string if nothing survives cleaning.
    """
    # separator=" " keeps words on either side of a removed tag from being glued together
    text = BeautifulSoup(text, 'lxml').get_text(separator=" ")
    text = URL_PATTERN.sub('', text)
    text = NON_ALPHANUMERIC_PATTERN.sub(' ', text)
    kept_words = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return " ".join(kept_words)


## One pass over the column instead of five, so no intermediate Series are created.
df["Review_text"] = df["Review_text"].apply(clean_review_text)

  df["Review_text"] = df["Review_text"].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())


In [86]:
# Preview the first five cleaned reviews
df.head(5)

Unnamed: 0,Review_text,Rating
0,liked,1
1,bought phone amazon using samsung m30s couple ...,1
2,awesome book reasonable price must buy,1
3,good,1
4,book fine bad contains nice concepts nicely ex...,1


In [87]:
## Null-Review: null count per column after cleaning
df.isna().sum()

Unnamed: 0,0
Review_text,0
Rating,0


In [88]:
## Export the processed df as csv (index=False: the row index is not written)
# Raw string: "\p" is not a valid escape sequence, so the plain literal raises a
# SyntaxWarning on recent Python versions. The raw literal yields the exact same path.
# NOTE(review): "\" is only a directory separator on Windows -- confirm the target
# platform before changing the separator; the path must stay in sync with the cell that reads it back.
df.to_csv(r"dataset\processed_reviews.csv", index=False)

In [None]:
## Loading the processed_reviews
# Raw string: "\p" is not a valid escape sequence, so the plain literal raises a
# SyntaxWarning on recent Python versions. The raw literal yields the exact same path,
# which must match the path used by the export cell.
df = pd.read_csv(r"dataset\processed_reviews.csv")
df.head(5)

Unnamed: 0,Review_text,Rating
0,liked,1
1,bought phone amazon using samsung m30s couple ...,1
2,awesome book reasonable price must buy,1
3,good,1
4,book fine bad contains nice concepts nicely ex...,1


In [None]:
## Reviews can be reduced to an empty string by the cleaning step (e.g. text made up
## only of stop words or special characters). Such fields are written to the CSV as
## empty and read_csv loads empty fields as NaN by default -- most likely the source
## of the "random" empty rows seen after reloading, so drop them here.
df = df.dropna(subset=["Review_text"])
df.isnull().sum()

Review_text    0
Rating         0
dtype: int64