In [1]:
import nltk

In [2]:
import pandas as pd
import numpy as np

# Reading the Dataset

In [3]:
# Load the raw product reviews (path is relative to the notebook's working directory).
data_frame = pd.read_csv("Dataset/AllProductReviews-change.csv")

In [4]:
# Preview the first five rows of the raw data.
data_frame.head(n=5)

Unnamed: 0,ID,ReviewTitle,ReviewBody,ReviewStar,Product
0,1,Honest review of an edm music lover\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,2,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,3,Really good and durable.\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,4,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,5,Just Awesome Wireless Headphone under 1000...😉\n,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255


In [5]:
# Build one text field per review from its title and body.
# - Missing parts are treated as empty strings, so a NaN title (or body) no
#   longer turns the whole review into NaN.
# - An explicit space separator keeps the last title word from fusing with the
#   first body word when the title has no trailing whitespace.
# - The result is stripped so a review with neither title nor body ends up as
#   '' and is caught by the empty-review filter.
data_frame['review'] = (
    data_frame['ReviewTitle'].fillna('')
    .str.cat(data_frame['ReviewBody'].fillna(''), sep=' ')
    .str.strip()
)

In [6]:
# The title and body now live in 'review'; drop the two source columns.
data_frame = data_frame.drop(columns=["ReviewTitle", "ReviewBody"])

In [7]:
# Remove the reviews that contain no text.
# A plain `!= ''` comparison keeps NaN and whitespace-only reviews, which would
# later break tokenization (NaN) or yield empty documents, so missing values
# are mapped to '' and whitespace is stripped before testing for emptiness.
# `.copy()` makes the result an independent frame so the column assignments in
# the following cells do not trigger SettingWithCopyWarning.
data_frame = data_frame[data_frame['review'].fillna('').str.strip().ne('')].copy()
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14337 entries, 0 to 14336
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          14337 non-null  int64 
 1   ReviewStar  14337 non-null  int64 
 2   Product     14337 non-null  object
 3   review      14337 non-null  object
dtypes: int64(2), object(2)
memory usage: 560.0+ KB


# Data Preprocessing

In [8]:
# Normalise case: convert every review to lowercase.
data_frame["review"] = data_frame.review.str.lower()

In [9]:
# Binary label stored as a string: '1' when the star rating is above 3, otherwise '0'.
data_frame['rating'] = data_frame['ReviewStar'].gt(3).map({True: '1', False: '0'})

In [10]:
# The binary 'rating' column replaces the original star rating.
data_frame = data_frame.drop(columns=["ReviewStar"])

In [11]:
# Class balance of the binary label.
data_frame.loc[:, 'rating'].value_counts()

1    9402
0    4935
Name: rating, dtype: int64

In [12]:
# Count missing values per column.
data_frame.isna().sum()

ID         0
Product    0
review     0
rating     0
dtype: int64

In [13]:
def identify_tokens(row):
    """Tokenize one review string and keep only purely alphabetic tokens.

    The text is split with ``nltk.word_tokenize``; any token for which
    ``str.isalpha()`` is false (punctuation, numbers, mixed tokens such as
    ``8-9``) is discarded.

    Args:
        row: The review text to tokenize.

    Returns:
        A list of the alphabetic tokens, in their original order.
    """
    alphabetic_tokens = []
    for token in nltk.word_tokenize(row):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [14]:
# Tokenize every review: each string becomes a list of alphabetic tokens.
data_frame['review'] = data_frame['review'].map(identify_tokens)

In [15]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; stem_list() looks it up as a global.
lemmatizer = WordNetLemmatizer()

In [16]:
def stem_list(row):
    """Lemmatize every word in a list of tokens.

    Despite the name, no stemming happens here: each word is passed to the
    global ``lemmatizer``'s ``lemmatize`` method with its default arguments.

    Args:
        row: An iterable of word tokens.

    Returns:
        A new list with the lemmatized form of each token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, row))

In [17]:
# Lemmatize the token list of every review (stem_list applies the WordNet lemmatizer).
data_frame['review'] = data_frame['review'].map(stem_list)

In [18]:
from nltk.corpus import stopwords
# English stop words, minus "not", "but" and "no", which are kept in the text
# (presumably because negation/contrast words matter for sentiment).
# A set difference is used instead of three .remove() calls so the cell does
# not raise KeyError if a word is absent from the installed stop-word corpus.
stops = set(stopwords.words("english")) - {"not", "but", "no"}

In [19]:
def remove_stops(row):
    """Drop stop words from a list of tokens.

    Args:
        row: An iterable of word tokens.

    Returns:
        A list of the tokens that are not members of the global ``stops`` set,
        in their original order.
    """
    return list(filter(lambda word: word not in stops, row))

In [20]:
# Filter stop words out of every review's token list.
data_frame['review'] = data_frame['review'].map(remove_stops)

In [21]:
def rejoin_words(row):
    """Join processed tokens back into one space-separated string.

    Args:
        row: An iterable of string tokens.

    Returns:
        The tokens joined by single spaces ('' for an empty iterable).
    """
    separator = " "
    return separator.join(row)

In [22]:
# Turn each token list back into a single string for the vectorizer.
data_frame['review'] = data_frame['review'].map(rejoin_words)

In [23]:
# Preview the first five rows after preprocessing.
data_frame.head(n=5)

Unnamed: 0,ID,Product,review,rating
0,1,boAt Rockerz 255,honest review edm music lover no doubt ha grea...,0
1,2,boAt Rockerz 255,unreliable earphone high cost earphone unrelia...,0
2,3,boAt Rockerz 255,really good durable bought itfor purchased sec...,1
3,4,boAt Rockerz 255,stopped working day sound quality adorable ove...,0
4,5,boAt Rockerz 255,awesome wireless headphone awesome good sound ...,1


In [24]:
# Features and labels as NumPy arrays.
# Columns are selected by name rather than by position so that adding,
# removing or reordering columns cannot silently swap in the wrong data.
X = data_frame['review'].values   # preprocessed review text
y = data_frame['rating'].values   # binary label, '1' or '0'

# Splitting the dataset for training and testing

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# X holds the review text and y the rating labels; 25% is held out for testing
# and the split is made reproducible with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [27]:
# Sizes of the train and test partitions.
tuple(part.shape for part in (X_train, X_test))

((10752,), (3585,))

In [28]:
from collections import Counter
# Label distribution in the train and test partitions.
tuple(Counter(labels) for labels in (y_train, y_test))

(Counter({'0': 3687, '1': 7065}), Counter({'0': 1248, '1': 2337}))

# Training the models

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [42]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
# NOTE(review): the step name 'tfifd' looks like a typo of 'tfidf'; it is kept
# as-is because step names are part of the pipeline's parameter keys.
text_model_nb = Pipeline(steps=[
    ('tfifd', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

In [35]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [43]:
# Fit the vectorizer and the Naive Bayes model on the training split.
text_model_nb.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfifd',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [44]:
# TF-IDF features feeding a support vector classifier with default settings.
# NOTE(review): the step name 'tfifd' looks like a typo of 'tfidf'; it is kept
# as-is because step names are part of the pipeline's parameter keys.
text_model_svc = Pipeline(steps=[
    ('tfifd', TfidfVectorizer()),
    ('model', SVC()),
])

In [45]:
# Fit the vectorizer and the SVC on the training split.
text_model_svc.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfifd',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', d

# Evaluating

# 1. Naive Bayes Classifier

In [37]:
# Predict on the held-out split and print per-class precision/recall/F1.
# NOTE(review): this cell's execution count (In [37]) is lower than that of the
# cells that define and fit text_model_nb (In [42]/[43]), so the saved output
# may come from an earlier model object -- re-run top to bottom to confirm.
predictions_nb = text_model_nb.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=predictions_nb))

              precision    recall  f1-score   support

           0       0.92      0.58      0.71      1248
           1       0.81      0.97      0.89      2337

    accuracy                           0.84      3585
   macro avg       0.86      0.78      0.80      3585
weighted avg       0.85      0.84      0.83      3585



In [38]:
# Confusion matrix for Naive Bayes: rows are true labels, columns are predictions.
print(confusion_matrix(y_true=y_test, y_pred=predictions_nb))

[[ 728  520]
 [  67 2270]]


In [39]:
# Overall accuracy of the Naive Bayes pipeline on the test split.
print(accuracy_score(y_true=y_test, y_pred=predictions_nb))

0.8362622036262204


# 2. SVC (Support Vector Classifier)

In [47]:
# Predict on the held-out split with the SVC pipeline and print per-class metrics.
predictions_svc = text_model_svc.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=predictions_svc))

              precision    recall  f1-score   support

           0       0.88      0.82      0.84      1248
           1       0.91      0.94      0.92      2337

    accuracy                           0.90      3585
   macro avg       0.89      0.88      0.88      3585
weighted avg       0.89      0.90      0.89      3585



In [48]:
# Confusion matrix for the SVC: rows are true labels, columns are predictions.
print(confusion_matrix(y_true=y_test, y_pred=predictions_svc))

[[1018  230]
 [ 144 2193]]


In [49]:
# Overall accuracy of the SVC pipeline on the test split.
print(accuracy_score(y_true=y_test, y_pred=predictions_svc))

0.8956764295676429
