# Sentiment Analysis of Flipkart Reviews

In [2]:
import pandas as pd
import numpy as np
df=pd.read_csv(r"C:\coding\Innomatics intern\task7\data\reviews_badminton\data.csv",encoding="latin1")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust Ã¢?Â¹620 ..from retailer.I did...,1


### Basic EDA

In [3]:
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [5]:
df=df.dropna(subset=['Review text'])

In [6]:
df.isnull().sum()

Reviewer Name        2
Review Title         2
Place of Review     42
Up Votes             2
Down Votes           2
Month              457
Review text          0
Ratings              0
dtype: int64

In [7]:
df["Ratings"].value_counts()

5    5079
4    1744
1     766
3     615
2     306
Name: Ratings, dtype: int64

In [8]:
df.loc[0,'Review text']

'Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE'

In [9]:
df.loc[1,'Review text']

"They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a cheapest....  Sad to hear this.READ MORE"

In [10]:
df['sentiment'] = df['Ratings'].apply(lambda x:'Positive' if x>=3 else 'Negative') 

# Let us take only the two columns we need
df = df[["Review text", 'sentiment']]
df.shape

(8510, 2)

In [11]:
df.head(10)

Unnamed: 0,Review text,sentiment
0,"Nice product, good quality, but price is now r...",Positive
1,They didn't supplied Yonex Mavis 350. Outside ...,Negative
2,Worst product. Damaged shuttlecocks packed in ...,Negative
3,"Quite O. K. , but nowadays the quality of the...",Positive
4,Over pricedJust Ã¢?Â¹620 ..from retailer.I did...,Negative
5,Good quality product. Delivered on time.READ MORE,Positive
6,BEST PURCHASE It is a good quality and is more...,Positive
7,Good quality original shuttles.READ MORE,Positive
8,AwesomeREAD MORE,Positive
9,nice original productsREAD MORE,Positive


### Identifying Input and Output

In [12]:
X=df['Review text']
y=df['sentiment']
print(X.shape,y.shape)

(8510,) (8510,)


### Splitting the Data into Train and Test

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(6382,) (6382,)
(2128,) (2128,)


### Data Preprocessing 

In [14]:
import string

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import re
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iamgu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iamgu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
from tqdm import tqdm, tqdm_notebook

In [16]:
tqdm.pandas()

In [17]:
def preprocess_text(text):

    text = str(text)

    # Remove 'READ MORE' if found
    text = text.replace('READ MORE', '')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r':\)|:\(|:\D|:\S', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_text = [word for word in words if word not in stop_words]
    filtered_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    return pd.Series([" ".join(filtered_text), len(filtered_text)])

In [22]:
temp_df = X_train.progress_apply(lambda x: preprocess_text(x))

temp_df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████| 6382/6382 [00:07<00:00, 873.04it/s]


Unnamed: 0,0,1
1095,nice ð,2
7326,good dry weather,3
1377,well brand,2
2127,firstly yonex best company badminton shuttle b...,9
5096,good,1


In [23]:
temp_df.columns = ['clean_text_lemma', 'text_length_lemma']

In [24]:

X_train = pd.concat([X_train, temp_df], axis=1)

X_train.head()

Unnamed: 0,Review text,clean_text_lemma,text_length_lemma
1095,Very nice ðREAD MORE,nice ð,2
7326,Very good on dry weatherREAD MORE,good dry weather,3
1377,Well brandREAD MORE,well brand,2
2127,Firstly Yonex is best company for badminton ...,firstly yonex best company badminton shuttle b...,9
5096,GoodREAD MORE,good,1


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

vocab = CountVectorizer()

X_train_bow = vocab.fit_transform(X_train['clean_text_lemma'])

In [26]:
X_train_bow

<6382x2763 sparse matrix of type '<class 'numpy.int64'>'
	with 21362 stored elements in Compressed Sparse Row format>

In [27]:
temp_df = X_test.progress_apply(lambda x: preprocess_text(x))

temp_df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2128/2128 [00:02<00:00, 964.25it/s]


Unnamed: 0,0,1
3733,awesome,1
3491,good,1
2532,nice,1
4429,nice product ðð better price,5
2698,genuine product,2


In [28]:
temp_df.columns = ['clean_text_lemma', 'text_length_lemma']

temp_df.head()


Unnamed: 0,clean_text_lemma,text_length_lemma
3733,awesome,1
3491,good,1
2532,nice,1
4429,nice product ðð better price,5
2698,genuine product,2


In [29]:
X_test = pd.concat([X_test, temp_df], axis=1)

X_test.head()

Unnamed: 0,Review text,clean_text_lemma,text_length_lemma
3733,Awesome....READ MORE,awesome,1
3491,goodREAD MORE,good,1
2532,NiceREAD MORE,nice,1
4429,very nice product ðð at a better priceRE...,nice product ðð better price,5
2698,Genuine productREAD MORE,genuine product,2


In [30]:
X_test_bow = vocab.transform(X_test['clean_text_lemma'])

In [31]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

Total unique words: 2763
Type of train features: <class 'scipy.sparse.csr.csr_matrix'>
Shape of input data: (6382, 2763)


In [32]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_test_bow))

print("Shape of input data:", X_test_bow.shape)

Total unique words: 2763
Type of train features: <class 'scipy.sparse.csr.csr_matrix'>
Shape of input data: (2128, 2763)


### Building a Model (i.e. Train the classifier)

In [33]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_bow, y_train)

Wall time: 38.6 ms


MultinomialNB()

### Evaluating on Train Data

In [34]:
from sklearn import metrics
# make class predictions for X_train_bow
y_train_pred = nb.predict(X_train_bow)

print(metrics.accuracy_score(y_train, y_train_pred))

0.9359135067376998


In [35]:
import warnings

warnings.filterwarnings('ignore')

### Implementing various Algorithms to find the Best Model

In [36]:
import joblib
from joblib import Memory

import os

In [37]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

In [38]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'decision_tree': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ], memory=memory),
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ], memory=memory),
    
}

param_grids = {
    'logistic_regression': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ],
    'decision_tree': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'naive_bayes': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__alpha' : [1, 10]
        }
    ]
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='f1', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(X_train['clean_text_lemma'], y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    y_pred=grid_search.best_estimator_.predict(X_test['clean_text_lemma'])
    
    f1=metrics.f1_score(y_test,y_pred,pos_label='Positive')
    print('F1 Score on test data:',f1)

********** logistic_regression **********
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Wall time: 12min 47s
F1 Score on test data: 0.9288864388092613
********** decision_tree **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Wall time: 23.6 s
F1 Score on test data: 0.9378980891719746
********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Wall time: 12.4 s
F1 Score on test data: 0.9510970129526831


In [39]:
best_model = grid_search.best_estimator_

# You can then use best_model directly
print(best_model)

# Save the best model
joblib.dump(best_model, 'best_model.pkl')

# Load the best model
loaded_model = joblib.load('best_model.pkl')

Pipeline(memory=Memory(location=.cache\joblib),
         steps=[('vectorization', CountVectorizer(max_features=1000)),
                ('classifier', MultinomialNB(alpha=1))])


In [40]:
#According to the results,we see that Naive Bayes is the best model for the data along with the specified hyperparamters.

### Final Output

In [98]:
new_data = [
    "worst product ever."
]

new_data_clean = [preprocess_text(text) for text in new_data]

prediction = loaded_model.predict(new_data)

print("Prediction:", prediction)

Prediction: ['Negative']


In [104]:
new_data = [
    "The product is really good."
]

new_data_clean = [preprocess_text(text) for text in new_data]

prediction = loaded_model.predict(new_data)

print("Prediction:", prediction)

Prediction: ['Positive']
