# **SENTIMENT ANALYSIS USING TEXT**

**IMPORTING THE LIBRARY**

In [None]:
import pandas as pd
import numpy as np

### **LOAD THE DATA**

In [2]:
df = pd.read_csv('Data/reviews_badminton/data.csv')

In [3]:
df.shape

(8518, 8)

In [4]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [5]:
df = df[['Review Title','Review text','Ratings']]

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.shape

(7020, 3)

In [8]:
df.dropna(inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7014 entries, 0 to 8507
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Review Title  7014 non-null   object
 1   Review text   7014 non-null   object
 2   Ratings       7014 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 219.2+ KB


In [10]:
df['Ratings'].value_counts()

Ratings
5    3978
4    1402
1     759
3     572
2     303
Name: count, dtype: int64

In [11]:
df['Ratings'] = df['Ratings'].apply(lambda x: "Positive" if x >= 4 else 'Negative')

In [12]:
df['Ratings'].value_counts()

Ratings
Positive    5380
Negative    1634
Name: count, dtype: int64

In [13]:
df['Review'] = df['Review Title'] +' '+ df['Review text']

In [14]:
df['Review'][0]

'Nice product Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE'

In [15]:
df['Review'] = df['Review'].str[:-9]

In [16]:
df = df[['Review','Ratings']]

In [17]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    text = text.lower()

    text = re.sub(r'[^A-Za-z]',' ', text)

    token = word_tokenize(text)

    words = [i for i in token if i not in stopwords.words("english")]

    processed = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(processed)

In [21]:
tempdf = df['Review'].apply(preprocessing)

In [23]:
df= pd.concat([df,tempdf],axis=1)

In [26]:
df.columns = ['Review','Ratings','Reviews']

In [27]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df['Reviews'],df['Ratings'], test_size=0.4 , random_state=42)

In [28]:
ytrain.value_counts()

Ratings
Positive    3229
Negative     979
Name: count, dtype: int64

In [29]:
ytest.value_counts()

Ratings
Positive    2151
Negative     655
Name: count, dtype: int64

In [43]:
xtrain

2694    expected better product durable broken easily ...
7647                  brilliant nice product lowest price
7656                 must buy nice gud product damage gud
7219                     best market good genuine product
7181                                 wow original product
                              ...                        
4616                                good nice one durable
6325    worst experience flipkart thing shuttle genuin...
6368               decent product time delivery fast also
6555               highly recommended go original product
968     good poor quality shuttle dameged soon think o...
Name: Reviews, Length: 4208, dtype: object

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

xtrain_tran = tfidf_vectorizer.fit_transform(xtrain)

In [51]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='minority', random_state=42)
# Fit the model to generate the data.
oversampled_X, oversampled_Y = sm.fit_resample(xtrain_tran, ytrain)

In [65]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()

In [71]:
classifier.fit(oversampled_X, oversampled_Y)

In [72]:
xtesttran = tfidf_vectorizer.transform(xtest)


In [73]:
ypred = classifier.predict(xtesttran)

In [75]:
from sklearn.metrics import classification_report

print(classification_report(ypred,ytest))

              precision    recall  f1-score   support

    Negative       0.70      0.56      0.62       818
    Positive       0.83      0.90      0.87      1988

    accuracy                           0.80      2806
   macro avg       0.77      0.73      0.74      2806
weighted avg       0.79      0.80      0.79      2806



In [78]:
classifier2 =  LogisticRegression()
tfidf_vectorizer2 = TfidfVectorizer()

In [79]:
xtraintran2 = tfidf_vectorizer2.fit_transform(xtrain)

In [81]:
classifier2.fit(xtraintran2,ytrain)

In [82]:
xtesttran2 = tfidf_vectorizer2.transform(xtest)
ypred2 = classifier.predict(xtesttran2)

In [83]:
print(classification_report(ypred2,ytest))

              precision    recall  f1-score   support

    Negative       0.70      0.56      0.62       818
    Positive       0.83      0.90      0.87      1988

    accuracy                           0.80      2806
   macro avg       0.77      0.73      0.74      2806
weighted avg       0.79      0.80      0.79      2806



In [89]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(ypred2,ytest))

[[ 458  360]
 [ 197 1791]]


In [91]:
print(confusion_matrix(ypred,ytest))

[[ 458  360]
 [ 197 1791]]


In [92]:
!pip install joblib



In [93]:
import joblib

In [94]:
joblib.dump(classifier,'Web app/Models/Textclassifier_logreg.pkl')

['Textclassifier_logreg.pkl']

In [95]:
joblib.dump(tfidf_vectorizer, 'Web app/Models/Tfidf.pkl')

['Tfidf.pkl']