# Performing sentiment analysis on customer reviews using TF IDF vectorisation and logistic regression


In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix ,classification_report


In [2]:
df = pd.read_csv("iphone.csv")
df.head()

Unnamed: 0,productAsin,country,date,isVerified,ratingScore,reviewTitle,reviewDescription,reviewUrl,reviewedIn,variant,variantAsin
0,B09G9BL5CP,India,11-08-2024,True,4,No charger,"Every thing is good about iPhones, there's not...",https://www.amazon.in/gp/customer-reviews/R345...,Reviewed in India on 11 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
1,B09G9BL5CP,India,16-08-2024,True,5,iPhone 13 256GB,"It look so fabulous, I am android user switche...",https://www.amazon.in/gp/customer-reviews/R2HJ...,Reviewed in India on 16 August 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
2,B09G9BL5CP,India,14-05-2024,True,4,Flip camera option nill,I tried to flip camera while recording but no ...,https://www.amazon.in/gp/customer-reviews/R3Y7...,Reviewed in India on 14 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
3,B09G9BL5CP,India,24-06-2024,True,5,Product,100% genuine,https://www.amazon.in/gp/customer-reviews/R1P9...,Reviewed in India on 24 June 2024,Colour: MidnightSize: 256 GB,B09G9BQS98
4,B09G9BL5CP,India,18-05-2024,True,5,Good product,Happy to get the iPhone 13 in Amazon offer,https://www.amazon.in/gp/customer-reviews/R1XI...,Reviewed in India on 18 May 2024,Colour: MidnightSize: 256 GB,B09G9BQS98


In [42]:
# DATA PREPROCESSING

df.dropna(subset=['reviewDescription' , 'ratingScore'], inplace= True)

x=df['reviewDescription']
y=df['ratingScore']

In [43]:
#SPLIT THE DATA INTO TRAINING AND TESTING SETS

x_test,x_train,y_test,y_train= train_test_split(x,y, test_size=0.2, random_state=42)

In [44]:
#APPLY TF IDF VECTORISATION 

tfidf_vectorizer= TfidfVectorizer(stop_words='english',max_features=50000)


In [45]:
x_train_tfidf=tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf=tfidf_vectorizer.transform(x_test)

In [46]:
#TRAIN THE LOGISTIC REGRESSION MODEL

model = LogisticRegression()
model.fit(x_train_tfidf , y_train)

In [47]:
#MAKE PREDICTIONS ON THE TEST DATA

y_pred=model.predict(x_test_tfidf)

In [48]:
#EVALUATE THE MODEL'S PERFORMANCE
accuracy=accuracy_score(y_test,y_pred)
print(f'Accuracy:{accuracy*100:.2f}%')
print(classification_report(y_test,y_pred))


Accuracy:57.65%
              precision    recall  f1-score   support

           1       0.75      0.31      0.44       465
           2       1.00      0.01      0.01       144
           3       1.00      0.01      0.01       191
           4       0.42      0.03      0.05       346
           5       0.56      0.99      0.72      1234

    accuracy                           0.58      2380
   macro avg       0.75      0.27      0.25      2380
weighted avg       0.64      0.58      0.47      2380

