In [1]:
# Sentiment Analysis on Flipkart Product Reviews

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset and keep only rows usable for training.
file_path = "flipkart_product.csv"

# One chain, no in-place mutation, so re-running this cell always rebuilds
# `df` from the file. The order of steps is deliberate:
#   1. drop rows where the review text or the rating is missing,
#   2. coerce the rating to a number (unparseable values become NaN),
#   3. drop the rows whose rating could not be parsed.
df = (
    pd.read_csv(file_path, encoding='ISO-8859-1')
    .dropna(subset=['Review', 'Rate'])
    .assign(Rate=lambda frame: pd.to_numeric(frame['Rate'], errors='coerce'))
    .dropna(subset=['Rate'])
)

# Label Sentiments based on Rate
def get_sentiment(rating):
    """Map a numeric rating to a coarse sentiment label.

    Args:
        rating: Numeric rating (int or float).

    Returns:
        'negative' when ``rating <= 2``, 'positive' when ``rating > 3``,
        and 'neutral' for everything else.
    """
    if rating <= 2:
        return 'negative'
    if rating > 3:
        return 'positive'
    # Everything left is in (2, 3]. Fractional ratings such as 2.5 used to
    # fall through a catch-all `else` and were labelled 'positive'.
    # NaN also ends up here because every comparison with NaN is False;
    # callers are expected to drop missing ratings before labelling.
    return 'neutral'

# Label every row by passing its numeric rating through get_sentiment.
df['Sentiment'] = df['Rate'].map(get_sentiment)

# Clean review text
def clean_text(text):
    """Normalise a raw review into lowercase, letters-only, single-spaced text.

    Args:
        text: Raw review. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as an empty review.

    Returns:
        The cleaned string, which may be empty.
    """
    # Without this guard, str() would turn missing values into the literal
    # words "none" / "nan", which would then be vectorized as real tokens.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Remove URLs. The "http" alternative already matches "https..." links.
    text = re.sub(r"http\S+|www\S+", '', text)
    # The text is lowercase at this point, so a-z covers every letter to keep.
    text = re.sub(r"[^a-z\s]", '', text)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r"\s+", ' ', text).strip()

df['Clean_Review'] = df['Review'].apply(clean_text)

# Train-test split
X = df['Clean_Review']
y = df['Sentiment']
# NOTE(review): the split is not stratified and exact-duplicate reviews are
# not removed beforehand, so identical texts can end up in both train and
# test. Consider `stratify=y` and de-duplication before trusting the scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
# The vectorizer is fitted on the training split only; the test split is
# transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression Model
model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test_vec)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# Show some predictions
# Slicing + zip stops early when the test split has fewer than five rows,
# whereas indexing with a fixed range(5) would raise IndexError.
for review, actual, predicted in zip(X_test.iloc[:5], y_test.iloc[:5], y_pred[:5]):
    print(f"\nReview: {review}\nActual: {actual}, Predicted: {predicted}")



Classification Report:

              precision    recall  f1-score   support

    negative       1.00      0.86      0.92      5134
     neutral       0.98      0.84      0.91      3172
    positive       0.96      1.00      0.98     29668

    accuracy                           0.97     37974
   macro avg       0.98      0.90      0.94     37974
weighted avg       0.97      0.97      0.97     37974


Confusion Matrix:

[[ 4394     8   732]
 [    4  2678   490]
 [   15    33 29620]]

Review: super
Actual: positive, Predicted: positive

Review: super
Actual: positive, Predicted: positive

Review: excellent
Actual: positive, Predicted: positive

Review: not good
Actual: negative, Predicted: negative

Review: nan
Actual: positive, Predicted: positive


In [3]:
# Persist the fitted artifacts so they can be reloaded for inference later.
# This cell uses the `model` and `vectorizer` objects that already exist in
# the kernel, so it must be run after the training cell.
import joblib

MODEL_PATH = "logistic_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

# Save both objects to their own files.
joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

['tfidf_vectorizer.pkl']