In [1]:
import pandas as pd

df = pd.read_csv("C:/Users/Riya/Downloads/FlipkartML/DataSet/reviews_data_dump/reviews_badminton/data.csv")
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [2]:
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.columns

Index(['reviewer_name', 'review_title', 'place_of_review', 'up_votes',
       'down_votes', 'month', 'review_text', 'ratings'],
      dtype='object')

In [3]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = str(text).lower()                     # lowercase
    text = re.sub(r"[^a-zA-Z ]", "", text)       # remove special chars & numbers
    words = text.split()                         # tokenize
    words = [w for w in words if w not in stop_words]  # remove stopwords
    return " ".join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Riya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df["clean_review"] = df["review_text"].apply(clean_text)
df[["review_text", "clean_review"]].head()

Unnamed: 0,review_text,clean_review
0,"Nice product, good quality, but price is now r...",nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,didnt supplied yonex mavis outside cover yonex...
2,Worst product. Damaged shuttlecocks packed in ...,worst product damaged shuttlecocks packed new ...
3,"Quite O. K. , but nowadays the quality of the...",quite k nowadays quality corks like years back...
4,Over pricedJust â?¹620 ..from retailer.I didn'...,pricedjust retaileri didnt understand wat adva...


In [5]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 3.4 MB/s eta 0:00:04
     --- ------------------------------------ 1.0/12.8 MB 3.4 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/12.8 MB 2.9 MB/s eta 0:00:04
     ------- -------------------------------- 2.4/12.8 MB 3.1 MB/s eta 0:00:04
     --------- ------------------------------ 3.1/12.8 MB 3.3 MB/s eta 0:00:03
     ----------- ---------------------------- 3.7/12.8 MB 3.0 MB/s eta 0:00:04
     ------------- -------------------------- 4.5/12.8 MB 3.1 MB/s eta 0:00:03
     ----------------- ---------------------- 5.5/12.8 MB 3.4 MB/s eta 0:00:03
     --------------------- ------------------ 6.8/12.8 MB 3.7 MB/s eta 0:00:02
     ------------------------- ----------

In [7]:
import spacy

In [8]:
nlp = spacy.load("en_core_web_sm") # type: ignore

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)

df["normalized_review"] = df["clean_review"].apply(lemmatize_text)
df[["clean_review", "normalized_review"]].head()

Unnamed: 0,clean_review,normalized_review
0,nice product good quality price rising bad sig...,nice product good quality price rise bad sign ...
1,didnt supplied yonex mavis outside cover yonex...,do not supply yonex mavis outside cover yonex ...
2,worst product damaged shuttlecocks packed new ...,bad product damage shuttlecock pack new box or...
3,quite k nowadays quality corks like years back...,quite k nowadays quality cork like year back u...
4,pricedjust retaileri didnt understand wat adva...,pricedjust retaileri do not understand wat adv...


In [9]:
def create_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating <= 2:
        return "negative"
    else:
        return "neutral"

df["sentiment"] = df["ratings"].apply(create_sentiment)

In [10]:
df = df[df["sentiment"] != "neutral"]
df["sentiment"].value_counts()

sentiment
positive    6826
negative    1077
Name: count, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7903 entries, 0 to 8517
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   reviewer_name      7893 non-null   object 
 1   review_title       7893 non-null   object 
 2   place_of_review    7857 non-null   object 
 3   up_votes           7893 non-null   float64
 4   down_votes         7893 non-null   float64
 5   month              7474 non-null   object 
 6   review_text        7895 non-null   object 
 7   ratings            7903 non-null   int64  
 8   clean_review       7903 non-null   object 
 9   normalized_review  7903 non-null   object 
 10  sentiment          7903 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 740.9+ KB


In [12]:
df = df.dropna(subset=[
    "review_text",
    "clean_review",
    "normalized_review",
    "sentiment"
])

In [13]:
df = df.dropna()

In [14]:
df

Unnamed: 0,reviewer_name,review_title,place_of_review,up_votes,down_votes,month,review_text,ratings,clean_review,normalized_review,sentiment
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4,nice product good quality price rising bad sig...,nice product good quality price rise bad sign ...,positive
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1,didnt supplied yonex mavis outside cover yonex...,do not supply yonex mavis outside cover yonex ...,negative
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1,worst product damaged shuttlecocks packed new ...,bad product damage shuttlecock pack new box or...,negative
5,Baji Sankar,Mind-blowing purchase,"Certified Buyer, Hyderabad",173.0,45.0,Oct 2018,Good quality product. Delivered on time.READ MORE,5,good quality product delivered timeread,good quality product deliver timeread,positive
6,Flipkart Customer,Must buy!,"Certified Buyer, Doom Dooma",403.0,121.0,Jan 2020,BEST PURCHASE It is a good quality and is more...,5,best purchase good quality durable average shu...,good purchase good quality durable average shu...,positive
...,...,...,...,...,...,...,...,...,...,...,...
8495,vishal kumar ashish,Nice,"Certified Buyer, Haridwar",0.0,0.0,Oct 2016,Thanks to the delivery boy ... Service is alwa...,5,thanks delivery boy service always awesomeread,thank delivery boy service always awesomeread,positive
8496,Nitya Nand Rai,Good choice,"Certified Buyer, Raebareli",0.0,0.0,Oct 2016,Over priced even after 50% discount price is m...,1,priced even discount price much higher usual m...,price even discount price much high usual misl...,negative
8497,Tarun Reddy,Awesome,Certified Buyer,0.0,0.0,Oct 2016,Too much priced. It was getting me for Rs. 640...,1,much priced getting rs home town advantage pur...,much price get rs home town advantage purchasi...,negative
8499,Jayachandra,High cost,"Certified Buyer, Mangalore",0.0,0.0,Dec 2015,Hii flipkart customers care..why your delivery...,5,hii flipkart customers carewhy delivery boys b...,hii flipkart customer carewhy delivery boy bri...,positive


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7438 entries, 0 to 8507
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   reviewer_name      7438 non-null   object 
 1   review_title       7438 non-null   object 
 2   place_of_review    7438 non-null   object 
 3   up_votes           7438 non-null   float64
 4   down_votes         7438 non-null   float64
 5   month              7438 non-null   object 
 6   review_text        7438 non-null   object 
 7   ratings            7438 non-null   int64  
 8   clean_review       7438 non-null   object 
 9   normalized_review  7438 non-null   object 
 10  sentiment          7438 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 697.3+ KB


#### NUMERICAL FEATURE EXTRACTION

#### BAG OF WORDS (BoW)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer # type: ignore

bow_vectorizer = CountVectorizer(max_features=5000)

X_bow = bow_vectorizer.fit_transform(df["normalized_review"])

X_bow.shape

(7438, 3077)

#### TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore

tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_tfidf = tfidf_vectorizer.fit_transform(df["normalized_review"])

X_tfidf.shape

(7438, 5000)

#### WORD2VEC (Semantic Meaning)

In [18]:
from gensim.models import Word2Vec # type: ignore

sentences = df["normalized_review"].apply(lambda x: x.split())

w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

In [19]:
import numpy as np

def review_vector(review):
    words = review.split()
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_w2v = np.array(df["normalized_review"].apply(review_vector).tolist())

X_w2v.shape

(7438, 100)

### MODELING APPROACH

##### Train First Model (Logistic Regression)

In [20]:
X = X_tfidf        # from your TF-IDF step
y = df["sentiment"]

##### Train-Test Split

In [21]:
from sklearn.model_selection import train_test_split # type: ignore

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

##### Train Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression # type: ignore

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

##### Predictions

In [23]:
y_pred_lr = lr_model.predict(X_test)

#### MODEL EVALUATION (F1-SCORE)

In [24]:
from sklearn.metrics import classification_report, f1_score # type: ignore

print(classification_report(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr, pos_label="positive"))

              precision    recall  f1-score   support

    negative       0.88      0.47      0.61       195
    positive       0.92      0.99      0.96      1293

    accuracy                           0.92      1488
   macro avg       0.90      0.73      0.78      1488
weighted avg       0.92      0.92      0.91      1488

F1 Score: 0.9562943593574897


#### Train Naive Bayes

In [25]:
%pip install scikit-learn

from sklearn.naive_bayes import MultinomialNB # type: ignore

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)

print(classification_report(y_test, y_pred_nb))
print("F1 Score:", f1_score(y_test, y_pred_nb, pos_label="positive"))

Note: you may need to restart the kernel to use updated packages.
              precision    recall  f1-score   support

    negative       0.88      0.38      0.54       195
    positive       0.91      0.99      0.95      1293

    accuracy                           0.91      1488
   macro avg       0.90      0.69      0.74      1488
weighted avg       0.91      0.91      0.90      1488

F1 Score: 0.951780415430267


#### Train SVM (High Accuracy)

In [26]:
from sklearn.svm import LinearSVC # type: ignore

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

print(classification_report(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm, pos_label="positive"))

              precision    recall  f1-score   support

    negative       0.83      0.62      0.71       195
    positive       0.94      0.98      0.96      1293

    accuracy                           0.93      1488
   macro avg       0.89      0.80      0.84      1488
weighted avg       0.93      0.93      0.93      1488

F1 Score: 0.9624288425047438




##### Compare Models

In [27]:
model_scores = {
    "Logistic Regression": f1_score(y_test, y_pred_lr, pos_label="positive"),
    "Naive Bayes": f1_score(y_test, y_pred_nb, pos_label="positive"),
    "SVM": f1_score(y_test, y_pred_svm, pos_label="positive")
}

model_scores

{'Logistic Regression': 0.9562943593574897,
 'Naive Bayes': 0.951780415430267,
 'SVM': 0.9624288425047438}

In [28]:
import joblib

# Save trained SVM model
joblib.dump(svm_model, "../models/sentiment_model.pkl")

# Save TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, "../models/tfidf_vectorizer.pkl")

['../models/tfidf_vectorizer.pkl']

In [29]:
# Save trained SVM model
joblib.dump(svm_model, "../models/sentiment_model.pkl")

['../models/sentiment_model.pkl']

In [30]:
import joblib

model = joblib.load("../models/sentiment_model.pkl")
vectorizer = joblib.load("../models/tfidf_vectorizer.pkl")

In [31]:
%pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [32]:
import mlflow # type: ignore

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Flipkart_Sentiment_Analysis")



2026/02/12 16:27:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/12 16:27:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='file:c:/Users/Riya/Downloads/FlipkartML/NoteBook/mlruns/1', creation_time=1770722122071, experiment_id='1', last_update_time=1770722122071, lifecycle_stage='active', name='Flipkart_Sentiment_Analysis', tags={}>

In [33]:
with mlflow.start_run(run_name="test_run"):
    mlflow.log_param("project", "Flipkart Sentiment Analysis")
    mlflow.log_metric("accuracy", 0.88)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



In [34]:
mlflow.get_experiment_by_name("Flipkart_Sentiment_Analysis")

<Experiment: artifact_location='file:c:/Users/Riya/Downloads/FlipkartML/NoteBook/mlruns/1', creation_time=1770722122071, experiment_id='1', last_update_time=1770722122071, lifecycle_stage='active', name='Flipkart_Sentiment_Analysis', tags={}>

In [35]:
import mlflow # type: ignore

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Flipkart_Sentiment_Analysis")

with mlflow.start_run(run_name="Initial_Test_Run"):
    mlflow.log_param("model", "LogisticRegression")
    mlflow.log_param("vectorizer", "TF-IDF")
    mlflow.log_metric("accuracy", 0.91)
    mlflow.log_metric("f1_score", 0.89)