In [3]:
# ML Flow for Flipkart Sentiment Analysis

In [1]:
# Warnings

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the Data

In [6]:
Badminton = pd.read_csv("data.csv")
Badminton

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [7]:
# Dropping Null Values

In [8]:
Badminton.dropna(inplace = True)

In [9]:
# Duplicates

In [10]:
Badminton.duplicated().sum()

0

In [11]:
# Reset Index

In [12]:
Badminton.reset_index(drop=True, inplace=True)

In [13]:
# Function to classify reviews as positive or negative based on ratings

In [14]:
def classify_review(rating):
    if rating >= 3.0:
        return 'Positive'
    else:
        return 'Negative'

In [15]:
Badminton['Sentiment'] = Badminton['Ratings'].apply(classify_review)

In [16]:
# Splitting the data

In [17]:
x = Badminton['Review text']

y = Badminton['Sentiment']

In [18]:
# Machine Learning Libraries

In [19]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

In [20]:
# Train Test Split

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 1)

print(x_train.shape, x_test.shape)

(6410,) (1603,)


In [22]:
# Pre processing on Train and Test Data

In [23]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [24]:
# Declaring 'Stemming' and 'Lemmatization' variables

In [25]:
Stemming = PorterStemmer()
Lemmatization = WordNetLemmatizer()

In [26]:
# # Steps involved in data preprocesing :

# 1. Removing special characters,unwanted numericals 
# 2. Normalize the case(lower)
# 3. Word Tokenization
# 4. Removing stop words
# 5. Stemming or Lemmatization

In [27]:
# Download NLTK stopwords
nltk.download('stopwords')

# Remove specific words from the NLTK stopwords list
stop_words = set(stopwords.words('english'))
stop_words.update({'Hii', 'it', 'Product', 'Shuttle', 'hii', 'flipkart', 'flipkartread', 'product', 'productread', 'read', 'goodread','shuttle', 'Readmore'})


def preprocess(data):
    # Removes special characters
    sentence = re.sub("[^a-zA-Z]", " ", data)
    
    # Converts words to lowercase
    sentence = sentence.lower()
    
    # Tokenization
    sentence = sentence.split()
    
    # Removes the stop words
    sentence = [word for word in sentence if word not in stop_words]
    
    # Applying lemmatization
    sentence = [Lemmatization.lemmatize(word) for word in sentence]
    
    # Join the tokens back into a string
    sentence = " ".join(sentence)
    
    return sentence

# Apply preprocess function to the 'Review text' column in the Badminton dataset
Badminton['Cleaned Review Text'] = Badminton['Review text'].apply(preprocess)

# Display the modified dataset
# print(Badminton)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rjsek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
#applying preprocesing on train_data

x_train= x_train.apply(preprocess)

In [29]:
#applying preprocessing on test_data

x_test = x_test.apply(preprocess)

In [30]:
# Bag of Words / Count Vectorizer

In [31]:
# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer

# instantiate a vectorizer
vect = CountVectorizer(preprocessor=preprocess)

# use it to extract features from training data
%time x_train_dtm = vect.fit_transform(x_train)

print(x_train_dtm.shape)

CPU times: total: 93.8 ms
Wall time: 315 ms
(6410, 2602)


In [32]:
# TF - IDF

In [33]:
# Import TF-IDF vectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate a TF-IDF vectorizer
tfidf_vect = TfidfVectorizer(preprocessor=preprocess)

# Use it to extract features from training data
%time x_train_tfidf = tfidf_vect.fit_transform(x_train)

print(x_train_tfidf.shape)

CPU times: total: 15.6 ms
Wall time: 244 ms
(6410, 2602)


In [None]:
# Saving the final Data File

In [34]:
import pandas as pd

# Assuming you already have the data in a DataFrame named "Badminton"

# Define the file path
file_path = 'cleaned_data.csv'

# Export the DataFrame to a CSV file
Badminton.to_csv(file_path, index=False)

print("Data has been successfully saved to", file_path)

Data has been successfully saved to cleaned_data.csv


In [None]:
# Pipeline Code

In [50]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
import joblib
from joblib import Memory

# Define a memory object to cache intermediate results
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

# Define pipelines for selected models with both CountVectorizer and TfidfVectorizer
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', MultinomialNB())
    ], memory=memory),
    'decision_tree': Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', DecisionTreeClassifier())
    ], memory=memory),
    'logistic_regression': Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'svm': Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', SVC())
    ], memory=memory),
    'random_forest': Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', RandomForestClassifier())
    ], memory=memory),
    'knn': Pipeline([
        ('vectorization', FeatureUnion([
            ('count_vectorizer', CountVectorizer()),
            ('tfidf_vectorizer', TfidfVectorizer())
        ])),
        ('classifier', KNeighborsClassifier())
    ], memory=memory)
}

# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': {
        'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
        'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
        'classifier__alpha': [1, 10]
    },
    'decision_tree': {
        'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
        'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
        'classifier__max_depth': [None, 5, 10]
    },
    'logistic_regression': {
        'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
        'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l2']
    },
    'svm': {
        'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
        'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    'random_forest': {
        'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
        'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10]
    },
    'knn': {
        'vectorization__count_vectorizer__max_features': [1000, 2000, 5000],
        'vectorization__tfidf_vectorizer__max_features': [1000, 2000, 5000],
        'classifier__n_neighbors': [i for i in range(3, 21, 2)],  # Experiment with different values of n_neighbors
        'classifier__p': [1, 2, 3]
    }
}

# # Perform GridSearchCV for each algorithm
# best_models = {}

# for algo in pipelines.keys():
#     print("*" * 10, algo, "*" * 10)
#     grid_search = GridSearchCV(estimator=pipelines[algo],
#                                param_grid=param_grids[algo],
#                                cv=5,
#                                scoring='f1',
#                                return_train_score=True,
#                                verbose=1)
#     grid_search.fit(x_train, y_train)
#     best_models[algo] = grid_search.best_estimator_
#     y_pred = grid_search.best_estimator_.predict(x_test)
#     f1 = f1_score(y_test, y_pred, pos_label='Positive', average='weighted')
#     print('F1 Score on Test Data:', f1)

In [None]:
# 

In [37]:
import sys
sys.executable

'C:\\Users\\rjsek\\anaconda3\\python.exe'

In [None]:
# 

In [39]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.11.3-py3-none-any.whl (19.7 MB)
     --------------------------------------- 19.7/19.7 MB 11.3 MB/s eta 0:00:00
Collecting waitress<4
  Downloading waitress-3.0.0-py3-none-any.whl (56 kB)
     ---------------------------------------- 56.7/56.7 kB ? eta 0:00:00
Collecting querystring-parser<2
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gitpython<4,>=3.1.9
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
     -------------------------------------- 195.4/195.4 kB 2.4 MB/s eta 0:00:00
Collecting alembic!=1.10.0,<2
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
     -------------------------------------- 233.4/233.4 kB 2.4 MB/s eta 0:00:00
Collecting docker<8,>=4.0.0
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
     -------------------------------------- 147.6/147.6 kB 9.2 MB/s eta 0:00:00
Collecting pyarrow<16,>=4.0.0
  Downloading pyarrow-15.0.2-cp310-cp310-win_amd64.whl (24.8 MB)
     ---

In [34]:
# Auto Logging Experiment Run using MLFlow

In [None]:
# Step 1 - Import MLFlow and set the experiment name

In [55]:
import mlflow

mlflow.set_experiment("Flipkart_Sentiment_Prediction")

2024/03/24 20:21:40 INFO mlflow.tracking.fluent: Experiment with name 'Flipkart_Sentiment_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/rjsek/OneDrive/Desktop/ML%20Flow/mlruns/818060538876186520', creation_time=1711291900972, experiment_id='818060538876186520', last_update_time=1711291900972, lifecycle_stage='active', name='Flipkart_Sentiment_Prediction', tags={}>

In [None]:
# Step 2 - Start the auto logger

In [56]:
mlflow.sklearn.autolog()

# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment

In [None]:
# Step 3 - Start the experiment run

In [48]:
with mlflow.start_run() as run:
    grid_search.fit(x_train, y_train)



Fitting 5 folds for each of 243 candidates, totalling 1215 fits


2024/03/24 20:05:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, 238 runs will be omitted.


In [57]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(x_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(x_test, y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()



********** naive_bayes **********
Fitting 5 folds for each of 18 candidates, totalling 90 fits




CPU times: total: 12 s
Wall time: 19.7 s
Train Score:  0.9232449297971919
Test Score:  0.9238927011852776

********** decision_tree **********
Fitting 5 folds for each of 27 candidates, totalling 135 fits




CPU times: total: 34.9 s
Wall time: 42.8 s
Train Score:  0.9141965678627146
Test Score:  0.9114160948222083

********** logistic_regression **********
Fitting 5 folds for each of 27 candidates, totalling 135 fits




CPU times: total: 35.7 s
Wall time: 29.4 s
Train Score:  0.9205928237129484
Test Score:  0.916406737367436

********** svm **********
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 4min 3s
Wall time: 4min 17s
Train Score:  0.9177847113884555
Test Score:  0.9139114160948222

********** random_forest **********




Fitting 5 folds for each of 81 candidates, totalling 405 fits
CPU times: total: 10min 42s
Wall time: 11min 11s
Train Score:  0.9205928237129484




Test Score:  0.9151590767311292

********** knn **********
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
CPU times: total: 1h 29min 30s
Wall time: 9min 21s
Train Score:  0.909984399375975
Test Score:  0.9107922645040549

