In [14]:
import numpy as np
import pandas as pd
import os
import regex as re


In [89]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
df = pd.read_csv(r"C:\Users\Suswarah\Downloads\MLOps\MLFlow\badminton_review_data.csv")


In [53]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [54]:
df.columns

Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings'],
      dtype='object')

In [55]:
df.replace("", pd.NA, inplace = True)
df.replace(" ", pd.NA, inplace = True)
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [56]:
df.drop(['Reviewer Name', 'Place of Review', 'Up Votes', 'Down Votes', 'Month'], axis=1, inplace = True)

In [57]:
df.shape

(8518, 3)

In [58]:
df.replace("", pd.NA, inplace = True)
df.replace(" ", pd.NA, inplace = True)
df.isnull().sum()

Review Title    10
Review text      8
Ratings          0
dtype: int64

In [59]:
# Drop rows where both 'Review text' and 'Review Title' are null
df.dropna(subset=['Review text', 'Review Title'], how='all', inplace=True)
df.isnull().sum()

Review Title    2
Review text     0
Ratings         0
dtype: int64

In [60]:
df['Review Title'].replace(pd.NA, "None", inplace = True)
df['Review text'] = df['Review text'].str.replace(r'READ MORE', '', regex=True)
# Use replace() with if condition to create the target variable 'Sentiment'
df['Sentiment'] = df['Ratings'].replace({rating: 1 if rating >= 3 else 0 for rating in df['Ratings']})
df.head()
df.isnull().sum()

Review Title    0
Review text     0
Ratings         0
Sentiment       0
dtype: int64

In [61]:
df.shape

(8510, 4)

In [62]:
#Classification
df["Review"] = df['Review Title'] + " " + df['Review text']
df.drop(['Review Title', 'Review text', 'Ratings'], axis = 1, inplace = True)
df.head()

Unnamed: 0,Sentiment,Review
0,1,"Nice product Nice product, good quality, but p..."
1,0,Don't waste your money They didn't supplied Yo...
2,0,Did not meet expectations Worst product. Damag...
3,1,"Fair Quite O. K. , but nowadays the quality o..."
4,0,Over priced Over pricedJust â?¹620 ..from reta...


In [63]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm, tqdm_notebook

In [64]:
nltk.download('stopwords')
# Downloading wordnet before applying Lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Suswarah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Suswarah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Suswarah\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [85]:
y = df['Sentiment']
x = df['Review']

In [86]:
# Splitting into train and test

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [87]:
x_train.shape, x_test.shape, df.shape, y_train.shape, y_test.shape

((6808,), (1702,), (8510, 2), (6808,), (1702,))

In [88]:
x_train

1372                                 Brilliant Very good.
3846    Must buy! Have been purchasing the shuttles fr...
333                    Highly recommended Nice product. .
1259    Terrible product Worst on recent times.We play...
3532                         Mind-blowing purchase Better
                              ...                        
5734                Pretty good genuine & orginal product
5191             Delightful Badminton shuttle is too good
5390                     Wonderful best quality shuttle 🖤
860                         Highly recommended Nice happy
7270                          Classy product Good product
Name: Review, Length: 6808, dtype: object

In [90]:
import time
import joblib
import os

In [91]:
import warnings

warnings.filterwarnings('ignore')

In [92]:
# !pip install mlflow

In [93]:
import mlflow

mlflow.set_experiment("FlipkartReview_prediction")

<Experiment: artifact_location='file:///C:/Users/Suswarah/Downloads/MLOps/MLFlow/mlruns/884060392742436811', creation_time=1711191325036, experiment_id='884060392742436811', last_update_time=1711191325036, lifecycle_stage='active', name='FlipkartReview_prediction', tags={}>

In [47]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((6808,), (6808,), (1702,), (1702,))

In [49]:
x_test.head()

4392                            awesome good
8424    simply awesome good original product
2138             classy product good quality
3538                            awesome good
2684                 could way better thanks
Name: clean_review_lemma, dtype: object

In [50]:
x_test.head()

4392                            awesome good
8424    simply awesome good original product
2138             classy product good quality
3538                            awesome good
2684                 could way better thanks
Name: clean_review_lemma, dtype: object

In [94]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define text preprocessing function
def preprocessor(text):
    # Removing special characters and digits
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    # change sentence to lower case
    letters_only = letters_only.lower()
    # tokenize into words
    words = letters_only.split()
    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

# Create pipelines for all models
pipelines = {
    'logistic_regression': Pipeline([
        ('vectorization', TfidfVectorizer(preprocessor=preprocessor)),
        ('classifier', LogisticRegression(max_iter=5000))
    ]),
    'SVC': Pipeline([
        ('vectorization', TfidfVectorizer(preprocessor=preprocessor)),
        ('classifier', SVC())
    ])
}

# Define parameter grids for all models
param_grids = {
    'logistic_regression': {
        'vectorization__max_features': [1000, 1500, 2000],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l2']
    },
    'SVC' : {
        'classifier__C': [0.01, 0.1, 1, 10],  # Regularization parameter
        'classifier__kernel': ['linear', 'rbf', 'sigmoid'],  # Kernel type
        'classifier__gamma': ['scale', 'auto']  # Kernel coefficient (for rbf, poly, sigmoid)
    }
}



In [84]:
best_models = {}

# Run the Pipeline
for model_name, pipeline in pipelines.items():
    print("-" * 10, model_name, "-" * 10)
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grids[model_name],
                               scoring='f1_weighted',
                               cv=5,
                               return_train_score=True,
                               verbose=1
                               )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(x_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(x_test, y_test))
    
    best_models[model_name] = grid_search.best_estimator_
    print()

---------- logistic_regression ----------




Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: total: 12.6 s
Wall time: 31.1 s
Train Score:  0.7333333333333333
Test Score:  0.3333333333333333

---------- SVC ----------




Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: total: 14.5 s
Wall time: 40.7 s
Train Score:  0.7333333333333333
Test Score:  0.3333333333333333



In [97]:
# Perform GridSearchCV for all models
dev = 'Suswarah'
best_models = {}

for model_name, pipeline in pipelines.items():
    print("-" * 10, model_name, "-" * 10)
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grids[model_name],
                               scoring='f1_weighted',
                               cv=5,
                               return_train_score=True,
                               verbose=1
                               )
#     mlflow.sklearn.autolog(max_tuning_runs=None)
    
    # Fit
    start_fit_time = time.time()
    grid_search.fit(x_train, y_train)
    end_fit_time = time.time()

    # Predict
    start_predict_time = time.time()
    y_pred = grid_search.predict(x_test)
    end_predict_time = time.time()

    # Saving the best model
    joblib.dump(grid_search.best_estimator_, f'best_models/{model_name}.pkl')
    model_size = os.path.getsize(f'best_models/{model_name}.pkl')

    # Pring Log
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(x_test, y_test))
    print("Fit Time: ", end_fit_time - start_fit_time)
    print("Predict Time: ", end_predict_time - start_predict_time)
    print("Model Size: ", model_size)
    
    # Start the experiment run
    with mlflow.start_run() as run:
        # Log tags with mlflow.set_tag()
        mlflow.set_tag("developer", dev)

        # Log Parameters with mlflow.log_param()
        mlflow.log_param("algorithm", model_name)
        mlflow.log_param("hyperparameter_grid", param_grids[model_name])
        mlflow.log_param("best_hyperparameter", grid_search.best_params_)

        # Log Metrics with mlflow.log_metric()
        mlflow.log_metric("train_score", grid_search.best_score_)
        mlflow.log_metric("test_score", grid_search.score(x_test, y_test))
        mlflow.log_metric("fit_time", end_fit_time - start_fit_time)
        mlflow.log_metric("predict_time", end_predict_time - start_predict_time)
        mlflow.log_metric("model_size", model_size)

        # Log Model using mlflow.sklearn.log_model()
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{model_name}_model")


2024/03/24 13:36:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7ad0d59092ae463da9f726d46ef710b4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


---------- logistic_regression ----------
Fitting 5 folds for each of 18 candidates, totalling 90 fits




Train Score:  0.9084859713112505
Test Score:  0.9125609189924552
Fit Time:  3189.569856405258
Predict Time:  6.048084259033203
Model Size:  106561
---------- SVC ----------


2024/03/24 14:30:27 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e32dd9ecc4484301818ce20ce8bf0006', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Train Score:  0.9056792395725901
Test Score:  0.9102225686431268
Fit Time:  3369.839942216873
Predict Time:  5.240501403808594
Model Size:  261261


**Best Model**: Logistic Regression

**Parameters**:
        
        'vectorization__max_features': 2000,
        'classifier__C': 10,
        'classifier__penalty': l2
        
**Accuracy**: 92%

**F1 - score Positive Review Prediction** : 96%

**F1 - score Negative Review Prediction** : 62%