In [23]:
# Standard imports
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [24]:
#from google.colab import drive
#drive.mount('/content/drive')

In [25]:
df=pd.read_csv('/content/drive/MyDrive/Dataset/product_reviews.csv')
df.head(2)

Unnamed: 0,reviewer_name,reviewer_rating,review_title,review_text,place_of_review,date_of_review,up_votes,down_votes,product_name,sentiment
0,Kamal Suresh,4.0,Nice product,"Nice product, good quality, but price is now r...","Certified Buyer, Chirakkal",Feb 2021,889.0,64.0,badminton,1
1,Flipkart Customer,1.0,Don't waste your money,They didn't supplied Yonex Mavis 350. Outside ...,"Certified Buyer, Hyderabad",Feb 2021,109.0,6.0,badminton,0


In [26]:
from sklearn.model_selection import train_test_split

# Split Data
X = df['review_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=0)

In [27]:
print(X_train.shape, X_test.shape)

(8604,) (2151,)


In [28]:
!pip install mlflow

!pip install pyngrok



In [29]:
import os
import mlflow
from pyngrok import ngrok

# Create the directory if it doesn't exist
os.makedirs("/content/mlruns", exist_ok=True)

# Set up ngrok authtoken
ngrok.set_auth_token("2fbObODvpVzGoEqtfHZFQUk4cvh_7fyrjhJU9gnywDH7Xd3vp")

# Set the MLflow tracking URI to store runs locally
mlflow.set_tracking_uri("/content/mlruns")

# Start the MLflow UI on a different port
get_ipython().system_raw("mlflow ui --port 5050 &")

# Tunnel to access the UI
ngrok_url = ngrok.connect(addr="5050", proto="http")
print("MLflow Tracking UI:", ngrok_url)


MLflow Tracking UI: NgrokTunnel: "https://4898-34-23-203-167.ngrok-free.app" -> "http://localhost:5050"


In [30]:
mlflow.set_experiment("sentiment_prediction")

<Experiment: artifact_location='/content/mlruns/862844624206381041', creation_time=1714165770212, experiment_id='862844624206381041', last_update_time=1714165770212, lifecycle_stage='active', name='sentiment_prediction', tags={}>

In [31]:
#ngrok.kill()

In [32]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
# initialise the Stemmer
stemmer = PorterStemmer()

In [34]:
# initialise Lemmatizer
lemmatizer = WordNetLemmatizer()

## Data Preprocessing on train data

In [35]:
def preprocess(raw_text):

    # Removing special characters and digits

    sentence = re.sub("[^a-zA-Z]|READ MORE", " ", raw_text)

    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = nltk.word_tokenize(sentence)

    # Lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [word for word in lemmatized_tokens if word.lower() not in stop_words]

    # Join and return
    return " ".join(cleaned_tokens)

In [36]:
# sample of processed word
preprocess(df['review_text'][0])

'nice product good quality price rising bad sign wa affordable price especially play everyday kindly help u term price thank'

## Converting Text to Numerical vectors - BOW Representation

In [37]:
# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer

# instantiate a vectorizer
vect = CountVectorizer(preprocessor=preprocess)
%time
X_train_bow = vect.fit_transform(X_train)
print(X_train_bow.shape)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.58 µs
(8604, 2554)


In [38]:
# transform testing data (using training data's features)
%time
X_test_bow = vect.transform(X_test)
print(X_test_bow.shape)


CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 18.4 µs
(2151, 2554)


## **Auto Logging Naive Bayes Demo Experiment Run using MLFlow**


In [39]:
# import classifier from sklearn
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model

mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time
    nb.fit(X_train_bow, y_train) # train the model(timing it with an IPython "magic command")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.06 µs




## **Create an optimal workflow**

In [40]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [41]:
# Define the pipeline with caching
pipe = Pipeline(
    [
        ('vectorization', CountVectorizer()),
        ('nb', MultinomialNB())
    ]
)

MAX_FEATURES = [1000, 1500, 2000]
ALPHA = [1, 10]

# Observe the Key Value Pair format
parameter_grid = [
    {
        'vectorization__preprocessor': [preprocess],
        'vectorization__max_features': MAX_FEATURES,
        'nb__alpha': ALPHA
    }
]

clf = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=1
)

# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time
    clf.fit(X_train, y_train)



CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs
Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [42]:
# Improving the efficiency by applying cleaning the text data before hand

%time
X_train_clean = X_train.apply(lambda raw_txt: preprocess(raw_txt))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [43]:
%time
X_test_clean = X_test.apply(lambda raw_txt: preprocess(raw_txt))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


## **Auto Logging All Experiment Runs using MLFlow**

In [44]:
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ]),
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ])
}

# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__alpha' : [1, 10]
        }
    ],
    'logistic_regression': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['elasticnet'],
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced'],
        }
    ],
    'random_forest': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10],
            'classifier__n_estimators': [10,20,25],
            'classifier__min_samples_leaf': [2],
            'classifier__bootstrap': [True,False],
            'classifier__class_weight': ['balanced']
        }
    ]
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algorithm in pipelines.keys():
    print("*"*10, algorithm, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algorithm],
                               param_grid=param_grids[algorithm],
                               cv=5,
                               scoring='accuracy',
                               return_train_score=True,
                               verbose=1
                              )
    mlflow.sklearn.autolog(max_tuning_runs=None)

    with mlflow.start_run() as run:
        %time
        grid_search.fit(X_train_clean, y_train)

    best_models[algorithm] = grid_search.best_estimator_

    print('Train Score: ', grid_search.best_score_)
    print('Score on Test Data: ', grid_search.score(X_test_clean, y_test))

********** naive_bayes **********




CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs
Fitting 5 folds for each of 8 candidates, totalling 40 fits




Train Score:  0.9341006445684634
Score on Test Data:  0.9265457926545793
********** logistic_regression **********
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 5.72 µs
Fitting 5 folds for each of 72 candidates, totalling 360 fits




Train Score:  0.9121340351066849
Score on Test Data:  0.905625290562529
********** random_forest **********
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Train Score:  0.9073692958393579
Score on Test Data:  0.9079497907949791


In [45]:
# Stop the auto logger

mlflow.sklearn.autolog(disable=True)

## **Custom Experiment Tracking and Database Integration with MLFlow**

In [46]:
import time
import joblib
import os

In [47]:
# Set the MLflow tracking URI to store runs locally
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Set the MLflow experiment
mlflow.set_experiment("Sentiment Prediction")

# Set up ngrok authtoken
ngrok.set_auth_token("2fbObODvpVzGoEqtfHZFQUk4cvh_7fyrjhJU9gnywDH7Xd3vp")

# Start the MLflow UI on a different port
get_ipython().system_raw("mlflow ui --port 5050 --backend-store-uri sqlite:///mlflow.db &")

# Tunnel to access the UI
ngrok_url = ngrok.connect(addr="5050", proto="http")
print("MLflow Tracking UI:", ngrok_url)

2024/04/26 21:53:58 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/04/26 21:53:58 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

MLflow Tracking UI: NgrokTunnel: "https://712f-34-23-203-167.ngrok-free.app" -> "http://localhost:5050"


In [48]:
os.makedirs('best_models')
dev = "Aminat Owodunni"
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo],
                               param_grid=param_grids[algo],
                               cv=5,
                               scoring='accuracy',
                               return_train_score=True,
                               verbose=1
                              )

    # Fit
    start_fit_time = time.time()
    grid_search.fit(X_train_clean, y_train)
    end_fit_time = time.time()

    # Predict
    start_predict_time = time.time()
    y_pred = grid_search.predict(X_test_clean)
    end_predict_time = time.time()

    # Saving the best model
    joblib.dump(grid_search.best_estimator_, f'best_models/{algo}.pkl')
    model_size = os.path.getsize(f'best_models/{algo}.pkl')

    # Pring Log
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test_clean, y_test))
    print("Fit Time: ", end_fit_time - start_fit_time)
    print("Predict Time: ", end_predict_time - start_predict_time)
    print("Model Size: ", model_size)

    print()

    # Start the experiment run
    with mlflow.start_run() as run:
        # Log tags with mlflow.set_tag()
        mlflow.set_tag("developer", dev)

        # Log Parameters with mlflow.log_param()
        mlflow.log_param("algorithm", algo)
        mlflow.log_param("hyperparameter_grid", param_grids[algo])
        mlflow.log_param("best_hyperparameter", grid_search.best_params_)

        # Log Metrics with mlflow.log_metric()
        mlflow.log_metric("train_score", grid_search.best_score_)
        mlflow.log_metric("test_score", grid_search.score(X_test_clean, y_test))
        mlflow.log_metric("fit_time", end_fit_time - start_fit_time)
        mlflow.log_metric("predict_time", end_predict_time - start_predict_time)
        mlflow.log_metric("model_size", model_size)

        # Log Model using mlflow.sklearn.log_model()
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{algo}_model")

********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train Score:  0.9341006445684634
Test Score:  0.9265457926545793
Fit Time:  6.591938018798828
Predict Time:  0.020749568939208984
Model Size:  82458

********** logistic_regression **********
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train Score:  0.9120177560369175
Test Score:  0.905625290562529
Fit Time:  911.0573816299438
Predict Time:  0.018605470657348633
Model Size:  94525

********** random_forest **********
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Train Score:  0.9079504209288812
Test Score:  0.902835890283589
Fit Time:  166.20816087722778
Predict Time:  0.037503957748413086
Model Size:  711666

