### import libraries

In [7]:
import os 
# changing directory path 
os.chdir('F:\\New Start\\My projects\\Ecommerce Sentiment Analysis')
import numpy  as np
import pandas  as pd
import matplotlib.pyplot as plt 
from scripts.data_preprocessing import CleanReview
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

### Loading Dataset

In [21]:
data=pd.read_csv('F:\\New Start\\My projects\\Ecommerce Sentiment Analysis\\data\\cleaned\\cleaned_data.csv',encoding='utf-8')

In [22]:
# checking null values 
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [23]:
data.shape

(95272, 2)

In [24]:
### Imbalanced dataset
data['Sentiment'].value_counts()

Sentiment
positive    70510
negative    18907
neutral      5855
Name: count, dtype: int64

In [30]:
positive_reviews=data[data['Sentiment']=='positive'].head(40000)


In [31]:
positive_reviews

Unnamed: 0,Review,Sentiment
0,great cooler excellent air flow and for this p...,positive
1,best budget 2 fit cooler nice cooling,positive
2,the quality is good but the power of air is de...,positive
5,the cooler is really fantastic and provides go...,positive
6,very good product,positive
...,...,...
53851,nice television and its too slim its really an...,positive
53852,facing trouble 1 screen mirroring not working ...,positive
53853,this is one of the best tv i got it in rs 1930...,positive
53857,while playing high definition video there is e...,positive


In [34]:
other_sentiment=data[data['Sentiment']!='positive']
other_sentiment

Unnamed: 0,Review,Sentiment
3,very bad product its a only a fan,negative
4,ok ok product,neutral
8,very bad cooler,negative
28,bad quality,negative
43,small wire and moter capacity is very low fan ...,negative
...,...,...
95195,Getting 2 Blurred patches on the right hand si...,negative
95198,HDMI CABLE VERY POOR QUALITY,negative
95253,Pros - Slim border line style. Lightweight & c...,neutral
95262,I am little bit angry. This is LCD Monitor not...,neutral


### Add some more data from negative and neutral reviews

In [12]:
balancer=pd.read_csv('data\\cleaned\\generated_reviews.csv')

In [13]:
balancer['sentiment'].value_counts()

sentiment
negative    30000
neutral     30000
Name: count, dtype: int64

In [36]:
balancer.rename(columns={'review': 'Review', 'sentiment': 'Sentiment'}, inplace=True)

In [38]:
Balanced_data = pd.concat([positive_reviews, other_sentiment, balancer], axis=0, ignore_index=True)


In [39]:
Balanced_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124762 entries, 0 to 124761
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Review     124762 non-null  object
 1   Sentiment  124762 non-null  object
dtypes: object(2)
memory usage: 1.9+ MB


In [40]:
Balanced_data['Sentiment'].value_counts()

Sentiment
negative    48907
positive    40000
neutral     35855
Name: count, dtype: int64

In [41]:
Balanced_data.to_csv("data\\balanced\\balanced_data.csv", index=False)

In [None]:
## convert output column into numbers
def convert(text):
    if text=='positive':
       return 1
    elif text=='negative':
        return -1
    else:
        return 0
    
Balanced_data['Sentiment']=Balanced_data['Sentiment'].apply(convert)

### Data Preprocessing on Reviews

In [44]:
cr=CleanReview()

In [45]:
print("Processing has started")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.clean_html)
print("clean html has completed")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.convert_lower)
print("text has been converted into lower case ")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.remove_special)
print("Special charactors has been removed")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.remove_stopwords)
print("Stopwords has been removed")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.stem_words)
print("Stemmetization has been done.")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.join_back)
print("Text has been joined back")
Balanced_data['Review']=Balanced_data['Review'].apply(cr.remove_emojis)
print("Emojis has been removed.")
print("Preprocessing has been ended.")


Processing has started
clean html has completed
text has been converted into lower case 
Special charactors has been removed
Stopwords has been removed
Stemmetization has been done.
Text has been joined back
Emojis has been removed.
Preprocessing has been ended.


### Train-Test-Split

In [51]:
X=Balanced_data['Review']
y=Balanced_data['Sentiment']

In [52]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=51)

### Model + Embedding Pipeline 

- count vectorizer

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

vec=CountVectorizer(max_features=10000)

- Naive Bayes Classifier

In [54]:
nbc=MultinomialNB()

In [55]:
from sklearn.pipeline import Pipeline

In [56]:
model=Pipeline(
    [
        ('Count vectorizer',vec),
        ('Naive Bayes Classifier',nbc)
    ]
)

In [57]:
model.fit(X_train,y_train)

0,1,2
,steps,"[('Count vectorizer', ...), ('Naive Bayes Classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [58]:
y_pred=model.predict(X_test)

- evaluation

In [59]:
print("accurracy of Naive Bayes Classifier :", accuracy_score(y_test,y_pred))
print("Precision Score of Naive Bayes Classifier :",precision_score(y_test,y_pred,average='macro'))
print("Recall Score of Naive Bayes Classifier :",recall_score(y_test,y_pred,average='macro'))
print("F1 Score of Naive bayes Classifier :",f1_score(y_test,y_pred,average='macro'))

accurracy of Naive Bayes Classifier : 0.8690053167330145
Precision Score of Naive Bayes Classifier : 0.8864179936240686
Recall Score of Naive Bayes Classifier : 0.8735956071239744
F1 Score of Naive bayes Classifier : 0.8718312118058473


- Apply mlflow for monitoring

In [60]:
import mlflow 
import warnings
warnings.filterwarnings('ignore')
from mlflow.models import infer_signature
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
signature=infer_signature(X_train,y_train)
# start the Mlflow experiments
with mlflow.start_run():
# log metrics 
    mlflow.log_metric("Accuracy ",accuracy_score(y_test,y_pred))
    mlflow.log_metric("Precision ",precision_score(y_test,y_pred,average='macro'))
    mlflow.log_metric("Recall",recall_score(y_test,y_pred,average='macro'))
    mlflow.log_metric("F1 Score",f1_score(y_test,y_pred,average='macro'))

    mlflow.sklearn.log_model(
    sk_model=model,
    artifact_path="Adding more data",
    registered_model_name="Naive bayes Classifier-Best model",
   )

Successfully registered model 'Naive bayes Classifier-Best model'.
2025/09/14 01:33:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Naive bayes Classifier-Best model, version 1
Created version '1' of model 'Naive bayes Classifier-Best model'.


🏃 View run redolent-eel-55 at: http://127.0.0.1:5000/#/experiments/0/runs/c816e2d795524b229680393e1ff59c98
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0
