In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [3]:
review1 = pd.read_csv("data.csv")
review1.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
review1.shape

(8518, 8)

In [5]:
review1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [6]:
review1.duplicated().sum()

4

In [7]:
review1.drop_duplicates(inplace=True)

In [8]:
review1.duplicated().sum()

0

In [9]:
review1.isna().sum()

Reviewer Name        6
Review Title         6
Place of Review     46
Up Votes             6
Down Votes           6
Month              461
Review text          4
Ratings              0
dtype: int64

In [10]:
review1.dropna(inplace=True)

In [11]:
review1.isna().sum()

Reviewer Name      0
Review Title       0
Place of Review    0
Up Votes           0
Down Votes         0
Month              0
Review text        0
Ratings            0
dtype: int64

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+','',text)
    text = re.sub(r'[^\w\s]','',text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words).strip()
    return text

In [13]:
review1['cleaned_text']=review1['Review text'].apply(clean_text)

In [14]:
cleaned_df = review1

In [15]:
def assign_ratings(ratings):
    if ratings == 5 or ratings == 4:
        return "Positive"
    else:
        return "Negative"


cleaned_df["Target"]=cleaned_df["Ratings"].apply(assign_ratings)

In [16]:
cleaned_df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,cleaned_text,Target
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4,nice product good quality price rising bad sig...,Positive
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1,didnt supplied yonex mavis outside cover yonex...,Negative
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1,worst product damaged shuttlecock packed new b...,Negative
5,Baji Sankar,Mind-blowing purchase,"Certified Buyer, Hyderabad",173.0,45.0,Oct 2018,Good quality product. Delivered on time.READ MORE,5,good quality product delivered timeread,Positive
6,Flipkart Customer,Must buy!,"Certified Buyer, Doom Dooma",403.0,121.0,Jan 2020,BEST PURCHASE It is a good quality and is more...,5,best purchase good quality durable average shu...,Positive


In [17]:
cleaned_df.Target.value_counts()

Target
Positive    6462
Negative    1551
Name: count, dtype: int64

In [18]:
cleaned_df['Target'] = cleaned_df['Target'].replace({'Positive':1,'Negative':0}, regex=True)

  cleaned_df['Target'] = cleaned_df['Target'].replace({'Positive':1,'Negative':0}, regex=True)


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [20]:
x=tfidf.fit_transform(cleaned_df['cleaned_text']).toarray()
y=cleaned_df['Target']

In [21]:
y.value_counts()

Target
1    6462
0    1551
Name: count, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [24]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(6410, 3461)
(1603, 3461)
(6410,)
(1603,)


In [25]:
import mlflow

In [26]:
mlflow.set_experiment("sentiment_classification")

<Experiment: artifact_location='file:///C:/Users/Manisha/machine%20learning/mlops_pipelines/mlruns/791971615677141057', creation_time=1756704313296, experiment_id='791971615677141057', last_update_time=1756704313296, lifecycle_stage='active', name='sentiment_classification', tags={}>

In [27]:
mlflow.sklearn.autolog

<function mlflow.sklearn.autolog(log_input_examples=False, log_model_signatures=True, log_models=True, log_datasets=True, disable=False, exclusive=False, disable_for_unsupported_versions=False, silent=False, max_tuning_runs=5, log_post_training_metrics=True, serialization_format='cloudpickle', registered_model_name=None, pos_label=None, extra_tags=None)>

### Define pipeline

In [28]:
from sklearn.pipeline import Pipeline

In [29]:
pipe = Pipeline(
    [
        ('classifier', RandomForestClassifier())
    ]
)

In [30]:
# parameter_grid = [
#     {
#         'classifier__n_estimators':[50,100,200]
#     }
# ]

In [31]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [32]:
param_dist = {
    'classifier__n_estimators': [50, 100, 200, 300],
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2']
}

In [37]:
# try only 20 random combinations
clf = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,             
    cv=5,
    verbose=2,
    n_jobs=-1
)

In [38]:
# clf = GridSearchCV(
#     estimator=pipe,
#     param_grid=parameter_grid,
#     scoring='accuracy',
#     verbose=1,
#     return_train_score=True
# )

In [39]:
mlflow.sklearn.autolog(max_tuning_runs=True)

In [40]:
with mlflow.start_run() as run:
    %time clf.fit(x_train, y_train)



Fitting 5 folds for each of 20 candidates, totalling 100 fits




CPU times: total: 54.8 s
Wall time: 10min 15s
