## Data Formation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load data for badminton
data_badminton = pd.read_csv("reviews_data_dump/reviews_badminton/data.csv")

# Load data for tawa
data_tawa = pd.read_csv("reviews_data_dump/reviews_tawa/data.csv")

# Load data for tea
data_tea = pd.read_csv("reviews_data_dump/reviews_tea/data.csv")

In [24]:
# EDA for data_badminton
print("EDA for data_badminton:")
print(data_badminton.info())
print(data_badminton.describe())

# EDA for data_tawa
print("\nEDA for data_tawa:")
print(data_tawa.info())
print(data_tawa.describe())

# EDA for data_tea
print("\nEDA for data_tea:")
print(data_tea.info())
print(data_tea.describe())

EDA for data_badminton:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB
None
          Up Votes   Down Votes      Ratings
count  8508.000000  8508.000000  8518.000000
mean      0.391396     0.121768     4.181028
std      11.613909     3.248022     1.262200
min       0.000000     0.000000     1.000000
25%       0.000000     0.000000     4.000000
50%       0.000000     0.000000     5.000000
75%       0.000000     0.0

In [25]:
# Rename columns to have consistent names
data_badminton.rename(columns={'Reviewer Name': 'reviewer_name',
                                'Review Title': 'review_title',
                                'Place of Review': 'place_of_review',
                                'Up Votes': 'up_votes',
                                'Down Votes': 'down_votes',
                                'Month': 'date_of_review',
                                'Review text': 'review_text',
                                'Ratings': 'reviewer_rating'}, inplace=True)

data_tawa.rename(columns={'Reviewer_Name': 'reviewer_name',
                            'Reviewer_Rating': 'reviewer_rating',
                            'Review_Title': 'review_title',
                            'Review_Text': 'review_text',
                            'Place_of_Review': 'place_of_review',
                            'Date_of_Review': 'date_of_review',
                            'Up_Votes': 'up_votes',
                            'Down_Votes': 'down_votes'}, inplace=True)

data_tea.rename(columns={'reviewer_rating': 'reviewer_rating',
                            'reviewer_name': 'reviewer_name',
                            'review_title': 'review_title',
                            'review_text': 'review_text',
                            'place_of_review': 'place_of_review',
                            'Date_of_review': 'date_of_review',
                            'up_votes': 'up_votes',
                            'Down_votes': 'down_votes'}, inplace=True)

In [26]:
# Add a 'Product' column to each dataset
data_badminton['Product'] = 'Badminton'
data_tawa['Product'] = 'Tawa'
data_tea['Product'] = 'Tea'

# Concatenate datasets for easier analysis
data_combined = pd.concat([data_badminton, data_tawa, data_tea], ignore_index=True)

In [27]:
data_combined.columns

Index(['reviewer_name', 'review_title', 'place_of_review', 'up_votes',
       'down_votes', 'date_of_review', 'review_text', 'reviewer_rating',
       'Product'],
      dtype='object')

In [28]:
data_combined.shape

(20219, 9)

In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    if isinstance(text, str):
        # Text Cleaning: Remove special characters, punctuation, and stopwords
        text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = text.lower()  # Convert text to lowercase
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in stop_words]

        # Text Normalization: Lemmatization
        lemmatizer = WordNetLemmatizer()
        normalized_text = [lemmatizer.lemmatize(word) for word in filtered_text]

        return normalized_text
    else:
        return ''

# Apply preprocessing function to the 'review_text' column
data_combined['cleaned_review_text'] = data_combined['review_text'].apply(preprocess_text)

# Preview the preprocessed text
print(data_combined[['review_text', 'cleaned_review_text']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swast\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swast\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swast\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                         review_text  \
0  Nice product, good quality, but price is now r...   
1  They didn't supplied Yonex Mavis 350. Outside ...   
2  Worst product. Damaged shuttlecocks packed in ...   
3  Quite O. K. , but nowadays  the quality of the...   
4  Over pricedJust â?¹620 ..from retailer.I didn'...   

                                 cleaned_review_text  
0  [nice, product, good, quality, price, rising, ...  
1  [didnt, supplied, yonex, mavis, outside, cover...  
2  [worst, product, damaged, shuttlecock, packed,...  
3  [quite, k, nowadays, quality, cork, like, year...  
4  [pricedjust, â¹, retaileri, didnt, understand,...  


In [30]:
data_combined.columns

Index(['reviewer_name', 'review_title', 'place_of_review', 'up_votes',
       'down_votes', 'date_of_review', 'review_text', 'reviewer_rating',
       'Product', 'cleaned_review_text'],
      dtype='object')

In [31]:
data_combined.head()

Unnamed: 0,reviewer_name,review_title,place_of_review,up_votes,down_votes,date_of_review,review_text,reviewer_rating,Product,cleaned_review_text
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4.0,Badminton,"[nice, product, good, quality, price, rising, ..."
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1.0,Badminton,"[didnt, supplied, yonex, mavis, outside, cover..."
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1.0,Badminton,"[worst, product, damaged, shuttlecock, packed,..."
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3.0,Badminton,"[quite, k, nowadays, quality, cork, like, year..."
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1.0,Badminton,"[pricedjust, â¹, retaileri, didnt, understand,..."


In [32]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Define a function to analyze the sentiment of each review text

def analyze_sentiment(word_list):
    # Convert the list of words into a single string
    text = ' '.join(word_list)
    
    compound_score = sid.polarity_scores(text)['compound']
    
    # Assign positive sentiment if compound score is greater than 0, else negative sentiment
    if compound_score > 0:
        return 1  # Positive sentiment
    else:
        return 0  # Negative sentiment

# Apply the function to the 'cleaned_review_text' column
data_combined['sentiments'] = data_combined['cleaned_review_text'].apply(analyze_sentiment)

# Display the DataFrame with the new 'sentiments' column
print(data_combined.head())


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\swast\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


            reviewer_name               review_title  \
0            Kamal Suresh               Nice product   
1       Flipkart Customer     Don't waste your money   
2  A. S. Raja Srinivasan   Did not meet expectations   
3     Suresh Narayanasamy                       Fair   
4               ASHIK P A                Over priced   

               place_of_review  up_votes  down_votes date_of_review  \
0   Certified Buyer, Chirakkal     889.0        64.0       Feb 2021   
1   Certified Buyer, Hyderabad     109.0         6.0       Feb 2021   
2  Certified Buyer, Dharmapuri      42.0         3.0       Apr 2021   
3     Certified Buyer, Chennai      25.0         1.0            NaN   
4                          NaN     147.0        24.0       Apr 2016   

                                         review_text  reviewer_rating  \
0  Nice product, good quality, but price is now r...              4.0   
1  They didn't supplied Yonex Mavis 350. Outside ...              1.0   
2  Worst product.

In [33]:
data_combined.head()

Unnamed: 0,reviewer_name,review_title,place_of_review,up_votes,down_votes,date_of_review,review_text,reviewer_rating,Product,cleaned_review_text,sentiments
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4.0,Badminton,"[nice, product, good, quality, price, rising, ...",1
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1.0,Badminton,"[didnt, supplied, yonex, mavis, outside, cover...",0
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1.0,Badminton,"[worst, product, damaged, shuttlecock, packed,...",0
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3.0,Badminton,"[quite, k, nowadays, quality, cork, like, year...",1
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1.0,Badminton,"[pricedjust, â¹, retaileri, didnt, understand,...",0


In [34]:
clean_data=data_combined[["review_text","sentiments"]]

In [35]:
clean_data.head()

Unnamed: 0,review_text,sentiments
0,"Nice product, good quality, but price is now r...",1
1,They didn't supplied Yonex Mavis 350. Outside ...,0
2,Worst product. Damaged shuttlecocks packed in ...,0
3,"Quite O. K. , but nowadays the quality of the...",1
4,Over pricedJust â?¹620 ..from retailer.I didn'...,0


In [36]:
clean_data.shape

(20219, 2)

In [21]:
clean_data.to_csv('clean_data.csv', index=False)

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
clean_data=pd.read_csv("clean_data.csv")

In [4]:
clean_data.columns

Index(['review_text', 'sentiments'], dtype='object')

In [5]:
clean_data['review_text'] = clean_data['review_text'].fillna('')

# Or drop rows with NaN values
clean_data = clean_data.dropna(subset=['review_text'])
X=clean_data['review_text']
y=clean_data['sentiments']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

print(X_train.shape, X_test.shape)

(15164,) (5055,)


## MLflow Integration

In [7]:
import mlflow

mlflow.set_experiment("Flipkart_Product_Reviews")

2024/03/25 16:15:12 INFO mlflow.tracking.fluent: Experiment with name 'Flipkart_Product_Reviews' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/swast/Documents/Innomatics_intern/task_8/mlruns/830375497645841527', creation_time=1711363512746, experiment_id='830375497645841527', last_update_time=1711363512746, lifecycle_stage='active', name='Flipkart_Product_Reviews', tags={}>

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Define pipelines for each classifier
pipelines = {
    'knn': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', KNeighborsClassifier())
    ]),
    'svc': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', SVC())
    ]),
    'logistic_regression': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', RandomForestClassifier())
    ]),
    'decision_tree': Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ])
}

# Define parameter grid for each algorithm
param_grids = {
    'knn': [
        {
            'tfidf__max_features': [1000, 5000, None],
            'classifier__n_neighbors': [3, 5, 7]
        }
    ],
    'svc': [
        {
            'tfidf__max_features': [1000, 5000, None],
            'classifier__kernel': ['linear', 'rbf'],
            'classifier__C': [0.1, 1, 10]
        }
    ],
    'logistic_regression': [
        {
            'tfidf__max_features': [1000, 5000, None],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l1', 'l2']
        }
    ],
    'random_forest': [
        {
            'tfidf__max_features': [1000, 5000, None],
            'classifier__n_estimators': [50, 100, 200]
        }
    ],
    'decision_tree': [
        {
            'tfidf__max_features': [1000, 5000, None],
            'classifier__max_depth': [None, 5, 10]
        }
    ]
}


In [9]:
import mlflow
from sklearn.model_selection import GridSearchCV

best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    mlflow.sklearn.autolog(max_tuning_runs=None)
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
       
    
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test, y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()


********** knn **********




Fitting 5 folds for each of 9 candidates, totalling 45 fits
CPU times: total: 38min 58s
Wall time: 4min 12s
Train Score:  0.9023331958831047




Test Score:  0.8864490603363007

********** svc **********
Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: total: 6min 13s
Wall time: 6min 22s
Train Score:  0.9726326807927224




Test Score:  0.9728981206726014

********** logistic_regression **********
Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: total: 24.6 s
Wall time: 31.6 s
Train Score:  0.9696649955154688
Test Score:  0.970919881305638

********** random_forest **********




Fitting 5 folds for each of 9 candidates, totalling 45 fits
CPU times: total: 4min 14s
Wall time: 4min 22s
Train Score:  0.9672249494783417
Test Score:  0.9647873392680514

********** decision_tree **********




Fitting 5 folds for each of 9 candidates, totalling 45 fits
CPU times: total: 21.3 s
Wall time: 27.5 s
Train Score:  0.964653042565204
Test Score:  0.9641938674579624



## MLflow dashboard

![image.png](attachment:b38442ff-3860-4c29-8894-52e624522f71.png)

## mlflow model metrics

![image.png](attachment:89066a32-f56c-42c5-b70e-3bb326a23392.png)

## svm hyperparameter plots
![image.png](attachment:5728e6a0-215e-4646-93c3-3f544f37c4bb.png)

## register models and manage by tagging them


![image.png](attachment:cb611acc-80d7-4f97-96ae-f118b636a808.png)

# Build a Prefect Workflow and Auto Schedule it

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

In [20]:
def load_data(file_path):
    """
    Load data from a CSV file.
    """
    return pd.read_csv(file_path)

In [28]:
def split_inputs_output(data, inputs, output):
    """
    Split features and target variables.
    """
    data[inputs] = data[inputs].fillna('')

    # Or drop rows with NaN values
    data = data.dropna(subset=[inputs])
    X = data[inputs]
    y = data[output]
    return X, y

In [29]:
def split_train_test(X, y, test_size=0.25, random_state=0):
    """
    Split data into train and test sets.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_data(X_train, X_test, y_train, y_test):
    """
    Apply TF-IDF vectorization to the text data.
    """
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit TF-IDF vectorizer on training data and transform both training and testing data
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    return X_train_tfidf, X_test_tfidf, y_train, y_test


In [31]:
def train_model(X_train_scaled, y_train, hyperparameters):
    """
    Training the machine learning model.
    """
    clf = KNeighborsClassifier(**hyperparameters)
    clf.fit(X_train_scaled, y_train)
    return clf

In [32]:
def evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Evaluating the model.
    """
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score

In [33]:
def workflow(data_path):
    DATA_PATH = data_path
    INPUTS = 'review_text'
    OUTPUT = 'sentiments'
    HYPERPARAMETERS = {'n_neighbors': 3, 'p': 2}
    
    # Load data
    iris = load_data(DATA_PATH)

    # Identify Inputs and Output
    X, y = split_inputs_output(iris, INPUTS, OUTPUT)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Preprocess the data
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_model(X_train_scaled, y_train, HYPERPARAMETERS)
    
    # Evaluation
    train_score, test_score = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
    
    print("Train Score:", train_score)
    print("Test Score:", test_score)

In [34]:
if __name__ == "__main__":
    workflow(data_path="clean_data.csv")

2024/03/25 16:56:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd4aa41cd347a4a4d9d96275b42e7c599', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Train Score: 0.9107095753099446
Test Score: 0.8896142433234422


In [35]:
from prefect import task, flow

In [36]:
@task
def load_data(file_path):
    """
    Load data from a CSV file.
    """
    return pd.read_csv(file_path)


@task
def split_inputs_output(data, inputs, output):
    """
    Split features and target variables.
    """
    data[inputs] = data[inputs].fillna('')

    # Or drop rows with NaN values
    data = data.dropna(subset=[inputs])
    X = data[inputs]
    y = data[output]
    return X, y
	

@task
def split_train_test(X, y, test_size=0.25, random_state=0):
    """
    Split data into train and test sets.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
	
	
@task
def preprocess_data(X_train, X_test, y_train, y_test):
    """
    Apply TF-IDF vectorization to the text data.
    """
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit TF-IDF vectorizer on training data and transform both training and testing data
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    return X_train_tfidf, X_test_tfidf, y_train, y_test

@task
def train_model(X_train_scaled, y_train, hyperparameters):
    """
    Training the machine learning model.
    """
    clf = KNeighborsClassifier(**hyperparameters)
    clf.fit(X_train_scaled, y_train)
    return clf
	

@task
def evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Evaluating the model.
    """
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score

In [37]:
# Workflow

@flow(name="KNN Training Flow")
def workflow():
    DATA_PATH = "clean_data.csv"
    INPUTS = 'review_text'
    OUTPUT = 'sentiments'
    HYPERPARAMETERS = {'n_neighbors': 3, 'p': 2}
    
    # Load data
    iris = load_data(DATA_PATH)

    # Identify Inputs and Output
    X, y = split_inputs_output(iris, INPUTS, OUTPUT)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Preprocess the data
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_model(X_train_scaled, y_train, HYPERPARAMETERS)
    
    # Evaluation
    train_score, test_score = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
    
    print("Train Score:", train_score)
    print("Test Score:", test_score)

In [38]:
if __name__ == "__main__":
    workflow()

2024/03/25 16:59:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cf1ffe6bac2d4a2db2d252d3592e5523', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Train Score: 0.9107095753099446
Test Score: 0.8896142433234422


In [39]:
if __name__ == "__main__":
    workflow.serve(
        name="my-first-deployment",
        cron="* * * * *"
    )

![image.png](attachment:ac76fc19-edb8-4322-a7bf-776cf33574c4.png)

![image.png](attachment:69280c6f-aa5b-40d6-859d-e4c3c44df8f4.png)