# Flipkart Reviews Sentiment Analysis using MLFlow and PREFECT

# Loading the Data


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the raw Flipkart badminton-review dump.
REVIEWS_CSV = "/Users/rachusarang/Downloads/ILR/reviews_data_dump/reviews_badminton/data.csv"

# Read the whole dump into memory.
data_df = pd.read_csv(REVIEWS_CSV)

# Show the first rows as a quick sanity check of the load.
data_df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [3]:
# (rows, columns) of the freshly loaded frame.
data_df.shape

(8518, 8)

In [4]:
# Normalise the headers: trim surrounding whitespace and swap spaces for
# underscores so each column name is a single token.
col_names = [header.strip().replace(' ', '_') for header in data_df.columns]
data_df.columns = col_names

# Display the renamed headers.
data_df.columns


Index(['Reviewer_Name', 'Review_Title', 'Place_of_Review', 'Up_Votes',
       'Down_Votes', 'Month', 'Review_text', 'Ratings'],
      dtype='object')

In [5]:
# Frequency of each distinct review title, most common first.
data_df['Review_Title'].value_counts()

Wonderful                                       416
Brilliant                                       303
Classy product                                  299
Excellent                                       298
Perfect product!                                295
                                               ... 
Great shuttle but wised if it’s more durable      1
Better game play experience                       1
awesome shuttle                                   1
Worst experience with Flipkart.                   1
For Mavis350                                      1
Name: Review_Title, Length: 194, dtype: int64

In [6]:
# Treat empty strings as missing values, then drop every row that has a
# missing value in any column.
data_df = data_df.replace('', np.nan).dropna()

# Identify X and y

In [7]:
# Model input: the review text column.
X = data_df['Review_text']

In [8]:
# Binary sentiment target derived from the star rating.
conditions = [(data_df['Ratings'] >= 4),  # Positive sentiment
              (data_df['Ratings'] <= 3),  # Negative sentiment
             ]
values = [1, 0]  # Labels for positive(1) and negative(0) sentiments

# np.select returns a bare array. Reuse data_df's index so y stays
# label-aligned with X: rows were dropped earlier, so a fresh 0..n-1 index
# would not match X's index labels in any index-aligned pandas operation.
y = pd.Series(np.select(conditions, values), index=data_df.index)

In [9]:
# Share of each class in the target (1 = positive, 0 = negative).
y.value_counts(normalize=True)

1    0.80644
0    0.19356
dtype: float64

# Split X and y

In [10]:
# Split the data: hold out 20% of the reviews as the test set.

from sklearn.model_selection import train_test_split

# random_state is fixed so the split is the same on every run.
# NOTE(review): the split is not stratified although the classes are roughly
# 80/20 — confirm whether stratify=y was intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()

250     Product is good as like as bought in the open ...
7735                                  Love it...READ MORE
2805                                        GoodREAD MORE
4914                                       superREAD MORE
1539                                   excellentREAD MORE
Name: Review_text, dtype: object

In [11]:
# Features and labels must have the same length within each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(6410,) (6410,)
(1603,) (1603,)


# Preprocessing

In [12]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:
lemmatizer = WordNetLemmatizer()

# Build the English stop-word set once; clean() runs once per document, so
# rebuilding the set inside the function repeats the same work every call.
STOP_WORDS = set(stopwords.words('english'))


def clean(doc):
    """Normalise one review string for bag-of-words vectorisation.

    Steps, in order: strip HTML tags, replace every non-letter character
    with a space, remove the "READ MORE" marker, lower-case, tokenise,
    lemmatise and drop English stop words.

    Parameters
    ----------
    doc : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Remove HTML tags first: the character filter below deletes '<' and
    # '>', so running it beforehand means this pattern can never match and
    # tag names would leak into the text as ordinary words.
    doc = re.sub(r'<.*?>', ' ', doc)

    # Replace special characters, punctuation and digits with spaces.
    doc = re.sub(r'[^a-zA-Z\s]', ' ', doc)

    # Remove the 'READ MORE' marker (done before lower-casing so the
    # upper-case literal still matches).
    doc = doc.replace("READ MORE", " ")

    # Change the sentence to lower case.
    doc = doc.lower()

    # Tokenization
    tokens = nltk.word_tokenize(doc)

    # Lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stop word removal (tokens are already lower-case at this point).
    filtered_tokens = [word for word in lemmatized_tokens if word not in STOP_WORDS]

    # Join and return
    return " ".join(filtered_tokens)

In [14]:
# CountVectorizer builds the bag-of-words features.
from sklearn.feature_extraction.text import CountVectorizer

# Every document is passed through clean() as the vectorizer's preprocessor.
vect = CountVectorizer(preprocessor=clean)

# Learn the vocabulary from the training reviews only and build their
# document-term matrix; %time reports how long that takes.
%time X_train_dtm = vect.fit_transform(X_train)

print(X_train_dtm.shape)

CPU times: user 1.8 s, sys: 87.4 ms, total: 1.89 s
Wall time: 1.9 s
(6410, 2086)


In [15]:
# Transform the test reviews with the vocabulary learned from the training
# data (transform only — the vectorizer is not refitted here).
X_test_dtm = vect.transform(X_test)

print(X_test_dtm.shape)

(1603, 2086)


# Running the Experiment


In [16]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [17]:
import warnings

# Silence every warning from this point on.
# NOTE(review): this blanket filter would also hide any convergence or
# deprecation warnings raised by the model fits below — confirm that is
# intended.
warnings.filterwarnings('ignore')

In [18]:
#pip install mlflow

In [None]:
# mlflow is imported here because this cell comes before the cell that
# imports it; without the import the call below raises NameError on a
# fresh kernel.
import mlflow

# Send runs to the MLflow tracking server listening on localhost:5000.
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [21]:
import mlflow

# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment("Flipkart_Reviews_sentiment_Analysis")

2024/03/26 20:52:12 INFO mlflow.tracking.fluent: Experiment with name 'Flipkart_Reviews_sentiment_Analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/518147944355896295', creation_time=1711466532436, experiment_id='518147944355896295', last_update_time=1711466532436, lifecycle_stage='active', name='Flipkart_Reviews_sentiment_Analysis', tags={}>

In [22]:
# Candidate values for the vocabulary size and the Naive Bayes smoothing.
MAX_FEATURES = [1000, 1500, 2000]
ALPHA = [1, 10]

# Bag-of-words features feeding a multinomial Naive Bayes classifier.
vectorization_step = ('vectorization', CountVectorizer())
nb_step = ('nb', MultinomialNB())
pipe = Pipeline([vectorization_step, nb_step])

# Grid keys use the "<step name>__<parameter name>" format; clean() is
# plugged in as the vectorizer's preprocessor.
parameter_grid = [
    {
        'vectorization__preprocessor': [clean],
        'vectorization__max_features': MAX_FEATURES,
        'nb__alpha': ALPHA,
    }
]



In [23]:
clf = GridSearchCV(
    estimator=pipe, 
    param_grid=parameter_grid, 
    scoring='f1',
    cv=5,
    return_train_score=True,
    verbose=1
)

%time clf.fit(X_train, y_train)

print("Best estimator found on train set")
print(clf.best_estimator_)
print()

print('Score on Test Data: ', clf.score(X_test, y_test))

# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time clf.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 51.6 s, sys: 2.92 s, total: 54.6 s
Wall time: 55.2 s
Best estimator found on train set
Pipeline(steps=[('vectorization',
                 CountVectorizer(max_features=2000,
                                 preprocessor=<function clean at 0x7f95a3f364c0>)),
                ('nb', MultinomialNB(alpha=1))])

Score on Test Data:  0.9334819769602378




Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 59.2 s, sys: 3.64 s, total: 1min 2s
Wall time: 1min 8s


In [27]:
import joblib
from joblib import Memory

import os

In [24]:
# Improving the efficiency by applying cleaning the text data before hand

%time X_train_clean = X_train.apply(lambda doc: clean(doc))

CPU times: user 946 ms, sys: 71.2 ms, total: 1.02 s
Wall time: 1.12 s


In [25]:
%time X_test_clean = X_test.apply(lambda doc: clean(doc))


CPU times: user 238 ms, sys: 17.1 ms, total: 255 ms
Wall time: 272 ms


In [28]:
# Define a memory object to cache intermediate results
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

# Define the pipeline with caching
pipe = Pipeline(
    [
        ('vectorization', CountVectorizer()),
        ('nb', MultinomialNB())
    ], 
    memory=memory
)

MAX_FEATURES = [1000, 1500, 2000]
ALPHA = [1, 10]

# Observe the Key Value Pair format
parameter_grid = [
    {
        'vectorization__max_features': MAX_FEATURES,
        'nb__alpha': ALPHA
    }
]

clf = GridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring='f1',
    cv=5,
    return_train_score=True,
    verbose=1
)

%time clf.fit(X_train_clean, y_train)

print("Best estimator found on train set")
print(clf.best_estimator_)
print()

print('Score on Test Data: ', clf.score(X_test_clean, y_test))

2024/03/26 20:56:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '63890af00c7e4ea891f14374f736e0e3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 2.1 s, sys: 108 ms, total: 2.2 s
Wall time: 6.54 s
Best estimator found on train set
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer(max_features=2000)),
                ('nb', MultinomialNB(alpha=1))])

Score on Test Data:  0.9334819769602378


# Tool - MLFlow


MLFlow helps to organize your experiments into runs.

## MLFlow keeps track of - 

* Tags
* Parameters
* Metrics
* Models
* Artifacts
* Source code, Start and End Time, Authors etc..

# Auto Logging All Experiment Runs using MLFlow


In [53]:
#import joblib
#from joblib import Memory

#import os

In [29]:
# On-disk joblib cache shared by every pipeline below.
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)


def _build_text_pipeline(classifier):
    """Return a cached CountVectorizer -> `classifier` pipeline."""
    return Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', classifier)
    ], memory=memory)


# One pipeline per algorithm; the 'vectorization' step itself is swapped by
# the parameter grids below where more than one vectorizer is listed.
pipelines = {
    'naive_bayes': _build_text_pipeline(MultinomialNB()),
    'decision_tree': _build_text_pipeline(DecisionTreeClassifier()),
    'logistic_regression': _build_text_pipeline(LogisticRegression()),
    'svc': _build_text_pipeline(SVC()),
}

# Parameter grid for each algorithm, keyed like `pipelines`.
# Keys use the "<step name>__<parameter name>" format.
param_grids = {}

param_grids['naive_bayes'] = [
    {
        'vectorization': [CountVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__alpha': [1, 10],
    }
]

param_grids['decision_tree'] = [
    {
        'vectorization': [CountVectorizer(), TfidfVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__max_depth': [None, 5, 10],
    }
]

param_grids['logistic_regression'] = [
    {
        'vectorization': [CountVectorizer(), TfidfVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['elasticnet'],
        'classifier__l1_ratio': [0.4, 0.5, 0.6],
        'classifier__solver': ['saga'],
        'classifier__class_weight': ['balanced'],
    }
]

param_grids['svc'] = [
    {
        'vectorization': [CountVectorizer(), TfidfVectorizer()],
        'vectorization__max_features': [1000, 1500, 2000, 5000],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
]



In [30]:
# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='f1', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test, y_test))
        
    best_models[algo] = grid_search.best_estimator_
   



********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits




CPU times: user 3.3 s, sys: 118 ms, total: 3.41 s
Wall time: 7.24 s
Train Score:  0.9286989633932802
Test Score:  0.9356287425149701
********** decision_tree **********
Fitting 5 folds for each of 24 candidates, totalling 120 fits




CPU times: user 14.8 s, sys: 176 ms, total: 14.9 s
Wall time: 18.8 s
Train Score:  0.9333582776997875
Test Score:  0.9291044776119403
********** logistic_regression **********
Fitting 5 folds for each of 72 candidates, totalling 360 fits




CPU times: user 12min 10s, sys: 2.82 s, total: 12min 13s
Wall time: 14min 53s
Train Score:  0.9332957892990097
Test Score:  0.9367481567714396
********** svc **********
Fitting 5 folds for each of 96 candidates, totalling 480 fits
CPU times: user 8min 41s, sys: 1.59 s, total: 8min 42s
Wall time: 8min 50s
Train Score:  0.9426915961922535
Test Score:  0.9466966966966966


In [31]:
# Show the winning pipeline (steps and hyperparameters) for each algorithm.
for name, model in best_models.items():
    print(name)
    print(model)
    print()

naive_bayes
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer(max_features=5000)),
                ('classifier', MultinomialNB(alpha=1))])

decision_tree
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer(max_features=1500)),
                ('classifier', DecisionTreeClassifier(max_depth=10))])

logistic_regression
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', CountVectorizer(max_features=5000)),
                ('classifier',
                 LogisticRegression(C=1, class_weight='balanced', l1_ratio=0.5,
                                    penalty='elasticnet', solver='saga'))])

svc
Pipeline(memory=Memory(location=.cache/joblib),
         steps=[('vectorization', TfidfVectorizer(max_features=2000)),
                ('classifier', SVC(C=1))])



In [33]:
from sklearn import metrics

In [34]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    
    joblib.dump(model, f'/Users/rachusarang/Downloads/ILR/reviews_data_dump/Sentimentanalysis_badminton/{name}.pkl')
    model = joblib.load(f'/Users/rachusarang/Downloads/ILR/reviews_data_dump/Sentimentanalysis_badminton/{name}.pkl')
    
    %time y_test_pred = model.predict(X_test_clean)
    print("Test Score (F1)", metrics.f1_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'/Users/rachusarang/Downloads/ILR/reviews_data_dump/Sentimentanalysis_badminton/{name}.pkl'), "Bytes")

********** naive_bayes **********
CPU times: user 6.11 ms, sys: 185 µs, total: 6.29 ms
Wall time: 6.3 ms
Test Score (F1) 0.9312920089619119
Model Size: 179127 Bytes
********** decision_tree **********
CPU times: user 6.16 ms, sys: 184 µs, total: 6.34 ms
Wall time: 6.28 ms
Test Score (F1) 0.9190751445086706
Model Size: 81385 Bytes
********** logistic_regression **********
CPU times: user 5.72 ms, sys: 128 µs, total: 5.84 ms
Wall time: 5.85 ms
Test Score (F1) 0.919226393629124
Model Size: 109498 Bytes
********** svc **********
CPU times: user 145 ms, sys: 674 µs, total: 146 ms
Wall time: 146 ms
Test Score (F1) 0.9275045537340619
Model Size: 465562 Bytes


In [35]:
# Stop the auto logger so the manually logged runs that follow are not
# also recorded by scikit-learn autologging.

mlflow.sklearn.autolog(disable=True)

In [36]:
import time
import joblib
import os

# Custom Experiment Tracking and Database Integration with MLFlow


In [None]:
# Switch the tracking store to a local SQLite database file.
mlflow.set_tracking_uri("sqlite:///mlflow_1.db")

# Runs logged after this point are grouped under this experiment.
mlflow.set_experiment("Flipkart reviews sentiment analysis")

In [37]:
dev = "Rachana"
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )

    # NOTE(review): this loop fits and scores on the raw X_train / X_test,
    # not on the pre-cleaned text — confirm that is intended.

    # Fit (perf_counter is a monotonic clock meant for measuring durations)
    start_fit_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    fit_time = time.perf_counter() - start_fit_time

    # Predict
    start_predict_time = time.perf_counter()
    y_pred = grid_search.predict(X_test)
    predict_time = time.perf_counter() - start_predict_time

    # Score once and reuse the value for both the printout and MLflow.
    # best_score_ is the mean cross-validated accuracy of the best candidate.
    train_score = grid_search.best_score_
    test_score = grid_search.score(X_test, y_test)

    # Keep the winner: best_models is reset above and would otherwise stay
    # empty after this loop.
    best_models[algo] = grid_search.best_estimator_

    # Saving the best model
    model_path = f'/Users/rachusarang/Downloads/ILR/reviews_data_dump/Sentimentanalysis_badminton/{algo}.pkl'
    joblib.dump(grid_search.best_estimator_, model_path)
    model_size = os.path.getsize(model_path)

    # Print log
    print('Train Score: ', train_score)
    print('Test Score: ', test_score)
    print("Fit Time: ", fit_time)
    print("Predict Time: ", predict_time)
    print("Model Size: ", model_size)
    
    print()

    # Start the experiment run
    with mlflow.start_run() as run:
        # Log tags with mlflow.set_tag()
        mlflow.set_tag("developer", dev)

        # Log Parameters with mlflow.log_param()
        mlflow.log_param("algorithm", algo)
        mlflow.log_param("hyperparameter_grid", param_grids[algo])
        mlflow.log_param("best_hyperparameter", grid_search.best_params_)

        # Log Metrics with mlflow.log_metric()
        mlflow.log_metric("train_score", train_score)
        mlflow.log_metric("test_score", test_score)
        mlflow.log_metric("fit_time", fit_time)
        mlflow.log_metric("predict_time", predict_time)
        mlflow.log_metric("model_size", model_size)

        # Log Model using mlflow.sklearn.log_model()
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{algo}_model")

********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train Score:  0.8812792511700469
Test Score:  0.8920773549594511
Fit Time:  2.6011757850646973
Predict Time:  0.008444070816040039
Model Size:  87911

********** decision_tree **********
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Train Score:  0.8875195007800312
Test Score:  0.8814722395508422
Fit Time:  13.859179973602295
Predict Time:  0.010019063949584961
Model Size:  68873

********** logistic_regression **********
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train Score:  0.8931357254290171
Test Score:  0.8983156581409857
Fit Time:  739.7374629974365
Predict Time:  0.008857965469360352
Model Size:  109498

********** svc **********
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Train Score:  0.9042121684867395
Test Score:  0.9082969432314411
Fit Time:  523.3775610923767
Predict Time:  0.11654090881347656
Model Size:  346010



The MLflow Model Registry provides functionality for managing and versioning machine learning models and their associated metadata. It allows data scientists and machine learning engineers to track, share, and collaborate on models throughout their lifecycle, from experimentation to production deployment.

Key Features:

* Model Registration
* Model Versioning
* Stage Transitions
* Intra Team Collaboration

### Archived: These versions are no longer in active use.
### Staging: These versions are ready for deployment pending final validation.
### Production: These versions are actively serving users in live environments.

# Machine Learning Workflow Orchestration

Orchestration refers to the coordination and management of various tasks, resources, and processes involved in the end-to-end machine learning lifecycle. This includes:

1. Data Preparation and Management
2. Model Training
3. Experimentation and Evaluation
4. Model Deployment
5. Monitoring and Management
6. Automation of repetitive tasks

## Introducing Prefect
Prefect is an open-source orchestration and observability platform that empowers developers to build and scale resilient code quickly, turning their Python scripts into resilient, recurring workflows.

## Why Prefect?

* Python based open source tool
* Manage ML Pipelines
* Schedule and Monitor the flow
* Gives observability into failures
* Native dask integration for scaling (Dask is used for parallel computing)

# Refactoring the ML Workflow

In [1]:
from prefect import task, flow

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
#import string
#import re
#import nltk
##from nltk.tokenize import word_tokenize
#from nltk.corpus import stopwords
#from nltk.stem import WordNetLemmatizer

import joblib
from sklearn.naive_bayes import MultinomialNB
from joblib import Memory


import os

import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

from sklearn import metrics

In [3]:
@task
def load_data(file_path):
    """
    Load the reviews CSV and derive a binary sentiment label.

    Column names are stripped and their spaces replaced with underscores.
    Empty strings are treated as missing, and every row with a missing value
    is dropped *before* labelling, so a review without a rating is discarded
    instead of silently receiving the negative label. ``Ratings`` is then
    overwritten with 1 for ratings >= 4 and 0 otherwise.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame whose ``Ratings`` column holds the 0/1 label.
    """
    df = pd.read_csv(file_path)
    df.columns = [col.strip().replace(' ', '_') for col in df.columns]

    df = (
        df.replace('', np.nan)
          .dropna()
          # The comparison keeps the frame's index, so the label stays
          # aligned with the surviving rows.
          .assign(Ratings=lambda frame: (frame['Ratings'] >= 4).astype(int))
    )

    return df


@task
def input_output(data, input, output):
    """
    Select the feature and target columns from `data`.

    `input` and `output` are the column keys to pick; returns the pair
    (data[input], data[output]).
    """
    return data[input], data[output]


@task
def split_train_test(X, y, test_size=0.25, random_state=42):
    """
    Partition X and y into train and test subsets.

    Thin wrapper around sklearn's train_test_split; its return value is
    passed back unchanged.
    """
    splits = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return splits


@task
def train_model(X_train, y_train, **hyperparameters):
    """
    Grid-search a CountVectorizer + MultinomialNB pipeline.

    The keyword arguments are passed to GridSearchCV as its parameter grid.
    The search uses 4-fold cross-validation scored by F1 and is returned
    after fitting.
    """
    # joblib cache handed to the pipeline via its `memory` argument.
    step_cache = Memory(location='.cache', verbose=0)

    steps = [
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ]

    search = GridSearchCV(
        estimator=Pipeline(steps, memory=step_cache),
        param_grid=hyperparameters,
        scoring='f1',
        cv=4,
        return_train_score=True,
        verbose=1,
    )
    search.fit(X_train, y_train)
    return search


@task
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Score a fitted model on the train and test splits.

    Returns the pair (train F1, test F1), each computed from the model's
    predictions on the corresponding split.
    """
    train_score = metrics.f1_score(y_train, model.predict(X_train))
    test_score = metrics.f1_score(y_test, model.predict(X_test))
    return train_score, test_score

In [4]:
#X = data_df['Review_text']

#y = pd.Series(np.select([(data_df['Ratings'] >= 4), (data_df['Ratings'] <= 3)], [1, 0]))


In [4]:
# Workflow

@flow(name="Multinomial Naive Bayes Training")
def workflow():
    """
    End-to-end training flow: load -> select columns -> split -> train -> evaluate.

    Prints the train and test scores returned by evaluate_model.
    """
    data_path = "/Users/rachusarang/Downloads/ILR/reviews_data_dump/reviews_badminton/data.csv"
    feature_column = 'Review_text'
    target_column = 'Ratings'

    # Single-candidate grid: one vectorizer, one vocabulary size, one alpha.
    search_space = {
        'vectorization': [CountVectorizer()],
        'vectorization__max_features': [5000],
        'classifier__alpha': [1],
    }

    # Load and label the reviews.
    reviews = load_data(data_path)

    # Pick the input text and the target label.
    X, y = input_output(reviews, feature_column, target_column)

    # Hold out a test set using the task's default split settings.
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Fit the grid search.
    model = train_model(X_train, y_train, **search_space)

    # Score on both splits.
    train_score, test_score = evaluate_model(model, X_train, y_train, X_test, y_test)

    print("Train Score:", train_score)
    print("Test Score:", test_score)
    

In [5]:
# Run the flow only when this code is executed directly.
if __name__ == "__main__":
    
    workflow()

Fitting 4 folds for each of 1 candidates, totalling 4 fits


Train Score: 0.9410821643286573
Test Score: 0.9363989250522544
