# Libs

In [0]:
import pandas as pd
import numpy as np
import math
from warnings import filterwarnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score,precision_score,recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
import mlflow
import datetime
filterwarnings('ignore')

# Read Data from Feature Store

In [0]:
# Pull the full sentiment-analysis feature table through Spark SQL and
# materialize it as a pandas DataFrame for the steps below.
feature_query = "select * from feature_store.customer_sentiment_analysis"
data = spark.sql(feature_query).toPandas()

# Checking Class Imbalance

In [0]:
# Bar chart of the number of rows per Sentiment class in the raw data.
sentiment_distribution = data.value_counts('Sentiment').reset_index()
sentiment_distribution.plot.bar(x='Sentiment')

# Class imbalance exists, let's treat it

In [0]:
# Upsampling
# Step 1 replicates every class enough times to reach at least the size of
# the largest class; step 2 trims each class back to exactly that size so
# all classes end up with the same number of rows.
sentiment_counts = data.Sentiment.value_counts()
max_count = sentiment_counts.max()
max_category = sentiment_counts.sort_values(ascending=False).index[0]

# Upsampling the minority categories
# Collect the replicated slices in a list and concatenate once: calling
# pd.concat inside the loop re-copies the accumulated frame on every pass.
replicated_parts = []
# dropna(): a missing label would match no rows and produce a zero count,
# which breaks the division below.
for sentiment_category in data.Sentiment.dropna().unique():
    category_rows = data[data['Sentiment'] == sentiment_category]
    n_count = len(category_rows)
    multiplyfactor = int(math.ceil(max_count / n_count))
    print('category ', sentiment_category)
    print('count :', n_count)
    print('multiply_fac', multiplyfactor)
    replicated_parts.extend([category_rows] * multiplyfactor)
up_sampled_dataframe = pd.concat(replicated_parts)

# set sample size to maximum sample size
sample_size = max_count
balanced_parts = []
for i in up_sampled_dataframe.Sentiment.unique():
    # Each class now holds >= sample_size rows, so sampling without
    # replacement always succeeds. The draw is unseeded, as before.
    X = up_sampled_dataframe[up_sampled_dataframe.Sentiment == i].sample(sample_size)
    balanced_parts.append(X)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_equal_overall = pd.concat(balanced_parts)

# Checking class imbalance now

In [0]:
# Bar chart of the number of rows per Sentiment class after balancing.
balanced_distribution = df_equal_overall.value_counts('Sentiment').reset_index()
balanced_distribution.plot.bar(x='Sentiment')

# Class imbalance doesn't exist

# Train Test Split

In [0]:
# Drop rows with missing values, then hold out a stratified 20% test set
# with a fixed seed so the split is repeatable.
# NOTE(review): the split is taken from `data`, not from the balanced
# `df_equal_overall` frame built earlier - confirm this is intended.
data = data.dropna()
x_data, y_data = data['combined_cleaned_lemmatized'], data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=44,
)

# Baseline Models

In [0]:
def metric_for_model(actual,pred):
    """Evaluate predicted labels against the actual labels.

    Args:
        actual: The ground-truth labels.
        pred: The predicted labels, aligned with ``actual``.

    Returns:
        A 6-tuple ``(accuracy, confusion_matrix, classification_report,
        f1, precision, recall)``; the last three use weighted averaging.
    """
    overall_accuracy = accuracy_score(actual, pred)
    matrix = confusion_matrix(actual, pred)
    report = classification_report(actual, pred)
    # F1, precision and recall share the same call shape, so compute them
    # in one pass, in the order the returned tuple expects.
    weighted_scores = [
        scorer(actual, pred, average='weighted')
        for scorer in (f1_score, precision_score, recall_score)
    ]
    return (overall_accuracy, matrix, report, *weighted_scores)

In [0]:
# Base Model
# Trains a bag-of-words (1-2 grams) -> TF-IDF -> Multinomial Naive Bayes
# pipeline, prints its test-set metrics, and records them in an MLflow run.
model_name='Naive Bayes'
mlflow.set_experiment(experiment_id="2555925875978836")
with mlflow.start_run() as run:
    current_datetime = datetime.datetime.now()
    model = Pipeline([('vectorize',CountVectorizer(ngram_range=(1,2))),
                      ('tfidf',TfidfTransformer()),
                      ('clf',MultinomialNB())])
    model.fit(X_train,y_train)
    y_pred_model = model.predict(X_test)
    (as1,cm1,cr1,f1s1,ps1,rs1)=metric_for_model(y_test,y_pred_model)

    # Console summary of the evaluation.
    print(f'{model_name} Model ')
    print('Accuracy Score : %s' %as1)
    print('Precision Score : %s' %ps1)
    print('Recall Score : %s' %rs1)
    print('F1 Score : %s' %f1s1)
    print('Confusion Matrix : %s' %cm1)
    print('Classification Report : %s' %cr1)

    # Scalar scores are logged as metrics; the confusion matrix and the
    # classification report are not scalars, so they are stored as params.
    mlflow.log_param('Model',f"{model_name}")
    mlflow.log_param('Params',f"{model_name}")
    mlflow.log_metric('Accuracy Score',as1)
    mlflow.log_metric('Precision Score',ps1)
    mlflow.log_metric('Recall Score',rs1)
    mlflow.log_metric('F1 Score',f1s1)
    mlflow.log_param('Confusion Matrix',cm1)
    mlflow.log_param('Classification Report',cr1)

    # Persisting the model stays best-effort (a failure must not abort the
    # run), but the failure is now reported instead of silently discarded,
    # and KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        mlflow.sklearn.log_model(model,f'{model_name}')
        modelpath=f'/dbfs/group1_mlops/model/{model_name}/'
        mlflow.sklearn.save_model(model,modelpath)
    except Exception as exc:
        print(f'Could not log/save {model_name} model: {exc!r}')
    else:
        # Only remember the run once the model was logged and saved.
        best_run=run.info