In [2]:
#importing important libraries
import pandas as pd
import numpy as np 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,f1_score
import joblib

In [3]:
#import warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

In [4]:
#loading the dataset
df = pd.read_excel('sample_data.xlsx')
df.head()

Unnamed: 0,Income,LoanAmount,CreditScore,InterestRate,Target
0,85994,50587,520,15.23,0
1,50432,124440,458,4.81,0
2,84208,129188,451,21.17,1
3,31713,44799,743,7.07,0
4,20437,9139,633,6.51,0


In [5]:
#Seperate Target and independent variables
X = df.drop(columns=['Target'])
y = df['Target']

In [6]:
#Seperate numeric and categorical columns

numeric_features = X.select_dtypes(include=['int64','float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [8]:
#Create a transformer for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))])

preprocessor= ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)])


In [12]:
#Create a pipeline with a Naive Bayes model(Gaussian NB)
base_pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',GaussianNB())
])

In [13]:
#Perform train test split
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state=42, stratify=y)

In [14]:
#Train and evaluate the model
base_pipeline.fit(X_train,y_train)

In [17]:
#Predictions and evaluation metrics
print("Eval Metrics For Base Model")
y_train_pred = base_pipeline.predict(X_train)
y_test_pred = base_pipeline.predict(X_test)
def evaluate_model(y_true, y_pred, data_type='Train'):
    print(f"Evaluation Metrics for {data_type} data:")
    print(f"Accuracy: {accuracy_score(y_true,y_pred)}")
    print(f"F1 Score: {f1_score(y_true, y_pred)}")
    print("Classification Report:")
    print(classification_report(y_true,y_pred))
    print("Confusion Matric:")
    print(confusion_matrix(y_true, y_pred))
    print("\n")

evaluate_model(y_train,y_train_pred,"Train")
evaluate_model(y_test,y_test_pred,"Test")
    

Eval Metrics For Base Model
Evaluation Metrics for Train data:
Accuracy: 0.8975
F1 Score: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       359
           1       0.00      0.00      0.00        41

    accuracy                           0.90       400
   macro avg       0.45      0.50      0.47       400
weighted avg       0.81      0.90      0.85       400

Confusion Matric:
[[359   0]
 [ 41   0]]


Evaluation Metrics for Test data:
Accuracy: 0.9
F1 Score: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        90
           1       0.00      0.00      0.00        10

    accuracy                           0.90       100
   macro avg       0.45      0.50      0.47       100
weighted avg       0.81      0.90      0.85       100

Confusion Matric:
[[90  0]
 [10  0]]




In [19]:
#Hyperparameter tuning
param_grid={
    'classifier__var_smoothing':np.logspace(0,-9,num=100)
}
grid_search = GridSearchCV(base_pipeline, param_grid,cv=5, n_jobs=-1,scoring='accuracy')
grid_search.fit(X_train,y_train)
print(f"Best Hyperparameters : {grid_search.best_params_}")

Best Hyperparameters : {'classifier__var_smoothing': 1.0}


In [21]:
#Build and evaluate the ebst model
print("")
print("-----EVAL METRICS FOR BEST MODEL-----")
print("")

best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

evaluate_model(y_train_best_pred,y_train,"Train(Best Model)")
evaluate_model(y_test_best_pred,y_test,"Test(Best Model)")



-----EVAL METRICS FOR BEST MODEL-----

Evaluation Metrics for Train(Best Model) data:
Accuracy: 0.8975
F1 Score: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       400
           1       0.00      0.00      0.00         0

    accuracy                           0.90       400
   macro avg       0.50      0.45      0.47       400
weighted avg       1.00      0.90      0.95       400

Confusion Matric:
[[359  41]
 [  0   0]]


Evaluation Metrics for Test(Best Model) data:
Accuracy: 0.9
F1 Score: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       100
           1       0.00      0.00      0.00         0

    accuracy                           0.90       100
   macro avg       0.50      0.45      0.47       100
weighted avg       1.00      0.90      0.95       100

Confusion Matric:
[[90 10]
 [ 0  0]]




In [23]:
#Compare metrics

metrics_comparison = pd.DataFrame({
    'Metric':['Accuracy','F1_Score'],
    'Base_ModelTrain':[accuracy_score(y_train,y_train_pred),f1_score(y_train,y_train_pred)],
    'Base_ModelTest':[accuracy_score(y_test,y_test_pred),f1_score(y_test,y_test_pred)],
    'Best_ModelTrain':[accuracy_score(y_train,y_train_best_pred),f1_score(y_train,y_train_best_pred)],
    'Best_ModelTest':[accuracy_score(y_test,y_test_best_pred),f1_score(y_test,y_test_best_pred)]
})
print(metrics_comparison)

     Metric  Base_ModelTrain  Base_ModelTest  Best_ModelTrain  Best_ModelTest
0  Accuracy           0.8975             0.9           0.8975             0.9
1  F1_Score           0.0000             0.0           0.0000             0.0


In [24]:
#Train the final model
final_model = best_pipeline.fit(X,y)

In [25]:
#Save the final model
joblib.dump(final_model,'final_naive_bayes_model.pkl')


['final_naive_bayes_model.pkl']

In [26]:
print("")
print("Final model saved as 'final_naive_bayes_model.pkl'")


Final model saved as 'final_naive_bayes_model.pkl'
