In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

# Read the preprocessed loan data and dummy-encode it; drop_first=True drops
# the first level of every encoded column.
TARGET_COLUMN = 'Loan_Status_Non-Default'
dataset1 = pd.read_csv("Preprocessed_Loan_Default.csv", index_col=None)
df2 = pd.get_dummies(dataset1.copy(), drop_first=True)

# Separate the target column from the remaining (feature) columns
dep_Y = df2[TARGET_COLUMN]
indep_X = df2.drop(TARGET_COLUMN, axis='columns')

# Function to split and scale the data
def split_scalar(indep_X, dep_Y, test_size=0.25, random_state=0, stratify=None):
    """Split the data into train/test parts and standardize the features.

    The scaler is fitted on the training features only and then applied to
    the test features, so no test-set statistics leak into training.

    Parameters
    ----------
    indep_X : feature matrix accepted by ``train_test_split``.
    dep_Y : target values aligned with ``indep_X``.
    test_size : forwarded to ``train_test_split``; the default (0.25) keeps
        the split this function always used.
    random_state : forwarded to ``train_test_split``; the default (0) keeps
        the original, reproducible shuffle.
    stratify : forwarded to ``train_test_split``. Pass ``dep_Y`` to keep the
        class proportions equal in both parts (useful for imbalanced
        targets). The default (None) preserves the original behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sc)`` where the two ``X`` values
        are the scaled arrays and ``sc`` is the fitted ``StandardScaler``
        (returned so it can be saved and reused at prediction time).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X,
        dep_Y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    # Only transform here: the test part must be scaled with training statistics.
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test, sc

# Function to evaluate model performance
def cm_prediction(model, X_test, y_test):
    """Print evaluation metrics for ``model`` on a test set and return it.

    Predictions are obtained with ``model.predict(X_test)`` and compared to
    ``y_test``. The confusion matrix, the classification report and the
    accuracy score are printed, in that order. The model passed in is
    returned as-is.
    """
    predictions = model.predict(X_test)

    # Each section is a heading followed by the metric computed on the predictions.
    sections = (
        ("Confusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

    print("Accuracy Score:", accuracy_score(y_test, predictions))
    return model

# Split and scale the data, then fit a Gaussian Naive Bayes classifier on the training part
X_train, X_test, y_train, y_test, scaler = split_scalar(indep_X, dep_Y)
nb_model = GaussianNB().fit(X_train, y_train)

# Print the evaluation metrics for the held-out part
# (cm_prediction hands back the model it was given)
nb_model = cm_prediction(nb_model, X_test, y_test)

# Persist the fitted model and the fitted scaler with pickle
for artifact_path, artifact in (
    ('naive_bayes_model.pkl', nb_model),
    ('scaler.pkl', scaler),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)


Confusion Matrix:
[[   0  501]
 [   0 1999]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.80      1.00      0.89      1999

    accuracy                           0.80      2500
   macro avg       0.40      0.50      0.44      2500
weighted avg       0.64      0.80      0.71      2500

Accuracy Score: 0.7996


  'precision', 'predicted', average, warn_for)


In [30]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Class-membership probabilities for the test set; column 1 is used as the
# positive-class score (presumably the Non-Default label — confirm against
# nb_model.classes_).
test_proba = nb_model.predict_proba(X_test)
y_proba = test_proba[:, 1]

# Area under the ROC curve for those scores
auc_score = roc_auc_score(y_test, y_proba)
print("AUC-ROC Score:", auc_score)

AUC-ROC Score: 0.5028422394830149


In [31]:
import pandas as pd
import pickle

# Load the trained model and scaler.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# artifact files that come from a trusted source.
with open('naive_bayes_model.pkl', 'rb') as f:
    nb_model = pickle.load(f)

with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# Define user input data with correct feature count (14 columns).
# NOTE(review): 'Other_Feature_1' / 'Other_Feature_2' look like placeholders —
# replace them with the real column names (and order) used at training time.
user_data = pd.DataFrame([[2, 3, 1, 4, 5, 6, 2, 3, 1, 1, 5, 3, 1, 1]],
                         columns=[
                             'Age', 'Income', 'Credit_Score', 'Debt_to_Income_Ratio',
                             'Existing_Loan_Balance', 'Loan_Amount', 'Interest_Rate',
                             'Loan_Duration_Months',
                             # Include only one dummy per category (due to drop_first=True)
                             'Gender_Male',
                             'Employment_Status_Unemployed',
                             'Location_Suburban',
                             'Location_Urban',
                             'Other_Feature_1',
                             'Other_Feature_2'
                         ])

# Fail loudly if the hand-written row does not match what the scaler was
# fitted on, instead of silently scaling misaligned features.
expected_n_features = getattr(scaler, 'n_features_in_', None)
if expected_n_features is None and getattr(scaler, 'mean_', None) is not None:
    # Older scikit-learn releases lack n_features_in_; mean_ holds one value per feature.
    expected_n_features = len(scaler.mean_)
if expected_n_features is not None and user_data.shape[1] != expected_n_features:
    raise ValueError(
        f'user_data has {user_data.shape[1]} columns but the scaler was '
        f'fitted on {expected_n_features} features'
    )

# When the scaler recorded the training column names, require the same names
# and put the columns in the training order.
expected_columns = getattr(scaler, 'feature_names_in_', None)
if expected_columns is not None:
    missing_columns = [col for col in expected_columns if col not in user_data.columns]
    if missing_columns:
        raise ValueError(
            f'user_data is missing columns the scaler was fitted on: {missing_columns}'
        )
    user_data = user_data[list(expected_columns)]

# Apply scaling
user_data_scaled = scaler.transform(user_data)

# Make prediction
Loan_prediction = nb_model.predict(user_data_scaled)
Loan_probability = nb_model.predict_proba(user_data_scaled)[0]

# predict_proba columns follow nb_model.classes_, so look up the column of the
# label 1 (Non-Default) rather than assuming it is at index 1.
non_default_index = list(nb_model.classes_).index(1)

# Output prediction
result = 'Non-Default' if Loan_prediction[0] == 1 else 'Default'
print(f'Loan Prediction: {result}')
print(f'Probability of Non-Default: {Loan_probability[non_default_index]:.2f}')


Loan Prediction: Default
Probability of Non-Default: 0.47
