###  LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import seaborn as sns
import pickle

In [None]:
# Read the source CSV into a DataFrame; the 'Default' column is the label.
df_3 = pd.read_csv(filepath_or_buffer="TobeBalanced1.csv")

In [None]:
# Separate the label column ('Default') from the predictor columns.
y = df_3['Default']
X = df_3.drop(columns=['Default'])

In [None]:
# Apply RandomUnderSampler to handle class imbalance for the whole dataset
under_sampler = RandomUnderSampler(random_state=42)
X_under, y_under = under_sampler.fit_resample(X, y)

In [None]:
# Check the class distribution after undersampling
print("Class distribution after undersampling:")
print(pd.Series(y_under).value_counts())

In [None]:
# Train-test split after undersampling
X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.2, random_state=42, stratify=y_under)

In [None]:
# Baseline model: logistic regression with the liblinear solver and default
# regularisation, fitted on the training split (fit() returns the estimator).
log_reg_clf = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

In [None]:
# Baseline predictions on the training split and on the held-out split.
y_train_pred, y_test_pred = (
    log_reg_clf.predict(features) for features in (X_train, X_test)
)

In [None]:
# Fraction of correctly classified rows on each split (baseline model).
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Confusion matrices (true labels vs. predictions) for each split.
conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
conf_matrix_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Text classification reports for each split.
classification_rep_test = classification_report(y_true=y_test, y_pred=y_test_pred)
classification_rep_train = classification_report(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Report the baseline metrics: accuracies first, then each heading followed
# by its table on the next line.
print("\nTrain Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("\nTrain Confusion Matrix:", conf_matrix_train, sep="\n")
print("\nTest Confusion Matrix:", conf_matrix_test, sep="\n")

print("\nTrain Classification Report:", classification_rep_train, sep="\n")
print("\nTest Classification Report:", classification_rep_test, sep="\n")

In [None]:
# Search space for RandomizedSearchCV: penalty type, inverse regularisation
# strength C, and solver (2 x 5 x 2 = 20 possible combinations).
param_distributions = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

In [None]:
# Unfitted estimator handed to the hyperparameter search; the iteration cap is
# raised to 1000. This rebinds log_reg_clf, replacing the baseline model.
log_reg_clf = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [None]:
# Randomized search with 5-fold cross-validation on all available cores.
# random_state is pinned to the seed used everywhere else in this notebook so
# the sampled parameter combinations -- and therefore the selected model --
# are the same on every run. n_iter is left at its default, so only a subset
# of the combinations in param_distributions is evaluated.
random_search = RandomizedSearchCV(
    estimator=log_reg_clf,
    param_distributions=param_distributions,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the hyperparameter search on the training split and keep the winning
# estimator (fit() returns the search object itself).
best_log_reg_clf = random_search.fit(X_train, y_train).best_estimator_

In [None]:
# Show the parameter combination selected by the search.
print(
    "Best parameters found: ",
    random_search.best_params_,
)

In [None]:
# No extra fit is needed here: RandomizedSearchCV refits the winning
# configuration on the full training split it was given (refit=True is the
# default), so best_log_reg_clf is already trained on X_train / y_train.
# Fitting it again would only repeat that training pass.

In [None]:
# Predictions of the tuned model on the training split and the held-out split.
y_train_pred, y_test_pred = (
    best_log_reg_clf.predict(features) for features in (X_train, X_test)
)

In [None]:
# Accuracy of the tuned model on each split (overwrites the baseline values).
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Confusion matrices of the tuned model (overwrites the baseline matrices).
conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
conf_matrix_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Report the tuned model's accuracies, then each confusion matrix under its
# heading.
print("\nTrain Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("\nTrain Confusion Matrix:", conf_matrix_train, sep="\n")
print("\nTest Confusion Matrix:", conf_matrix_test, sep="\n")

In [None]:
# Text classification reports for the tuned model on each split.
classification_rep_test = classification_report(y_true=y_test, y_pred=y_test_pred)
classification_rep_train = classification_report(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Print each report under its heading (heading line, then the report).
print("\nTrain Classification Report:", classification_rep_train, sep="\n")
print("\nTest Classification Report:", classification_rep_test, sep="\n")

In [None]:
# Function to plot confusion matrix as heatmap
def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.Blues,
                          class_names=('Not Default', 'Default')):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Matrix of integer counts (cell annotations use the 'd' integer
            format). Rows are labelled as true classes, columns as
            predicted classes.
        title: Text shown above the heatmap.
        cmap: Matplotlib colormap used to shade the cells.
        class_names: Tick labels for both axes, given in the same order as
            the rows/columns of ``cm``. Defaults to the two labels this
            function originally hard-coded, so existing calls are unchanged.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # seaborn receives a fresh list so the caller's sequence is never shared.
    tick_labels = list(class_names)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, cbar=False, square=True,
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

In [None]:
# Render one heatmap per split; the title is passed as the second positional
# argument of plot_confusion_matrix.
plot_confusion_matrix(conf_matrix_train, 'Training Confusion Matrix Heatmap')
plot_confusion_matrix(conf_matrix_test, 'Test Confusion Matrix Heatmap')

In [None]:
# Learning-curve plot: training vs. cross-validation score as the training set grows
def plot_learning_curve(estimator, X, y, title="Learning Curve", cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training-set size.

    Args:
        estimator: Model passed to ``learning_curve``.
        X: Feature data passed to ``learning_curve``.
        y: Target data passed to ``learning_curve``.
        title: Figure title.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # The figure and its labels are set up before the scores are computed.
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    # One (mean, std, colour, legend label) entry per curve, averaged over the
    # folds (axis 1).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, legend_label)
        for scores, colour, legend_label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Shaded +/- one standard deviation bands first, then the mean lines,
    # keeping the original drawing order.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, legend_label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")
    plt.show()

In [None]:

# Learning curve of the tuned logistic regression, using 5-fold CV on the
# training split.
plot_learning_curve(
    estimator=best_log_reg_clf,
    X=X_train,
    y=y_train,
    title="Logistic Regression Learning Curve",
    cv=5,
)

In [None]:
# Persist the tuned estimator: serialise it to bytes, then write them out.
# NOTE: only unpickle this file from a trusted source.
with open('log_reg_undersampled.pkl', 'wb') as file:
    file.write(pickle.dumps(best_log_reg_clf))

print("Model saved as 'log_reg_undersampled.pkl'")