In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
# Load datasets
loan_data = pd.read_csv("loan_data.csv")
data_dict = pd.read_csv("Data_Dictionary.csv", encoding="latin1")

# Check for missing values
null_counts = loan_data.isnull().sum()
null_percent = (null_counts / len(loan_data)) * 100


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

# Load dataset
loan_data = pd.read_csv("loan_data.csv")

# Step 1: Class distribution
target_col = 'TARGET'
class_distribution = loan_data[target_col].value_counts(normalize=True) * 100
print("Class Distribution (%):\n", class_distribution)

# Step 2: Plot original distribution
sns.countplot(data=loan_data, x=target_col)
plt.title("Original Class Distribution")
plt.savefig("original_distribution.png")
plt.clf()

# Step 3: Balance data using undersampling
class_0 = loan_data[loan_data[target_col] == 0]
class_1 = loan_data[loan_data[target_col] == 1]
min_size = min(len(class_0), len(class_1))
balanced_df = pd.concat([class_0.sample(min_size), class_1.sample(min_size)])

# Step 4: Encode categorical columns
label_encoders = {}
for col in balanced_df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    balanced_df[col] = le.fit_transform(balanced_df[col].astype(str))
    label_encoders[col] = le

# Step 5: Prepare features and target
X = balanced_df.drop(columns=[target_col])
y = balanced_df[target_col]
X = X.fillna(0)

# Step 6: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# Step 7: Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 8: Evaluate model
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 9: Sensitivity (Recall)
conf_matrix = confusion_matrix(y_test, y_pred)
TP = conf_matrix[1, 1]
FN = conf_matrix[1, 0]
sensitivity = TP / (TP + FN)
print(f"Sensitivity: {sensitivity:.2f}")

# Step 10: AUC
auc = roc_auc_score(y_test, y_proba)
print(f"AUC Score: {auc:.2f}")

# Step 11: Plot balanced class distribution
sns.countplot(data=balanced_df, x=target_col)
plt.title("Balanced Class Distribution")
plt.savefig("balanced_distribution.png")
plt.clf()

# Step 12: Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.savefig("roc_curve.png")
plt.clf()


Class Distribution (%):
 TARGET
0    91.927118
1     8.072882
Name: proportion, dtype: float64


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Sensitivity: 0.58
AUC Score: 0.62


<Figure size 640x480 with 0 Axes>