In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [None]:
# Single place to point the notebook at the project folder: edit DATA_DIR
# (not the individual read_csv calls) when running on another machine.
DATA_DIR = r"C:\Users\skd53\OneDrive\Machine learning\ML_classification_project"

# Raw inputs: one CSV each for customers, loans and bureau data.
df_customer = pd.read_csv(DATA_DIR + r"\customers.csv")
df_loan = pd.read_csv(DATA_DIR + r"\loans.csv")
df_bureau = pd.read_csv(DATA_DIR + r"\bureau_data.csv")


In [None]:
# Sanity check of the three raw tables: prints the (rows, columns) shape of each.
print(df_bureau.shape,df_customer.shape,df_loan.shape)

In [None]:
# First three rows of the loan table.
df_loan.head(3)

In [None]:
# First three rows of the customer table.
df_customer.head(3)

In [None]:
df=pd.merge(df_customer,df_loan,on="cust_id")
df.head(3)

In [None]:
df=pd.merge(df,df_bureau)
df.head()

In [None]:
df["default"]=df["default"].astype(int)
df.default.value_counts()

In [None]:
X=df.drop("default",axis="columns")
y=df["default"]
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.25,random_state=42)

In [None]:
df_train=pd.concat([X_train,y_train],axis='columns')
df_test=pd.concat([X_test,y_test],axis='columns')


In [None]:
df_train.shape,df_test.shape

In [None]:
df_train.isna().sum()

In [None]:
df_train["residence_type"].unique()

In [None]:
df_train.residence_type.mode()[0]

In [None]:
df_train["residence_type"].fillna(df_train.residence_type.mode()[0],inplace=True)
df_train.isna().sum()

In [None]:
df_train["residence_type"].unique()

In [None]:
df_test.residence_type.fillna(df_train.residence_type.mode()[0],inplace=True)

In [None]:
# Number of fully duplicated rows in the training partition.
df_train.duplicated().sum()

In [None]:
# Summary statistics of the training partition (pandas describe defaults).
df_train.describe()

In [None]:
# Column names, used to build the column lists in the next cell.
df_train.columns

In [None]:
# Columns plotted as numeric distributions further down.
numerical_columns = [
    "age",
    "income",
    "number_of_dependants",
    "years_at_current_address",
    "zipcode",
    "sanction_amount",
    "loan_amount",
    "processing_fee",
    "gst",
    "net_disbursement",
    "principal_outstanding",
    "bank_balance_at_application",
    "number_of_open_accounts",
    "number_of_closed_accounts",
    "total_loan_months",
    "delinquent_months",
    "total_dpd",
    "enquiry_count",
    "credit_utilization_ratio",
]

# Columns whose distinct values are listed further down.
# NOTE(review): zipcode appears in both lists -- confirm which treatment is intended.
Category_columns = [
    "gender",
    "marital_status",
    "employment_status",
    "residence_type",
    "city",
    "state",
    "zipcode",
    "loan_purpose",
    "loan_type",
    "default",
]




In [None]:
cols_per_row = 4  

# Calculate rows needed
rows = (len(numerical_columns) + cols_per_row - 1) // cols_per_row

# Set figure size
plt.figure(figsize=(cols_per_row*5, rows*4))

# Loop through each numerical column
for i, col in enumerate(numerical_columns, 1):
    plt.subplot(rows, cols_per_row, i)
    sns.boxplot(x=df_train[col])  # vertical boxplot
    plt.title(col)

plt.tight_layout()
plt.show()

# for col in numerical_columns:
#     sns.boxplot(df_train[col])

In [None]:
cols_per_row = 4  

# Calculate rows needed
rows = (len(numerical_columns) + cols_per_row - 1) // cols_per_row

# Set figure size
plt.figure(figsize=(cols_per_row*5, rows*4))

# Loop through each numerical column
for i, col in enumerate(numerical_columns, 1):
    plt.subplot(rows, cols_per_row, i)
    sns.histplot(x=df_train[col])  # vertical boxplot
    plt.title(col)

plt.tight_layout()
plt.show()

In [None]:
df_train[df_train["processing_fee"] > df_train["loan_amount"].max()][["loan_amount", "processing_fee"]]


In [None]:
df_train[(df_train["processing_fee"] / df_train["loan_amount"].max())>0.03][["loan_amount", "processing_fee"]]


In [None]:
for col in Category_columns:
    print(df_train[col].unique())

In [None]:
df_train["loan_purpose"]=df_train["loan_purpose"].replace("Personaal","Personal")
df_test["loan_purpose"]=df_test["loan_purpose"].replace("Personaal","Personal")

In [None]:
df_test.loan_purpose.unique()

In [None]:
df_train[df_train.loan_amount > df_train.sanction_amount]

### Feature Engineering

In [None]:
df_train["loan_to_income"]=round(df_train["loan_amount"] / df_train["income"],2)

df_test["loan_to_income"]=round(df_test["loan_amount"] / df_test["income"],2)


In [None]:
df_train["loan_to_income"].describe()


In [None]:
sns.histplot(df_train["loan_to_income"],kde=True)
plt.tight_layout()
plt.show()

In [None]:
df_train[["delinquent_months","total_loan_months"]]

In [None]:
df_train["delinquency_ratio"]=round(df_train["delinquent_months"]*100 / df_train["total_loan_months"],2)
df_train[["delinquent_months","total_loan_months","delinquency_ratio"]].head()

In [None]:
df_test["delinquency_ratio"]=round(df_test["delinquent_months"]*100 / df_test["total_loan_months"],2)


In [None]:
sns.histplot(df_train["delinquency_ratio"],kde=True)
plt.tight_layout()
plt.show()

In [None]:
df_train[["total_dpd","delinquent_months"]]

In [None]:
df_train["avg_dpd_per_delinquent"]=(df_train["total_dpd"] / df_train["delinquent_months"]).round(1)


In [None]:
df_train["avg_dpd_per_delinquent"].describe()


In [None]:
df_train["avg_dpd_per_delinquent"].isna().sum()


In [None]:
df_train["avg_dpd_per_delinquent"] = np.where(
    df_train["delinquent_months"] != 0 ,
    (df_train["total_dpd"] / df_train["delinquent_months"]).round(1),
    0
)


df_test["avg_dpd_per_delinquent"] = np.where(
    df_test["delinquent_months"] != 0 ,
    (df_test["total_dpd"] / df_test["delinquent_months"]).round(1),
    0
)


In [None]:
df_train["avg_dpd_per_delinquent"].describe()


In [None]:
df_train.columns

### Feature Selection

In [None]:
# Dropping the coloumns which are not that useful from both test and train dataframe

df_train=df_train.drop(["cust_id","loan_id"],axis="columns")
df_test=df_test.drop(["cust_id","loan_id"],axis="columns")

In [None]:
df_train=df_train.drop(["disbursal_date","installment_start_dt","loan_amount","income","total_loan_months","delinquent_months","total_dpd"],axis="columns")
df_test=df_test.drop(["disbursal_date","installment_start_dt","loan_amount","income","total_loan_months","delinquent_months","total_dpd"],axis="columns")

In [None]:
df_train.columns

In [None]:
df_train.select_dtypes(["int64","float64"]).columns

In [None]:
df_train["default"].info()

In [None]:
# Scaling the columns

X_train=df_train.drop("default" , axis="columns")
y_train=df_train["default"]

col_to_scale=X_train.select_dtypes(["int64","float64"]).columns

scaler=MinMaxScaler()
X_train[col_to_scale]=scaler.fit_transform(X_train[col_to_scale])
X_train.head()


In [None]:
X_train.describe()

In [None]:
X_test=df_test.drop("default" , axis="columns")
y_test=df_test["default"]

X_test[col_to_scale]=scaler.transform(X_test[col_to_scale])
X_test.describe()


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data):
    """Compute the variance inflation factor (VIF) of every column of ``data``.

    Missing values are replaced by the column mean before the computation
    (the loan_to_income column contains a few NaN values).

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the columns ``Column`` and ``VIF``.
    """
    filled = data.fillna(data.mean())
    matrix = filled.values
    scores = [
        variance_inflation_factor(matrix, position)
        for position in range(filled.shape[1])
    ]
    return pd.DataFrame({'Column': filled.columns, 'VIF': scores})

In [None]:
calculate_vif(X_train[col_to_scale])

In [None]:
print(X_train[col_to_scale].isna().sum())


In [None]:
feature_to_drop_vif=["sanction_amount","processing_fee","gst","net_disbursement","principal_outstanding"]

In [None]:
X_train=X_train.drop(feature_to_drop_vif,axis='columns')


In [None]:
vif_df=calculate_vif(X_train.select_dtypes(["int64","float64"]))
vif_df


In [None]:
# i will store all the vif columns as feature coloumns
selected_numeric_feature_vif=vif_df.Column.values
selected_numeric_feature_vif

In [None]:
# Correlation matrix of the numeric training features together with the target.
numeric_df = (
    X_train.select_dtypes(include=['int64', 'float64'])
    .assign(default=df_train['default'])
)
cm = numeric_df.corr()

# Annotated heatmap; x labels are rotated so long names stay readable.
plt.figure(figsize=(12, 12))
sns.heatmap(cm, annot=True)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
def calculate_woe_iv(df,feature,target):
    """Compute weight of evidence (WoE) and information value (IV) of a feature.

    Rows are grouped by the values of ``feature``. Within each group,
    ``good`` is the sum of ``target`` and ``bad`` is the remaining row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``feature`` and ``target``.
    feature : str
        Name of the column to group by.
    target : str
        Name of the 0/1 target column.

    Returns
    -------
    tuple
        ``(grouped, total_iv)``: the per-group table with the columns
        ``total, good, bad, good_pct, bad_pct, woe, iv`` and the sum of ``iv``.
    """
    stats = df.groupby(feature)[target].agg(total="count", good="sum")
    stats["bad"] = stats["total"] - stats["good"]

    # Share of all "good" / "bad" rows that falls into each group.
    good_share = stats["good"] / stats["good"].sum()
    bad_share = stats["bad"] / stats["bad"].sum()

    raw_woe = np.log(good_share / bad_share)
    raw_iv = (good_share - bad_share) * raw_woe

    stats["good_pct"] = good_share
    stats["bad_pct"] = bad_share
    # Groups with a zero share give +/-inf; those entries are set to 0.
    infinities = [np.inf, -np.inf]
    stats["woe"] = raw_woe.replace(infinities, 0)
    stats["iv"] = raw_iv.replace(infinities, 0)

    return stats, stats["iv"].sum()

grouped,total_iv=calculate_woe_iv(pd.concat([X_train,y_train],axis=1),"loan_purpose","default")
grouped


In [None]:
iv_values = {}

for feature in X_train.columns:
    if X_train[feature].dtype == "object":
        # directly calculate woe/iv for categorical
        _, iv = calculate_woe_iv(
            pd.concat([X_train[[feature]], y_train], axis=1), 
            feature, 
            "default"
        )
    else:
        # bin continuous variable
        x_binned = pd.cut(X_train[feature], bins=10, labels=False)
        df_temp = pd.DataFrame({feature: x_binned, "default": y_train})
        _, iv = calculate_woe_iv(df_temp, feature, "default")
    
    iv_values[feature] = round(iv,2)

iv_values


In [None]:
iv_df=pd.DataFrame(list(iv_values.items()),columns=["Feature","IV"])
iv_df=iv_df.sort_values(by="IV",ascending=False)
iv_df

In [None]:
selected_features_iv=[feature for feature,iv in iv_values.items() if iv > 0.02]
selected_features_iv

### Feature Encoding

In [None]:
X_train_reduced=X_train[selected_features_iv]
X_test_reduced=X_test[selected_features_iv]
X_train_reduced.head()

In [None]:
X_train_encoded=pd.get_dummies(X_train_reduced,drop_first=True)
X_test_encoded=pd.get_dummies(X_test_reduced,drop_first=True)
X_train_encoded.head()

In [None]:
X_train_encoded.isna().sum()


In [None]:
# Impute missing values with the median of the TRAINING column, for both
# partitions, so that no statistic is learned from the test data.
for col in X_train_encoded.columns:
    median_val = X_train_encoded[col].median()
    X_train_encoded[col] = X_train_encoded[col].fillna(median_val)
    # Guard: a training column may be absent from the test frame (for example
    # a dummy column for a category that never occurs in the test data).
    if col in X_test_encoded.columns:
        X_test_encoded[col] = X_test_encoded[col].fillna(median_val)  # use train median

### Logistic Regression, Random Forest and XGBoost models


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
log_model = LogisticRegression()
log_model.fit(X_train_encoded,y_train)


In [None]:
y_pred=log_model.predict(X_test_encoded)
report=classification_report(y_test,y_pred)
print(report)


In [None]:
y_train.value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the same encoded features. The seed makes the run
# reproducible (42 is the seed used for the train/test split as well).
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train_encoded, y_train)
forest_pred = forest_model.predict(X_test_encoded)
# Pass forest_pred, not the logistic model's y_pred, so that the report
# describes this model.
forest_report = classification_report(y_test, forest_pred)
print(forest_report)

In [None]:
from xgboost import XGBClassifier
# XGBoost with default settings on the same encoded features.
xg_model = XGBClassifier()
xg_model.fit(X_train_encoded, y_train)
xg_pred = xg_model.predict(X_test_encoded)
# Pass xg_pred, not the logistic model's y_pred, so that the report
# describes this model.
xg_report = classification_report(y_test, xg_pred)
print(xg_report)

In [None]:
# I tried logistic regression, random forest and XGBoost; all three give similar performance, which is likely due to the class imbalance.

### Handling class imbalance using random under-sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
rus=RandomUnderSampler(random_state=42)
X_train_rus,y_train_rus=rus.fit_resample(X_train_encoded,y_train)
y_train_rus.value_counts()

In [None]:
model2=LogisticRegression()
model2.fit(X_train_rus,y_train_rus)
pred=model2.predict(X_test_encoded)
report2=classification_report(y_test,pred)
print(report2)

In [None]:
xg_model2=XGBClassifier()
xg_model2.fit(X_train_rus,y_train_rus)
xg_pred2=xg_model2.predict(X_test_encoded)
xg_report2=classification_report(y_test,xg_pred2)
print(xg_report2)

### Handling class imbalance using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
X_train_smote,y_train_smote=smote.fit_resample(X_train_encoded,y_train)
print(y_train_smote.value_counts())


In [None]:
model3=LogisticRegression()
model3.fit(X_train_smote,y_train_smote)
model_pred=model3.predict(X_test_encoded)
report3=classification_report(y_test,model_pred)
print(report3)

In [None]:
best_model = model3


In [None]:
from sklearn.metrics import roc_curve, auc
# Predicted probability of the positive class (column 1) for the test rows.
y_prob = model3.predict_proba(X_test_encoded)[:, 1]

# ROC points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# ROC curve together with the diagonal reference line.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, color="blue", lw=2, label="ROC curve (AUC = %0.2f)" % roc_auc)
ax.plot([0, 1], [0, 1], color="red", lw=2, linestyle="--")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve for logistic regression model")
ax.legend(loc="lower right")
plt.show()

In [None]:
probabilities=np.round(best_model.predict_proba(X_test_encoded)[:, 1], 3)
probabilities

In [None]:
df_eval=pd.DataFrame({
    "default_truth":y_test,
    "default_probability":probabilities
})

df_eval.head()

In [None]:
df_eval["decile"]=pd.qcut(df_eval["default_probability"], 10, labels=False, duplicates="drop")
df_eval

In [None]:
# Per-decile summary: probability range plus the number of events
# (default_truth == 1) and non-events (default_truth == 0).
# Named aggregation replaces groupby.apply with a Python lambda: it uses the
# built-in reducers and avoids the pandas deprecation warning raised when
# apply operates on the grouping column.
df_grouped = (
    df_eval.assign(is_non_event=df_eval["default_truth"].eq(0))
    .groupby("decile")
    .agg(
        min_probability=("default_probability", "min"),
        max_probability=("default_probability", "max"),
        events=("default_truth", "sum"),   # defaults = 1
        non_events=("is_non_event", "sum"),
    )
    .reset_index()
)
df_grouped


In [None]:
total_events = df_eval["default_truth"].sum()
total_non_events = (df_eval["default_truth"] == 0).sum()



df_grouped["event_rate"] = (df_grouped["events"] / total_events * 100).round(2)
df_grouped["non_event_rate"] = (df_grouped["non_events"] / total_non_events * 100).round(2)

# Sort from highest probability decile to lowest
df_grouped = df_grouped.sort_values(by="decile", ascending=False).reset_index(drop=True)

print(df_grouped)


In [None]:
df_grouped["cum_event"]=df_grouped["events"].cumsum()
df_grouped["cum_non-event"]=df_grouped["non_events"].cumsum()
df_grouped

In [None]:
df_grouped["cum_event_rate"]=df_grouped["cum_event"]*100/df_grouped["events"].sum()
df_grouped["cum_non-event_rate"]=df_grouped["cum_non-event"]*100/df_grouped["non_events"].sum()
df_grouped

In [None]:
df_grouped["KS"]=abs(df_grouped["cum_event_rate"]-df_grouped["cum_non-event_rate"])
df_grouped

In [None]:
# Coefficients of the fitted model, one per encoded feature column.
feature_imp = best_model.coef_[0]
coef_df = pd.DataFrame(feature_imp, index=X_train_encoded.columns, columns=["Coefficients"])

# Sort the coefficients for better visualization
coef_df = coef_df.sort_values(by="Coefficients", ascending=True)

plt.figure(figsize=(8, 4))
plt.barh(coef_df.index, coef_df['Coefficients'], color='steelblue')
plt.xlabel('Coefficient Value')
# best_model is a LogisticRegression, so the title names that model.
plt.title('Feature Importance in Logistic Regression')
plt.show()

In [None]:
from joblib import dump 
model_data={
    "model":best_model,
    "features":X_train_encoded.columns,
    "scaler":scaler,
    "cols_to_scale":col_to_scale
}

dump(model_data,r"C:\Users\skd53\OneDrive\Machine learning\ML_classification_project\Artifacts\model_data.joblib")

In [None]:
best_model