**IMPORTS**

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
# Tree-based models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score


import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('financial_loan_data_csv.csv')

**EDA**

In [3]:
df.shape
df.isnull().sum()

id                          0
address_state               0
application_type            0
emp_length                  0
emp_title                1438
grade                       0
home_ownership              0
issue_date                  0
last_credit_pull_date       0
last_payment_date           0
loan_status                 0
next_payment_date           0
member_id                   0
purpose                     0
sub_grade                   0
term                        0
verification_status         0
annual_income               0
dti                         0
installment                 0
int_rate                    0
loan_amount                 0
total_acc                   0
total_payment               0
dtype: int64

In [4]:
# Drops all rows that are exact duplicates
df.drop_duplicates(inplace=True)

In [5]:
# Drop uninformative columns
df = df.drop(['id', 'emp_title', 'member_id', 'address_state', 'issue_date', 'last_credit_pull_date', 'last_payment_date', 'next_payment_date' ], axis=1, errors='ignore')

# df['application_type'].value_counts() # Only INDIVIDUAL is the value in all rows
df = df.drop(['application_type'], axis=1, errors='ignore')

# total payments : This is post-loan info → leaks future outcome!
df = df.drop(['total_payment'], axis=1, errors='ignore')

df.sample(10)

Unnamed: 0,emp_length,grade,home_ownership,loan_status,purpose,sub_grade,term,verification_status,annual_income,dti,installment,int_rate,loan_amount,total_acc
32637,10+ years,A,RENT,Fully Paid,other,A3,36 months,Not Verified,84996.0,0.0106,313.37,0.08,10000,27
6782,6 years,A,RENT,Fully Paid,Debt consolidation,A3,36 months,Not Verified,42000.0,0.2806,385.0,0.0751,12375,34
30916,< 1 year,E,OWN,Fully Paid,medical,E5,60 months,Not Verified,85000.0,0.0788,228.27,0.1743,9100,24
20761,10+ years,E,RENT,Charged Off,Debt consolidation,E3,60 months,Source Verified,40000.0,0.2052,309.91,0.1879,12000,21
14348,2 years,E,OWN,Fully Paid,Debt consolidation,E5,36 months,Source Verified,65000.0,0.2413,432.26,0.1774,12000,20
25553,10+ years,B,MORTGAGE,Fully Paid,home improvement,B4,36 months,Not Verified,91200.0,0.2361,65.95,0.1149,2000,32
13663,10+ years,B,MORTGAGE,Fully Paid,Debt consolidation,B4,36 months,Source Verified,63189.0,0.1812,260.93,0.1074,8000,12
12558,3 years,D,RENT,Fully Paid,Debt consolidation,D5,36 months,Not Verified,66847.3,0.103,351.58,0.16,10000,15
15773,1 year,D,RENT,Fully Paid,Debt consolidation,D4,36 months,Source Verified,57840.0,0.0761,421.59,0.1595,12000,14
11834,5 years,C,RENT,Fully Paid,Debt consolidation,C3,36 months,Not Verified,35000.0,0.1413,339.25,0.1348,10000,18


**ENCODING**

In [6]:

df['emp_length'].value_counts()

# Clean
df['emp_length_clean'] = df['emp_length'].str.replace(r'[^0-9]+', '', regex=True)
df['emp_length_clean'] = df['emp_length_clean'].replace('', '0').astype(int)

# Ordinal Encoding Use Scikit-Learn’s OrdinalEncoder
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[[
    '< 1 year', '1 year', '2 years', '3 years', '4 years', 
    '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years'
]])

df['emp_length_encoded'] = encoder.fit_transform(df[['emp_length']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['emp_length', 'emp_length_clean'], axis=1, errors='ignore')
df.rename(columns={'emp_length_encoded': 'emp_length'}, inplace=True)


In [7]:

df['grade'].value_counts()

# Ordinal Encoding Use Scikit-Learn’s OrdinalEncoder
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['A','B','C','D','E','F','G']])
df['grade_encoded'] = encoder.fit_transform(df[['grade']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['grade'], axis=1, errors='ignore')
df.rename(columns={'grade_encoded': 'grade'}, inplace=True)

In [8]:
df['purpose'].value_counts()

# Frequency Encoding (More compact & meaningful)
# Replace each category with the proportion of loans with that purpose

freq = df['purpose'].value_counts(normalize=True)
df['purpose_encoded'] = df['purpose'].map(freq)

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['purpose'], axis=1, errors='ignore')
df.rename(columns={'purpose_encoded': 'purpose'}, inplace=True)

In [9]:
df['sub_grade'].value_counts()

# from sklearn.preprocessing import OrdinalEncoder

# Define all 35 sub-grades in correct order (A1 = best, G5 = worst)
letters = ['A','B','C','D','E','F','G']
sub_grades_ordered = [f"{l}{n}" for l in letters for n in range(1,6)]

# Create the encoder with the specified categories
encoder = OrdinalEncoder(categories=[sub_grades_ordered])

# Fit and transform the column
df['sub_grade_encoded'] = encoder.fit_transform(df[['sub_grade']])

# Optional: check
df[['sub_grade', 'sub_grade_encoded']].head(10)


# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['sub_grade'], axis=1, errors='ignore')
df.rename(columns={'sub_grade_encoded': 'sub_grade'}, inplace=True)

# # Since Subgrade is there we can drop grade
# df = df.drop(['grade'], axis=1, errors='ignore')


In [10]:
df['term'].value_counts()

# Using OrdinalEncoder (scikit-learn)
# from sklearn.preprocessing import OrdinalEncoder

# Strip leading/trailing whitespace from the 'term' column
df['term'] = df['term'].str.strip()

encoder = OrdinalEncoder(categories=[['36 months', '60 months']])
df['term_encoded'] = encoder.fit_transform(df[['term']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['term'], axis=1, errors='ignore')
df.rename(columns={'term_encoded': 'term'}, inplace=True)


In [11]:
df['verification_status'].value_counts()

# Using OrdinalEncoder (scikit-learn)
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['Not Verified', 'Verified', 'Source Verified']])
df['verification_status_encoded'] = encoder.fit_transform(df[['verification_status']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['verification_status'], axis=1, errors='ignore')
df.rename(columns={'verification_status_encoded': 'verification_status'}, inplace=True)


In [12]:
df['home_ownership'].value_counts()

home_ownership_mapping = {
    'NONE': 0,
    'OTHER': 0,
    'RENT': 1,
    'MORTGAGE': 2,
    'OWN': 3
}

df['home_ownership_encoded'] = df['home_ownership'].map(home_ownership_mapping)

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['home_ownership'], axis=1, errors='ignore')
df.rename(columns={'home_ownership_encoded': 'home_ownership'}, inplace=True)


**OUTLIER REMOVAL || TRANSFORMATION** 

In [13]:
categorical_cols = ['emp_length', 'sub_grade', 'term', 'verification_status', 'home_ownership', 'purpose']

# columns for Feature Scaling
numeric_cols = ['annual_income','dti','installment','int_rate','loan_amount','total_acc']

In [14]:
# # Copy dataframe to avoid modifying original
df_clean = df.copy()

for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Keep only rows within 1.5*IQR
    df_clean = df_clean[~((df_clean[col] < (Q1 - 1.5 * IQR)) | 
                          (df_clean[col] > (Q3 + 1.5 * IQR)))]

# Use log1p to handle zero values safely
# df_clean['annual_income'] = np.log1p(df_clean['annual_income'])
# df_clean['loan_amount'] = np.log1p(df_clean['loan_amount'])



In [15]:
# print("Rows before:", df.shape[0])
# print("Rows after:", df_clean.shape[0])


In [16]:
df.sample(10)

Unnamed: 0,loan_status,annual_income,dti,installment,int_rate,loan_amount,total_acc,emp_length,grade,purpose,sub_grade,term,verification_status,home_ownership
2297,Fully Paid,65000.0,0.127,231.54,0.1171,7000,20,7.0,1.0,0.129562,7.0,0.0,0.0,2
15510,Fully Paid,75000.0,0.1168,186.64,0.1479,5400,20,0.0,2.0,0.472159,13.0,0.0,2.0,1
17748,Fully Paid,117500.0,0.1286,609.99,0.0617,20000,27,1.0,0.0,0.472159,2.0,0.0,1.0,3
9891,Fully Paid,24000.0,0.1765,237.25,0.1242,7100,15,0.0,1.0,0.472159,8.0,0.0,0.0,3
28963,Fully Paid,54924.0,0.0634,82.01,0.1114,2500,13,6.0,1.0,0.054697,5.0,0.0,0.0,1
26103,Fully Paid,93000.0,0.0505,112.65,0.079,3600,10,10.0,0.0,0.074554,3.0,0.0,2.0,2
9279,Fully Paid,50000.0,0.1248,203.59,0.1349,6000,31,3.0,2.0,0.472159,10.0,0.0,0.0,2
10936,Fully Paid,30000.0,0.0504,104.98,0.1114,3200,8,3.0,1.0,0.472159,5.0,0.0,0.0,1
21184,Fully Paid,85000.0,0.1766,222.93,0.1299,9800,12,4.0,2.0,0.472159,10.0,1.0,2.0,3
19446,Fully Paid,68000.0,0.0672,469.09,0.1261,14000,25,9.0,2.0,0.472159,11.0,0.0,1.0,1


**HANDLING CLASS IMBALANCE**

In [17]:
df_clean['loan_status'].value_counts()

loan_status
Fully Paid     28786
Charged Off     4718
Current          888
Name: count, dtype: int64

In [18]:
# Only train on Fully Paid vs Charged Off
df_clean = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

# Encode target: Fully Paid = 0, Charged Off = 1
df_clean['loan_status_encoded'] = df_clean['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})


In [19]:
# class_weight
# from sklearn.utils.class_weight import compute_class_weight

# y = df_clean['loan_status_encoded'].values

# class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
# class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
# print(class_weight_dict)


In [20]:
# Import
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df_clean[categorical_cols + numeric_cols]
y = df_clean['loan_status_encoded']

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print("Original dataset shape:", X.shape)
print("Resampled dataset shape:", X_res.shape)


Original dataset shape: (37478, 12)
Resampled dataset shape: (64290, 12)


**PREPARE DATA FOR TREE BASED MODELS**

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split

# Separate numeric and categorical features
X_numeric = X_res[numeric_cols].values
X_categorical = X_res[categorical_cols].values

# Combine numeric + categorical for tree models
X_final = np.hstack([X_numeric, X_categorical])
y_final = y_res

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Train class distribution:\n", np.bincount(y_train))
print("Test class distribution:\n", np.bincount(y_test))


Train set shape: (51432, 12)
Test set shape: (12858, 12)
Train class distribution:
 [25716 25716]
Test class distribution:
 [6429 6429]


**Tree-based models**

In [28]:
# Initialize model
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42
)

# Train
rf_model.fit(X_train, y_train)

# Predict
y_pred = rf_model.predict(X_test)

# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Random Forest Accuracy: 0.9082283403328667


In [27]:
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=1,  # can tune for imbalance
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

XGBoost Accuracy: 0.9095504744128169


In [25]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=-1,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42
)

lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)

print("LightGBM Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

[LightGBM] [Info] Number of positive: 25716, number of negative: 25716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2626
[LightGBM] [Info] Number of data points in the train set: 51432, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Accuracy: 0.9125058329444704
