**IMPORTS**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('financial_loan_data_csv.csv')

**EDA**

In [3]:
df.shape
df.isnull().sum()

id                          0
address_state               0
application_type            0
emp_length                  0
emp_title                1438
grade                       0
home_ownership              0
issue_date                  0
last_credit_pull_date       0
last_payment_date           0
loan_status                 0
next_payment_date           0
member_id                   0
purpose                     0
sub_grade                   0
term                        0
verification_status         0
annual_income               0
dti                         0
installment                 0
int_rate                    0
loan_amount                 0
total_acc                   0
total_payment               0
dtype: int64

In [4]:
# Drops all rows that are exact duplicates
df.drop_duplicates(inplace=True)

In [5]:
# Drop uninformative columns
df = df.drop(['id', 'emp_title', 'member_id', 'address_state', 'issue_date', 'last_credit_pull_date', 'last_payment_date', 'next_payment_date' ], axis=1, errors='ignore')

# df['application_type'].value_counts() # Only INDIVIDUAL is the value in all rows
df = df.drop(['application_type'], axis=1, errors='ignore')

# total payments : This is post-loan info → leaks future outcome!
df = df.drop(['total_payment'], axis=1, errors='ignore')

df.sample(10)

Unnamed: 0,emp_length,grade,home_ownership,loan_status,purpose,sub_grade,term,verification_status,annual_income,dti,installment,int_rate,loan_amount,total_acc
33356,< 1 year,B,MORTGAGE,Fully Paid,other,B5,36 months,Not Verified,92000.0,0.0828,184.56,0.1146,5600,10
27115,10+ years,B,MORTGAGE,Fully Paid,home improvement,B4,60 months,Not Verified,64000.0,0.1498,165.69,0.0999,7800,41
23947,< 1 year,E,RENT,Fully Paid,Debt consolidation,E1,60 months,Verified,35004.0,0.097,388.67,0.1595,16000,19
7046,7 years,B,RENT,Charged Off,Debt consolidation,B4,36 months,Not Verified,69600.0,0.1572,162.26,0.1039,5000,13
27919,2 years,C,RENT,Charged Off,house,C2,36 months,Not Verified,29000.0,0.1717,84.05,0.1284,2500,11
6152,10+ years,D,MORTGAGE,Fully Paid,credit card,D3,60 months,Verified,82000.0,0.2138,447.96,0.1649,35000,31
23780,4 years,D,RENT,Fully Paid,Debt consolidation,D5,60 months,Verified,57250.0,0.2325,541.23,0.1825,21200,30
27532,3 years,B,MORTGAGE,Fully Paid,home improvement,B5,60 months,Verified,133000.0,0.0595,395.87,0.1199,28000,27
28046,< 1 year,D,MORTGAGE,Fully Paid,house,D4,36 months,Source Verified,305000.0,0.1215,1257.98,0.1758,35000,30
6659,< 1 year,A,RENT,Fully Paid,Debt consolidation,A4,36 months,Verified,45000.0,0.0925,625.81,0.079,20000,10


**ENCODING**

In [6]:

df['emp_length'].value_counts()

# Clean
df['emp_length_clean'] = df['emp_length'].str.replace(r'[^0-9]+', '', regex=True)
df['emp_length_clean'] = df['emp_length_clean'].replace('', '0').astype(int)

# Ordinal Encoding Use Scikit-Learn’s OrdinalEncoder
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[[
    '< 1 year', '1 year', '2 years', '3 years', '4 years', 
    '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years'
]])

df['emp_length_encoded'] = encoder.fit_transform(df[['emp_length']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['emp_length', 'emp_length_clean'], axis=1, errors='ignore')
df.rename(columns={'emp_length_encoded': 'emp_length'}, inplace=True)


In [7]:

df['grade'].value_counts()

# Ordinal Encoding Use Scikit-Learn’s OrdinalEncoder
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['A','B','C','D','E','F','G']])
df['grade_encoded'] = encoder.fit_transform(df[['grade']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['grade'], axis=1, errors='ignore')
df.rename(columns={'grade_encoded': 'grade'}, inplace=True)

In [8]:
df['purpose'].value_counts()

# Frequency Encoding (More compact & meaningful)
# Replace each category with the proportion of loans with that purpose

freq = df['purpose'].value_counts(normalize=True)
df['purpose_encoded'] = df['purpose'].map(freq)

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['purpose'], axis=1, errors='ignore')
df.rename(columns={'purpose_encoded': 'purpose'}, inplace=True)

In [9]:
df['sub_grade'].value_counts()

# from sklearn.preprocessing import OrdinalEncoder

# Define all 35 sub-grades in correct order (A1 = best, G5 = worst)
letters = ['A','B','C','D','E','F','G']
sub_grades_ordered = [f"{l}{n}" for l in letters for n in range(1,6)]

# Create the encoder with the specified categories
encoder = OrdinalEncoder(categories=[sub_grades_ordered])

# Fit and transform the column
df['sub_grade_encoded'] = encoder.fit_transform(df[['sub_grade']])

# Optional: check
df[['sub_grade', 'sub_grade_encoded']].head(10)


# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['sub_grade'], axis=1, errors='ignore')
df.rename(columns={'sub_grade_encoded': 'sub_grade'}, inplace=True)

# # Since Subgrade is there we can drop grade
# df = df.drop(['grade'], axis=1, errors='ignore')


In [10]:
df['term'].value_counts()

# Using OrdinalEncoder (scikit-learn)
# from sklearn.preprocessing import OrdinalEncoder

# Strip leading/trailing whitespace from the 'term' column
df['term'] = df['term'].str.strip()

encoder = OrdinalEncoder(categories=[['36 months', '60 months']])
df['term_encoded'] = encoder.fit_transform(df[['term']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['term'], axis=1, errors='ignore')
df.rename(columns={'term_encoded': 'term'}, inplace=True)


In [11]:
df['verification_status'].value_counts()

# Using OrdinalEncoder (scikit-learn)
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['Not Verified', 'Verified', 'Source Verified']])
df['verification_status_encoded'] = encoder.fit_transform(df[['verification_status']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['verification_status'], axis=1, errors='ignore')
df.rename(columns={'verification_status_encoded': 'verification_status'}, inplace=True)


In [12]:
df['home_ownership'].value_counts()

home_ownership_mapping = {
    'NONE': 0,
    'OTHER': 0,
    'RENT': 1,
    'MORTGAGE': 2,
    'OWN': 3
}

df['home_ownership_encoded'] = df['home_ownership'].map(home_ownership_mapping)

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['home_ownership'], axis=1, errors='ignore')
df.rename(columns={'home_ownership_encoded': 'home_ownership'}, inplace=True)


**HANDLING CLASS IMBALANCE**

In [13]:
df['loan_status'].value_counts()

loan_status
Fully Paid     32145
Charged Off     5333
Current         1098
Name: count, dtype: int64

In [14]:
# Only train on Fully Paid vs Charged Off
df_train = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

# Encode target: Fully Paid = 0, Charged Off = 1
df_train['loan_status_encoded'] = df_train['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})


In [15]:
# class_weight
# from sklearn.utils.class_weight import compute_class_weight

y = df_train['loan_status_encoded'].values

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(class_weight_dict)


# S_M_O_T_E
# from imblearn.over_sampling import SMOTE

# X = df_train[categorical_cols + numeric_cols]
# y = df_train['loan_status_encoded']

# smote = SMOTE(random_state=42)
# X_res, y_res = smote.fit_resample(X, y)



{0: np.float64(0.582952247627936), 1: np.float64(3.5137821113819614)}


**FEATURE SCALING**

In [16]:
df_train.sample(10)

Unnamed: 0,loan_status,annual_income,dti,installment,int_rate,loan_amount,total_acc,emp_length,grade,purpose,sub_grade,term,verification_status,home_ownership,loan_status_encoded
20456,Charged Off,99996.0,0.1578,445.1,0.1036,25000,21,10.0,1.0,0.472159,9.0,1.0,2.0,2,1
4693,Fully Paid,100000.0,0.0,487.99,0.0617,16000,21,8.0,0.0,0.129562,2.0,0.0,1.0,2,0
1041,Fully Paid,36000.0,0.1283,152.95,0.0999,7200,35,6.0,1.0,0.038807,5.0,1.0,0.0,2,0
21362,Fully Paid,30000.0,0.1572,221.44,0.1361,9600,15,0.0,2.0,0.472159,11.0,1.0,2.0,1,0
20084,Fully Paid,53000.0,0.0369,70.15,0.0751,3500,29,1.0,0.0,0.472159,3.0,1.0,0.0,1,0
23213,Fully Paid,155000.0,0.1205,789.66,0.1969,30000,52,5.0,4.0,0.472159,24.0,1.0,1.0,2,0
24894,Fully Paid,14000.0,0.1011,57.41,0.1316,1700,26,0.0,2.0,0.008166,12.0,0.0,0.0,1,0
18547,Fully Paid,56544.0,0.1986,760.75,0.1171,23000,25,3.0,1.0,0.472159,7.0,0.0,1.0,1,0
1721,Fully Paid,50000.0,0.091,322.25,0.0991,10000,11,3.0,1.0,0.129562,5.0,0.0,2.0,1,0
12370,Fully Paid,61284.0,0.2187,544.17,0.1825,15000,30,3.0,3.0,0.472159,19.0,0.0,0.0,1,0


In [17]:
categorical_cols = ['emp_length', 'sub_grade', 'term', 'verification_status', 'home_ownership', 'purpose']

# columns for Feature Scaling
numeric_cols = ['annual_income','dti','installment','int_rate','loan_amount','total_acc']

In [18]:
# from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_train[numeric_cols] = scaler.fit_transform(df_train[numeric_cols])

In [19]:
# from sklearn.model_selection import train_test_split

X = df_train[categorical_cols + numeric_cols]
y = df_train['loan_status_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [20]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Initialize model with class weights to handle imbalance
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]  # For ROC-AUC


In [21]:
import pandas as pd
coefficients = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': model.coef_[0]
}).sort_values(by='coefficient', ascending=False)

print(coefficients)


                feature  coefficient
9              int_rate     0.595555
2                  term     0.487873
10          loan_amount     0.102840
7                   dti     0.051874
0            emp_length     0.012751
11            total_acc    -0.005135
1             sub_grade    -0.007220
3   verification_status    -0.008588
4        home_ownership    -0.032698
8           installment    -0.109834
5               purpose    -0.251750
6         annual_income    -0.287677


In [22]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6420757737459979
