**IMPORTS**

In [563]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


import warnings
warnings.filterwarnings("ignore")

In [564]:
df = pd.read_csv('financial_loan_data_csv.csv')

**EDA**

In [565]:
df.shape
df.isnull().sum()

id                          0
address_state               0
application_type            0
emp_length                  0
emp_title                1438
grade                       0
home_ownership              0
issue_date                  0
last_credit_pull_date       0
last_payment_date           0
loan_status                 0
next_payment_date           0
member_id                   0
purpose                     0
sub_grade                   0
term                        0
verification_status         0
annual_income               0
dti                         0
installment                 0
int_rate                    0
loan_amount                 0
total_acc                   0
total_payment               0
dtype: int64

In [566]:
# Drops all rows that are exact duplicates
df.drop_duplicates(inplace=True)

In [567]:
# Drop uninformative columns
df = df.drop(['id', 'emp_title', 'member_id', 'address_state', 'issue_date', 'last_credit_pull_date', 'last_payment_date', 'next_payment_date' ], axis=1, errors='ignore')

# df['application_type'].value_counts() # Only INDIVIDUAL is the value in all rows
df = df.drop(['application_type'], axis=1, errors='ignore')

# total payments : This is post-loan info → leaks future outcome!
df = df.drop(['total_payment'], axis=1, errors='ignore')

df.sample(10)

Unnamed: 0,emp_length,grade,home_ownership,loan_status,purpose,sub_grade,term,verification_status,annual_income,dti,installment,int_rate,loan_amount,total_acc
10230,3 years,A,RENT,Fully Paid,Debt consolidation,A4,36 months,Not Verified,33000.0,0.1822,149.29,0.0749,4800,11
2800,4 years,A,RENT,Fully Paid,credit card,A5,36 months,Not Verified,65000.0,0.1124,272.15,0.0788,8700,8
31107,8 years,C,MORTGAGE,Fully Paid,moving,C4,36 months,Not Verified,48000.0,0.104,66.4,0.1197,2000,16
36522,< 1 year,B,MORTGAGE,Fully Paid,small business,B1,36 months,Verified,80004.0,0.124,471.08,0.1095,14400,29
30288,5 years,B,RENT,Fully Paid,major purchase,B5,60 months,Verified,78000.0,0.0472,221.74,0.1186,10000,31
25476,10+ years,A,MORTGAGE,Fully Paid,home improvement,A3,36 months,Not Verified,80000.0,0.1137,308.73,0.0699,10000,36
20538,5 years,F,MORTGAGE,Charged Off,Debt consolidation,F4,60 months,Source Verified,72000.0,0.2333,491.94,0.1941,18800,42
36344,< 1 year,D,RENT,Fully Paid,small business,D3,36 months,Source Verified,75000.0,0.0989,522.04,0.1528,15000,22
7725,1 year,D,MORTGAGE,Fully Paid,Debt consolidation,D3,36 months,Not Verified,80000.0,0.2073,676.2,0.1324,20000,22
15653,6 years,D,RENT,Fully Paid,Debt consolidation,D1,36 months,Source Verified,52000.0,0.2492,308.06,0.1411,9000,11


**ENCODING**

In [568]:

df['emp_length'].value_counts()

# Clean
df['emp_length_clean'] = df['emp_length'].str.replace(r'[^0-9]+', '', regex=True)
df['emp_length_clean'] = df['emp_length_clean'].replace('', '0').astype(int)

# Ordinal Encoding Use Scikit-Learn’s OrdinalEncoder
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[[
    '< 1 year', '1 year', '2 years', '3 years', '4 years', 
    '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years'
]])

df['emp_length_encoded'] = encoder.fit_transform(df[['emp_length']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['emp_length', 'emp_length_clean'], axis=1, errors='ignore')
df.rename(columns={'emp_length_encoded': 'emp_length'}, inplace=True)


In [569]:

df['grade'].value_counts()

# Ordinal Encoding Use Scikit-Learn’s OrdinalEncoder
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['A','B','C','D','E','F','G']])
df['grade_encoded'] = encoder.fit_transform(df[['grade']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['grade'], axis=1, errors='ignore')
df.rename(columns={'grade_encoded': 'grade'}, inplace=True)

In [570]:
df['purpose'].value_counts()

# Frequency Encoding (More compact & meaningful)
# Replace each category with the proportion of loans with that purpose

freq = df['purpose'].value_counts(normalize=True)
df['purpose_encoded'] = df['purpose'].map(freq)

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['purpose'], axis=1, errors='ignore')
df.rename(columns={'purpose_encoded': 'purpose'}, inplace=True)

In [571]:
df['sub_grade'].value_counts()

# from sklearn.preprocessing import OrdinalEncoder

# Define all 35 sub-grades in correct order (A1 = best, G5 = worst)
letters = ['A','B','C','D','E','F','G']
sub_grades_ordered = [f"{l}{n}" for l in letters for n in range(1,6)]

# Create the encoder with the specified categories
encoder = OrdinalEncoder(categories=[sub_grades_ordered])

# Fit and transform the column
df['sub_grade_encoded'] = encoder.fit_transform(df[['sub_grade']])

# Optional: check
df[['sub_grade', 'sub_grade_encoded']].head(10)


# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['sub_grade'], axis=1, errors='ignore')
df.rename(columns={'sub_grade_encoded': 'sub_grade'}, inplace=True)

# # Since Subgrade is there we can drop grade
# df = df.drop(['grade'], axis=1, errors='ignore')


In [572]:
df['term'].value_counts()

# Using OrdinalEncoder (scikit-learn)
# from sklearn.preprocessing import OrdinalEncoder

# Strip leading/trailing whitespace from the 'term' column
df['term'] = df['term'].str.strip()

encoder = OrdinalEncoder(categories=[['36 months', '60 months']])
df['term_encoded'] = encoder.fit_transform(df[['term']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['term'], axis=1, errors='ignore')
df.rename(columns={'term_encoded': 'term'}, inplace=True)


In [573]:
df['verification_status'].value_counts()

# Using OrdinalEncoder (scikit-learn)
# from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['Not Verified', 'Verified', 'Source Verified']])
df['verification_status_encoded'] = encoder.fit_transform(df[['verification_status']])

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['verification_status'], axis=1, errors='ignore')
df.rename(columns={'verification_status_encoded': 'verification_status'}, inplace=True)


In [574]:
df['home_ownership'].value_counts()

home_ownership_mapping = {
    'NONE': 0,
    'OTHER': 0,
    'RENT': 1,
    'MORTGAGE': 2,
    'OWN': 3
}

df['home_ownership_encoded'] = df['home_ownership'].map(home_ownership_mapping)

# Drop original column and rename the new column as the original one  || just to keep things clean
df = df.drop(['home_ownership'], axis=1, errors='ignore')
df.rename(columns={'home_ownership_encoded': 'home_ownership'}, inplace=True)


**OUTLIER REMOVAL || TRANSFORMATION** 

In [575]:
categorical_cols = ['emp_length', 'sub_grade', 'term', 'verification_status', 'home_ownership', 'purpose']

# columns for Feature Scaling
numeric_cols = ['annual_income','dti','installment','int_rate','loan_amount','total_acc']

In [576]:
# # Copy dataframe to avoid modifying original
df_clean = df.copy()

for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Keep only rows within 1.5*IQR
    df_clean = df_clean[~((df_clean[col] < (Q1 - 1.5 * IQR)) | 
                          (df_clean[col] > (Q3 + 1.5 * IQR)))]

# Use log1p to handle zero values safely
# df_clean['annual_income'] = np.log1p(df_clean['annual_income'])
# df_clean['loan_amount'] = np.log1p(df_clean['loan_amount'])



In [577]:
# print("Rows before:", df.shape[0])
# print("Rows after:", df_clean.shape[0])


In [578]:
df.sample(10)

Unnamed: 0,loan_status,annual_income,dti,installment,int_rate,loan_amount,total_acc,emp_length,grade,purpose,sub_grade,term,verification_status,home_ownership
3909,Fully Paid,62000.0,0.1746,318.82,0.1199,9600,19,7.0,1.0,0.129562,9.0,0.0,2.0,2
36193,Fully Paid,90000.0,0.1124,483.94,0.0999,15000,29,1.0,1.0,0.046039,5.0,0.0,2.0,2
23493,Fully Paid,49000.0,0.1136,434.75,0.1099,20000,17,10.0,1.0,0.472159,7.0,1.0,1.0,1
32721,Fully Paid,37500.0,0.0771,135.72,0.0542,4500,33,0.0,0.0,0.099129,0.0,0.0,0.0,1
1627,Charged Off,28000.0,0.0849,411.71,0.1427,12000,10,2.0,2.0,0.129562,11.0,0.0,2.0,1
38347,Fully Paid,63800.0,0.1621,386.82,0.1474,11200,19,6.0,3.0,0.024056,17.0,0.0,1.0,1
7365,Charged Off,30000.0,0.056,317.72,0.0894,10000,7,0.0,0.0,0.472159,4.0,0.0,0.0,1
32315,Fully Paid,116000.0,0.0078,388.62,0.1025,12000,30,3.0,1.0,0.099129,6.0,0.0,0.0,2
20646,Charged Off,64000.0,0.2023,291.76,0.1599,12000,21,0.0,3.0,0.472159,16.0,1.0,2.0,1
3420,Fully Paid,106000.0,0.0932,676.02,0.1322,20000,17,9.0,2.0,0.129562,11.0,0.0,0.0,1


**HANDLING CLASS IMBALANCE**

In [579]:
df_clean['loan_status'].value_counts()

loan_status
Fully Paid     28786
Charged Off     4718
Current          888
Name: count, dtype: int64

In [580]:
# Only train on Fully Paid vs Charged Off
df_clean = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

# Encode target: Fully Paid = 0, Charged Off = 1
df_clean['loan_status_encoded'] = df_clean['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})


In [581]:
# class_weight
# from sklearn.utils.class_weight import compute_class_weight

# y = df_clean['loan_status_encoded'].values

# class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
# class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
# print(class_weight_dict)


In [582]:
# Import
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df_clean[categorical_cols + numeric_cols]
y = df_clean['loan_status_encoded']

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print("Original dataset shape:", X.shape)
print("Resampled dataset shape:", X_res.shape)


Original dataset shape: (37478, 12)
Resampled dataset shape: (64290, 12)


**FEATURE SCALING**

In [583]:
df_clean.sample(10)

Unnamed: 0,loan_status,annual_income,dti,installment,int_rate,loan_amount,total_acc,emp_length,grade,purpose,sub_grade,term,verification_status,home_ownership,loan_status_encoded
11328,Fully Paid,35000.0,0.2314,430.23,0.1242,12875,30,10.0,1.0,0.472159,8.0,0.0,0.0,1,0
25062,Fully Paid,150000.0,0.0272,235.25,0.0807,7500,60,1.0,0.0,0.074554,3.0,0.0,0.0,2,0
19034,Fully Paid,48000.0,0.2193,498.38,0.1689,14000,9,2.0,3.0,0.472159,18.0,0.0,1.0,1,0
17426,Fully Paid,48500.0,0.1136,223.01,0.1427,6500,28,1.0,2.0,0.472159,11.0,0.0,1.0,2,0
6674,Fully Paid,73000.0,0.2247,773.03,0.1596,22000,32,5.0,2.0,0.472159,14.0,0.0,1.0,2,0
18156,Fully Paid,24000.0,0.094,205.85,0.0542,6825,25,0.0,0.0,0.472159,0.0,0.0,1.0,1,0
18019,Fully Paid,162000.0,0.1281,617.46,0.0699,20000,40,5.0,0.0,0.472159,2.0,0.0,1.0,1,0
2621,Fully Paid,36700.0,0.2102,62.22,0.0751,2000,19,7.0,0.0,0.129562,3.0,0.0,0.0,3,0
11711,Fully Paid,36000.0,0.1233,135.96,0.1361,4000,21,1.0,2.0,0.472159,11.0,0.0,0.0,1,0
23444,Fully Paid,65000.0,0.0639,202.36,0.1557,8400,21,10.0,3.0,0.472159,19.0,1.0,1.0,3,0


In [584]:
# from sklearn.preprocessing import StandardScaler

# Scale only numeric columns in the resampled data
scaler = StandardScaler()
X_res[numeric_cols] = scaler.fit_transform(X_res[numeric_cols])

# Check
X_res[numeric_cols].head()


Unnamed: 0,annual_income,dti,installment,int_rate,loan_amount,total_acc
0,-0.636334,-2.029579,-1.293012,0.705412,-1.187108,-1.636555
1,-0.327235,-1.326623,-1.056964,1.64926,-1.121937,-1.636555
2,-0.292891,1.183013,0.428901,0.898663,0.051147,-0.987207
3,-0.430268,-1.318543,-1.115833,-0.588527,-0.926423,-1.172735
4,0.27379,-1.817884,-1.070765,-1.882466,-1.056765,0.589781


In [585]:
# from sklearn.model_selection import train_test_split

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

**PCA**

In [591]:
from sklearn.decomposition import PCA

# Separate numeric and categorical features
X_train_num = X_train[numeric_cols]
X_train_cat = X_train[categorical_cols].values

X_test_num = X_test[numeric_cols]
X_test_cat = X_test[categorical_cols].values

# Apply PCA only on numeric features
n_components = 1
pca = PCA(n_components=n_components)
X_train_num_pca = pca.fit_transform(X_train_num)
X_test_num_pca = pca.transform(X_test_num)

# Concatenate PCA numeric + categorical features
X_train_final = np.hstack([X_train_num_pca, X_train_cat])
X_test_final = np.hstack([X_test_num_pca, X_test_cat])


In [593]:
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize model
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# Train
model.fit(X_train_final, y_train)

# Predict
y_pred = model.predict(X_test_final)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6658111681443459

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.68      0.67      6429
           1       0.67      0.65      0.66      6429

    accuracy                           0.67     12858
   macro avg       0.67      0.67      0.67     12858
weighted avg       0.67      0.67      0.67     12858



In [590]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

X_train_num = X_train[numeric_cols]
X_train_cat = X_train[categorical_cols].values

X_test_num = X_test[numeric_cols]
X_test_cat = X_test[categorical_cols].values

best_accuracy = 0
best_n_components = 0

# Max components = number of numeric features
max_components = len(numeric_cols)

for i in range(1, max_components + 1):
    pca = PCA(n_components=i)
    X_train_num_pca = pca.fit_transform(X_train_num)
    X_test_num_pca = pca.transform(X_test_num)
    
    # Concatenate categorical features back
    X_train_final = np.hstack([X_train_num_pca, X_train_cat])
    X_test_final = np.hstack([X_test_num_pca, X_test_cat])
    
    model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
    model.fit(X_train_final, y_train)
    
    y_pred = model.predict(X_test_final)
    current_accuracy = accuracy_score(y_test, y_pred)
    
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_n_components = i

print(f"Best n_components: {best_n_components}")
print(f"Maximum Accuracy: {best_accuracy:.4f}")


Best n_components: 1
Maximum Accuracy: 0.6658
