# Credit Risk


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set pandas display options to show numbers in normal format


In [2]:
df = pd.read_csv('accepted.csv')
#df.head()
#df.info()
#df.describe()

  df = pd.read_csv('accepted.csv')


In [3]:
#Filter out all loans that are not fully paid or charged off
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
#Changing the loan_status to binary_encoding
df['loan_status'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})


In [4]:
#print("Application Type:")
#print(df['application_type'].value_counts())
#print("\nPurpose:")
#print(df['purpose'].value_counts())
#print("\nGrade:")
#print(df['grade'].value_counts())
#print("\nSub Grade:")
#print(df['sub_grade'].value_counts())
#print("\nFICO Range High:")
#print(df['fico_range_high'].value_counts())
#print("\nFICO Range Low:")
#print(df['fico_range_low'].value_counts())

In [5]:
#Filter for only individual loans
df = df[df['application_type'] == 'Individual']
df['fico_score'] = (df['fico_range_low'] + df['fico_range_high']) / 2

# Verify the calculation worked
print(f"Rows after filter: {len(df):,}")
print(f"FICO score created. Sample:")
print(df[['fico_range_low', 'fico_range_high', 'fico_score']].head())

Rows after filter: 1,319,510
FICO score created. Sample:
   fico_range_low  fico_range_high  fico_score
0           675.0            679.0       677.0
1           715.0            719.0       717.0
4           695.0            699.0       697.0
5           690.0            694.0       692.0
6           680.0            684.0       682.0


In [6]:
df['fico_score'].describe()

count    1.319510e+06
mean     6.980267e+02
std      3.170712e+01
min      6.270000e+02
25%      6.720000e+02
50%      6.920000e+02
75%      7.120000e+02
max      8.475000e+02
Name: fico_score, dtype: float64

In [7]:
safe_features = [
    # Loan characteristics
    'loan_amnt',
    'term',  
    'purpose',
    'installment',
    'disbursement_method',
    
    # Borrower characteristics
    'annual_inc',
    'dti',
    'emp_length',
    'home_ownership',
    'addr_state',
    
    # Credit history
    'fico_score',
    'delinq_2yrs',
    'revol_util',
    'revol_bal',
    'inq_last_6mths',
    'pub_rec',
    'mths_since_last_delinq',
]

In [8]:
X = df[safe_features]
y = df['loan_status']
#print(f"X shape: {X.shape}, y shape: {y.shape}")


In [9]:
#Check for missing values
#print(f"Missing values: {X.isnull().sum()}")




In [10]:
#print value counts for each column
#for col in X.columns:
#    print(f"{col}:")
#    print(X[col].value_counts())
#    print("\n")



In [11]:
# Handle missing values FIRST (before encoding)
X = df[safe_features].copy()

# revol_util: impute median and flag missing
X['revol_util_missing'] = X['revol_util'].isna().astype(int)
X['revol_util'] = X['revol_util'].fillna(X['revol_util'].median())

# emp_length: impute "Unknown" before one-hot
X['emp_length'] = X['emp_length'].fillna('Unknown')

# mths_since_last_delinq: impute max+1 and flag missing
msld_missing = X['mths_since_last_delinq'].isna()
max_msld = X['mths_since_last_delinq'].max(skipna=True)
X['mths_since_last_delinq_missing'] = msld_missing.astype(int)
X.loc[msld_missing, 'mths_since_last_delinq'] = max_msld + 1

# Handle remaining missing values
X['inq_last_6mths'] = X['inq_last_6mths'].fillna(0)

# Binary-encode naturally binary columns
X['term_60m'] = (X['term'] == '60 months').astype(int)
X['disbursement_directpay'] = (X['disbursement_method'] == 'DirectPay').astype(int)

print(f"Missing values after imputation: {X.isnull().sum().sum()}")

Missing values after imputation: 0


In [12]:
# Label encode remaining multi-class categoricals
from sklearn.preprocessing import LabelEncoder

X = X.drop(columns=['term', 'disbursement_method'])
cat_cols = ['purpose', 'home_ownership', 'emp_length', 'addr_state']

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

print(f"X shape after label encoding: {X.shape}")

X shape after label encoding: (1319510, 19)


In [13]:
# Random Forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8014
ROC-AUC: 0.6781

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.99      0.89    211459
           1       0.51      0.03      0.06     52443

    accuracy                           0.80    263902
   macro avg       0.66      0.51      0.47    263902
weighted avg       0.75      0.80      0.72    263902

