In [1]:
import pandas as pd
import numpy as np
import duckdb
from pathlib import Path
import os
from sklearn.impute import SimpleImputer

PREPROCESSING

Now that we have selected the columns we want to use, we need to prepare the data for the model. This means we need to transform the columns into numerical data so that we can train our XGBoost model.

In [2]:
# Project root. Set the CREDIT_RISK_AI_ROOT environment variable to run the
# notebook on another machine; when it is unset, the original location is used.
project_root = Path(os.environ.get('CREDIT_RISK_AI_ROOT', 'c:\\Renzo\\Projects\\credit-risk-ai\\'))

# The relative 'data/...' paths used in this notebook are resolved against the
# working directory, so switch to the project root first.
os.chdir(project_root)

df_clean = pd.read_parquet('data/processed.parquet')

**MISSING VALUE HANDLING + ENCODING**

In [3]:
# Frequency of each raw emp_length label (value_counts leaves NaN out by default)
df_clean['emp_length'].value_counts()

emp_length
10+ years    74570
2 years      20673
< 1 year     19904
3 years      18284
1 year       14893
4 years      13886
5 years      13840
6 years       9806
7 years       8178
9 years       7403
8 years       6925
Name: count, dtype: int64

In [4]:
# Number of rows where emp_length is missing
df_clean['emp_length'].isna().sum()

np.int64(16719)

In [5]:
# Ordinal encoding for emp_length: text label -> number of years
# ('< 1 year' becomes 0 and '10+ years' becomes 10).
emp_map = {'< 1 year': 0, '1 year': 1}
emp_map.update({f'{years} years': years for years in range(2, 10)})
emp_map['10+ years'] = 10

# Translate the text labels to years; labels absent from emp_map (including NaN)
# come back as NaN and are replaced by the sentinel -1 before the integer cast.
emp_length_years = df_clean['emp_length'].map(emp_map)
df_clean['emp_length'] = emp_length_years.fillna(-1).astype(int)

In [6]:
# Fill every NaN in the numeric columns with the sentinel -1.
#
# DataFrame.fillna is used instead of SimpleImputer(strategy='constant'):
# the imputer returns one NumPy array for the whole selection, so assigning it
# back can upcast integer columns (e.g. emp_length, which was just cast to int)
# to float64. fillna writes the same values while keeping each column's dtype.

num_cols = df_clean.select_dtypes(include=np.number).columns

df_clean[num_cols] = df_clean[num_cols].fillna(-1)



In [7]:
# Inspect the grade distribution before transforming grade and sub_grade to numerical values

df_clean['grade'].value_counts()

grade
C    69479
B    62105
A    39646
D    34091
E    13201
F     4296
G     2263
Name: count, dtype: int64

In [8]:
# Ordinal encoding for the loan grade: 'A' -> 1 up to 'G' -> 7.
grade_map = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}

# Add the ordinal version of grade as a new column; the original grade column is kept for now.
grade_rank = df_clean['grade'].map(grade_map)
df_clean['grade_ord'] = grade_rank

In [9]:
# Ordinal encoding for sub_grade: every grade has five steps (1-5), numbered
# consecutively across grades, so 'A1' -> 1, 'A5' -> 5, 'B1' -> 6, ... 'G5' -> 35.
subgrade_map = {}
for letter, rank in grade_map.items():
    for step in range(1, 6):
        subgrade_map[f'{letter}{step}'] = (rank - 1) * 5 + step
print(subgrade_map)

{'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5, 'B1': 6, 'B2': 7, 'B3': 8, 'B4': 9, 'B5': 10, 'C1': 11, 'C2': 12, 'C3': 13, 'C4': 14, 'C5': 15, 'D1': 16, 'D2': 17, 'D3': 18, 'D4': 19, 'D5': 20, 'E1': 21, 'E2': 22, 'E3': 23, 'E4': 24, 'E5': 25, 'F1': 26, 'F2': 27, 'F3': 28, 'F4': 29, 'F5': 30, 'G1': 31, 'G2': 32, 'G3': 33, 'G4': 34, 'G5': 35}


In [10]:
# Add the ordinal version of sub_grade as a new column.
sub_grade_rank = df_clean['sub_grade'].map(subgrade_map)
df_clean['sub_grade_ord'] = sub_grade_rank

In [11]:
# One-hot encode home ownership, purpose and verification status.
# drop_first=True leaves out the first level of each feature; dtype=int emits 0/1 columns.
nominal_cols = ['home_ownership', 'purpose', 'verification_status']

df_clean = pd.get_dummies(
    df_clean,
    columns=nominal_cols,
    drop_first=True,
    dtype=int,
)

In [12]:
# Temporal split: loans issued in 2017 form the training set and loans issued
# in 2018 the test set. issue_year is a temporary helper column derived from issue_d.

df_clean['issue_year'] = df_clean['issue_d'].dt.year

train_df = df_clean[df_clean['issue_year'] == 2017].copy()
test_df = df_clean[df_clean['issue_year'] == 2018].copy()

# Give test_df exactly the training columns (join='left'); a column that
# test_df lacked would be created and filled with 0.
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Target (mean) encoding of addr_state, fitted on the training rows only so no
# test labels are used.
state_target = train_df.groupby('addr_state')['target'].mean()

# Fallback for states that have no fitted value (unseen in training, or missing):
# the overall training target rate. The unweighted mean of the per-state means
# (state_target.mean()) is a different quantity, because it gives a tiny state
# the same weight as a large one.
global_target_rate = train_df['target'].mean()

# Apply the same fallback to both frames so neither ends up with NaN in the encoded column.
train_df['addr_state_te'] = train_df['addr_state'].map(state_target).fillna(global_target_rate)
test_df['addr_state_te'] = test_df['addr_state'].map(state_target).fillna(global_target_rate)

In [13]:
# issue_d and the derived issue_year were only needed for the train/test split;
# remove them from all three frames.
split_helper_cols = ['issue_d', 'issue_year']

train_df = train_df.drop(columns=split_helper_cols)
test_df = test_df.drop(columns=split_helper_cols)
df_clean = df_clean.drop(columns=split_helper_cols)

In [14]:
# Raw categorical columns that now have encoded counterparts
# (grade_ord, sub_grade_ord, addr_state_te).
final_drop = [
    'grade',
    'sub_grade',
    'addr_state',
]

In [15]:
# Remove the columns listed in final_drop from the train, test and full frames.
train_df, test_df, df_clean = (
    frame.drop(columns=final_drop)
    for frame in (train_df, test_df, df_clean)
)

In [16]:
# (rows, columns) of the final training frame
train_df.shape

(168988, 43)

In [17]:
# Final training columns: the target, the numeric and encoded features, and addr_state_te
train_df.columns

Index(['target', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'revol_bal', 'revol_util', 'total_acc', 'open_acc', 'pub_rec',
       'inq_last_6mths', 'delinq_2yrs', 'emp_length', 'credit_age',
       'payment_to_income', 'revol_utilization_trend', 'log_annual_inc',
       'log_loan_amnt', 'log_revol_bal', 'term_months', 'fico_avg',
       'grade_ord', 'sub_grade_ord', 'home_ownership_MORTGAGE',
       'home_ownership_NONE', 'home_ownership_OWN', 'home_ownership_RENT',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase',
       'purpose_medical', 'purpose_moving', 'purpose_other',
       'purpose_renewable_energy', 'purpose_small_business',
       'purpose_vacation', 'purpose_wedding',
       'verification_status_Source Verified', 'verification_status_Verified',
       'addr_state_te'],
      dtype='object')

In [18]:
# (rows, columns) of the final test frame; the column count should match train_df
test_df.shape

(56093, 43)

In [19]:
# Final test columns; these should be identical to, and in the same order as, train_df.columns
test_df.columns

Index(['target', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'revol_bal', 'revol_util', 'total_acc', 'open_acc', 'pub_rec',
       'inq_last_6mths', 'delinq_2yrs', 'emp_length', 'credit_age',
       'payment_to_income', 'revol_utilization_trend', 'log_annual_inc',
       'log_loan_amnt', 'log_revol_bal', 'term_months', 'fico_avg',
       'grade_ord', 'sub_grade_ord', 'home_ownership_MORTGAGE',
       'home_ownership_NONE', 'home_ownership_OWN', 'home_ownership_RENT',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase',
       'purpose_medical', 'purpose_moving', 'purpose_other',
       'purpose_renewable_energy', 'purpose_small_business',
       'purpose_vacation', 'purpose_wedding',
       'verification_status_Source Verified', 'verification_status_Verified',
       'addr_state_te'],
      dtype='object')

In [20]:
# Columns of the full encoded frame; addr_state_te is only added to train_df/test_df, so it is absent here
df_clean.columns

Index(['target', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'revol_bal', 'revol_util', 'total_acc', 'open_acc', 'pub_rec',
       'inq_last_6mths', 'delinq_2yrs', 'emp_length', 'credit_age',
       'payment_to_income', 'revol_utilization_trend', 'log_annual_inc',
       'log_loan_amnt', 'log_revol_bal', 'term_months', 'fico_avg',
       'grade_ord', 'sub_grade_ord', 'home_ownership_MORTGAGE',
       'home_ownership_NONE', 'home_ownership_OWN', 'home_ownership_RENT',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase',
       'purpose_medical', 'purpose_moving', 'purpose_other',
       'purpose_renewable_energy', 'purpose_small_business',
       'purpose_vacation', 'purpose_wedding',
       'verification_status_Source Verified', 'verification_status_Verified'],
      dtype='object')

In [21]:
# (rows, columns) of the full encoded frame (train and test years together)
df_clean.shape

(225081, 42)

In [22]:
# Persist the full encoded frame and the final train/test splits as parquet,
# without writing the DataFrame index.
outputs = {
    'data/processed_encoded.parquet': df_clean,
    'data/train_encoded.parquet': train_df,
    'data/test_encoded.parquet': test_df,
}

for out_path, frame in outputs.items():
    frame.to_parquet(out_path, index=False)