# **Report on Credit Default Risk Assessment Model**

## ***Introduction***

## ***Imports***

In [2]:
import os

%matplotlib inline
import string
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os
import re
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import tests_hw5
from sklearn import datasets
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

## ***Data Reading & Data Splitting***

Dataset retrieved from: https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset

In [10]:
# Load the raw credit-card default dataset; the path is relative to the
# notebook's working directory.
credit_df = pd.read_csv(filepath_or_buffer="data/UCI_Credit_Card.csv")

# Preview the first five rows to check that the file was parsed as expected.
credit_df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [12]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
train_df, test_df = train_test_split(
    credit_df,
    test_size=0.3,
    random_state=123,
)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
16395,16396,320000.0,2,1,2,36,0,0,0,0,...,19370.0,10155.0,3788.0,5000.0,5018.0,1000.0,3000.0,0.0,7013.0,0
21448,21449,440000.0,2,1,2,30,-1,-1,-1,0,...,171244.0,150897.0,117870.0,612.0,87426.0,130007.0,3018.0,15000.0,51663.0,0
20034,20035,160000.0,2,3,1,44,-2,-2,-2,-2,...,-18.0,-18.0,-18.0,0.0,0.0,0.0,0.0,0.0,0.0,0
25755,25756,120000.0,2,2,1,30,0,0,0,0,...,103058.0,71095.0,47379.0,3706.0,5502.0,4204.0,3017.0,2005.0,1702.0,0
1438,1439,50000.0,1,2,2,54,1,2,0,0,...,27585.0,27910.0,27380.0,0.0,1400.0,1200.0,1500.0,1000.0,1500.0,0


#### Key Observations from Initial Dataset Review:

All features in the dataset are represented numerically, which simplifies the preprocessing phase. However, the features are on very different scales (for example, LIMIT_BAL takes values in the tens or hundreds of thousands, while AGE is in the tens), so scaling the numeric features is likely to be beneficial. Additionally, certain features, such as the ID column (which is only a row identifier), are irrelevant to our analysis and can be dropped from the dataset.

The feature descriptions show that some columns, although stored as numbers, actually represent categorical features. The target variable, default.payment.next.month, is binary (0 or 1), which indicates that this should be treated as a classification problem.

## ***Preliminary Exploratory Data Analysis (EDA)***

### Summary Statistics

In [13]:
# Summarize each column's dtype and non-null count for the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 16395 to 19966
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          21000 non-null  int64  
 1   LIMIT_BAL                   21000 non-null  float64
 2   SEX                         21000 non-null  int64  
 3   EDUCATION                   21000 non-null  int64  
 4   MARRIAGE                    21000 non-null  int64  
 5   AGE                         21000 non-null  int64  
 6   PAY_0                       21000 non-null  int64  
 7   PAY_2                       21000 non-null  int64  
 8   PAY_3                       21000 non-null  int64  
 9   PAY_4                       21000 non-null  int64  
 10  PAY_5                       21000 non-null  int64  
 11  PAY_6                       21000 non-null  int64  
 12  BILL_AMT1                   21000 non-null  float64
 13  BILL_AMT2                  

#### Comments

First, none of the features have missing values, so no imputation is needed. All values in the dataset are stored as numeric data types (int64 or float64). However, caution is needed when interpreting certain features, namely 'SEX', 'MARRIAGE', 'EDUCATION', and 'PAY_0', 'PAY_2' through 'PAY_6' (the dataset has no 'PAY_1' column), because they are categorical variables encoded as numbers.

In [14]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training split.
train_df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,...,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0,21000.0
mean,14962.348238,167880.651429,1.600762,1.852143,1.554,35.50081,-0.015429,-0.137095,-0.171619,-0.225238,...,43486.610905,40428.518333,38767.202667,5673.585143,5895.027,5311.432286,4774.021381,4751.850095,5237.76219,0.223238
std,8650.73405,130202.682167,0.489753,0.792961,0.521675,9.212644,1.120465,1.194506,1.196123,1.168556,...,64843.303993,61187.200817,59587.689549,17033.241454,21801.43,18377.997079,15434.136142,15228.193125,18116.846563,0.416427
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-50616.0,-61372.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7498.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2293.75,1739.5,1215.75,1000.0,820.0,390.0,266.0,234.0,110.75,0.0
50%,14960.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19102.5,18083.0,16854.5,2100.0,2007.0,1809.5,1500.0,1500.0,1500.0,0.0
75%,22458.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54763.25,50491.0,49253.75,5007.25,5000.0,4628.5,4021.25,4016.0,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1227082.0,896040.0,621000.0,426529.0,528666.0,1.0


#### Comments

These summary statistics provide valuable insights. The mean values vary significantly across features: for example, the mean of LIMIT_BAL is about 167,881, whereas the mean of AGE is about 35.5. The ranges between the minimum and maximum values differ just as widely (AGE spans 21 to 79, while PAY_AMT2 spans 0 to 1,227,082). Together, these observations reinforce the need to scale the numeric features so that they are comparable in magnitude.