In [1]:
import numpy as np
# NOTE(review): this binds `plt` to the top-level matplotlib package, not to
# matplotlib.pyplot (the module `plt` conventionally refers to) — confirm which
# one is intended before using `plt` for plotting.
import matplotlib as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pickle
import warnings
# Silence FutureWarning messages for every cell executed after this one.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the credit-risk CSV into the working frame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# Summary statistics for the numeric columns.
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" to the output.
df.info()

         person_age  person_income  person_emp_length     loan_amnt  \
count  32581.000000   3.258100e+04       31686.000000  32581.000000   
mean      27.734600   6.607485e+04           4.789686   9589.371106   
std        6.348078   6.198312e+04           4.142630   6322.086646   
min       20.000000   4.000000e+03           0.000000    500.000000   
25%       23.000000   3.850000e+04           2.000000   5000.000000   
50%       26.000000   5.500000e+04           4.000000   8000.000000   
75%       30.000000   7.920000e+04           7.000000  12200.000000   
max      144.000000   6.000000e+06         123.000000  35000.000000   

       loan_int_rate   loan_status  loan_percent_income  \
count   29465.000000  32581.000000         32581.000000   
mean       11.011695      0.218164             0.170203   
std         3.240459      0.413006             0.106782   
min         5.420000      0.000000             0.000000   
25%         7.900000      0.000000             0.090000   
50%   

In [4]:
# Remove the loan_status column from the working frame.
df = df.drop(columns=['loan_status'])

In [5]:
# Count the missing values in each column.
df.isna().sum(axis=0)

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [6]:
# Print the median of each of the two columns, one value per line.
for column in ('person_emp_length', 'loan_int_rate'):
    print(df[column].median())

4.0
10.99


In [7]:
# Replace missing values in each of the two columns with that column's median.
# NOTE(review): the medians are taken over the full frame, i.e. before the
# train/test split performed later in the notebook, so rows that end up in the
# test set contribute to the fill values.
for column in ('person_emp_length', 'loan_int_rate'):
    df[column] = df[column].fillna(df[column].median())

In [8]:
# Re-count the missing values per column after the fill.
df.isna().sum(axis=0)

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [9]:
# Frequency of each loan grade, most common first.
# The top-level pd.value_counts() function is deprecated (pandas 2.1+); the Series
# method returns the same counts.
df['loan_grade'].value_counts()

A    10777
B    10451
C     6458
D     3626
E      964
F      241
G       64
Name: loan_grade, dtype: int64

In [10]:
# loan_grade is the prediction target; every other column is a feature.
y = df['loan_grade']
X = df.drop(columns=['loan_grade'])

In [11]:
# One-hot encode the categorical predictors and append the indicator columns to
# the remaining columns of X.
oh = OneHotEncoder()
cat_vars = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']
onehot = oh.fit_transform(X[cat_vars])
X = X.drop(cat_vars, axis=1)

# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2. Build the same
# "x<i>_<category>" labels it used to return directly from the fitted categories,
# so the column names do not depend on the installed scikit-learn version.
onehot_columns = [
    f"x{position}_{category}"
    for position, categories in enumerate(oh.categories_)
    for category in categories
]

# Give the indicator frame X's index so the join lines up row-for-row even when
# X is not indexed 0..n-1.
X_onehot = X.join(pd.DataFrame(onehot.toarray(), columns=onehot_columns, index=X.index))

In [12]:
# Display the feature frame produced by joining X with the one-hot indicator columns.
X_onehot

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,x0_MORTGAGE,x0_OTHER,x0_OWN,x0_RENT,x1_DEBTCONSOLIDATION,x1_EDUCATION,x1_HOMEIMPROVEMENT,x1_MEDICAL,x1_PERSONAL,x1_VENTURE,x2_N,x2_Y
0,22,59000,123.0,35000,16.02,0.59,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,21,9600,5.0,1000,11.14,0.10,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,25,9600,1.0,5500,12.87,0.57,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,23,65500,4.0,35000,15.23,0.53,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,24,54400,8.0,35000,14.27,0.55,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0.11,30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
32577,54,120000,4.0,17625,7.49,0.15,19,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
32578,65,76000,3.0,35000,10.99,0.46,28,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
32579,56,150000,5.0,15000,11.48,0.10,26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_onehot,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Fit a random forest on the training split.
# A random forest is stochastic; without a seed every run produces a different
# model and a different accuracy. 42 matches the seed used for train_test_split.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [15]:
# Predicted loan grades for the held-out rows.
y_pred = rfc.predict(X_test)

In [16]:
# Fraction of held-out rows whose predicted grade equals the true grade.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8617462022402946


In [17]:
# Persist the fitted forest and the fitted encoder as two separate pickle files,
# in that order.
for artifact_path, artifact in (("scoring.pkl", rfc), ("one_hot.pkl", oh)):
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

In [18]:
# Collect the values of the first row of df into a plain list.
l = [value for value in df.iloc[0]]

In [19]:
# Display the list built from df.iloc[0].
l

[22, 59000, 'RENT', 123.0, 'PERSONAL', 'D', 35000, 16.02, 0.59, 'Y', 3]

In [20]:
# Rebuild the first record as a one-row frame using df's column order, then
# remove the target column so only predictor columns remain.
test_row = pd.DataFrame(
    data=[[22, 59000, 'RENT', 123.0, 'PERSONAL', 'D', 35000, 16.02, 0.59, 'Y', 3]],
    columns=df.columns,
).drop(columns=['loan_grade'])
test_row

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,35000,16.02,0.59,Y,3


In [21]:
# Display the column labels of df (the order used when building test_row).
df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')

In [22]:
# Encode the single test row with the encoder that was already fitted (transform
# only, no refit) and append the indicator columns to its remaining columns.
cat_vars = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']
oh2 = oh.transform(test_row[cat_vars])

test_row = test_row.drop(cat_vars, axis=1)

# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2. Build the same
# "x<i>_<category>" labels it used to return from the fitted categories, so the
# columns are named and ordered identically on every scikit-learn version.
onehot_columns = [
    f"x{position}_{category}"
    for position, categories in enumerate(oh.categories_)
    for category in categories
]

# Give the indicator frame test_row's index so the join lines up row-for-row.
test_row_onehot = test_row.join(
    pd.DataFrame(oh2.toarray(), columns=onehot_columns, index=test_row.index)
)
test_row_onehot

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,x0_MORTGAGE,x0_OTHER,x0_OWN,x0_RENT,x1_DEBTCONSOLIDATION,x1_EDUCATION,x1_HOMEIMPROVEMENT,x1_MEDICAL,x1_PERSONAL,x1_VENTURE,x2_N,x2_Y
0,22,59000,123.0,35000,16.02,0.59,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [23]:
# Display the encoded test row again (the same frame the previous cell ends with).
test_row_onehot

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,x0_MORTGAGE,x0_OTHER,x0_OWN,x0_RENT,x1_DEBTCONSOLIDATION,x1_EDUCATION,x1_HOMEIMPROVEMENT,x1_MEDICAL,x1_PERSONAL,x1_VENTURE,x2_N,x2_Y
0,22,59000,123.0,35000,16.02,0.59,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [24]:
# Predict the loan grade for the single encoded row; the result is a one-element array.
rfc.predict(test_row_onehot)

array(['D'], dtype=object)