In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import pickle
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score, accuracy_score

from tqdm.auto import tqdm

# Preparing data 

We'll talk about this dataset in more details in week 6. But for now, use the following code to get started

In [2]:
df = pd.read_csv('../../data/CreditScoring.csv')
df.columns = df.columns.str.lower()

Some of the features are encoded as numbers. Use the following code to de-code them:

In [3]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}
df.status = df.status.map(status_values)

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}
df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}
df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}
df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}
df.job = df.job.map(job_values)

In [4]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=0)

Remove clients with unknown default status

In [5]:
df = df[df.status != 'unk'].reset_index(drop=True)

In [6]:
df['default'] = (df.status == 'default').astype(int)
del df['status']

In [7]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.default.values
y_val = df_val.default.values
y_test = df_test.default.values

del df_train['default']
del df_val['default']
del df_test['default']

In [8]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
numeric_columns

Index(['seniority', 'time', 'age', 'expenses', 'income', 'assets', 'debt',
       'amount', 'price', 'default'],
      dtype='object')

In [9]:
df[numeric_columns].head()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price,default
0,9,60,30,73,129,0,0,800,846,0
1,17,60,58,48,131,0,0,1000,1658,0
2,10,36,46,90,200,3000,0,2000,2985,1
3,0,60,24,63,182,2500,0,900,1325,0
4,0,36,26,46,107,0,0,310,910,0


In [10]:
for c in numeric_columns:
    print(c,"    \t",roc_auc_score(df['default'],df[c]).round(3))

seniority     	 0.303
time     	 0.554
age     	 0.442
expenses     	 0.502
income     	 0.333
assets     	 0.367
debt     	 0.493
amount     	 0.595
price     	 0.5
default     	 1.0


In [11]:
cols = ['seniority', 'income', 'assets', 'records', 'job', 'home']

In [12]:
def train(df_train, y_train, C=1.0):
    dicts = df_train[cols].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model

In [13]:
def predict(df, dv, model):
    dicts = df[cols].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

# Saving model to pickle

In [14]:
C=1
dv, model = train(df_full_train, df_full_train.default, C)

In [15]:
y_pred = predict(df_test, dv, model)

In [16]:
auc = roc_auc_score(y_test, y_pred)
auc

0.775937855707055

In [17]:
output_file = f'model_C={C}.pkl'
output_file

'model_C=1.pkl'

In [18]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv,model),f_out)

# Loading model

In [19]:
import pickle

In [20]:
input_file = f'model_C=1.pkl'

In [21]:
with open(input_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [22]:
model, dv

(LogisticRegression(C=1, max_iter=1000, solver='liblinear'),
 DictVectorizer(sparse=False))

In [23]:
df_test

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,parents,24,19,single,no,fixed,35,28,0,0,400,600
1,14,private,12,49,married,no,fixed,90,140,3000,500,400,1432
2,5,other,48,22,single,no,fixed,35,82,0,0,480,1910
3,2,parents,48,20,single,no,fixed,35,318,0,0,740,1325
4,0,private,36,60,married,no,fixed,60,140,3000,0,1000,2050
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,8,owner,36,48,married,yes,fixed,35,121,3000,0,400,925
887,0,parents,36,22,single,no,partime,35,81,2500,0,400,775
888,10,owner,36,50,married,no,freelance,75,0,7500,0,1000,1230
889,11,parents,60,30,married,yes,fixed,45,105,5000,0,1500,2500


In [57]:
sample_data = {"seniority":14,"home":"parents","time":24,"age":19,"marital":"single","records":"no","job":"fixed","expenses":35,"income":28,"assets":0,"debt":0,"amount":400,"price":600}
# sample_data = df_test.iloc[:2].to_dict(orient='records')

In [58]:
X = dv.transform(sample_data)
X

array([[ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0., 28.,  1.,  0.,  0.,  0.,
         0.,  1.,  0., 14.]])

In [59]:
model.predict_proba(X)

array([[0.85240024, 0.14759976]])