In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from colorsetup import colors, palette

In [2]:
gcr_data = pd.read_csv("german_credit_data.csv", index_col=0)

In [3]:
feature_cols = gcr_data.columns[:-1]
X = gcr_data[feature_cols]
y = gcr_data['Risk']

In [4]:
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=0)

In [5]:
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
993,30,male,3,own,little,little,3959,36,furniture/equipment
859,26,male,2,rent,moderate,,3577,9,car
298,43,male,2,own,little,,2515,18,furniture/equipment
553,27,male,2,own,moderate,moderate,1995,12,car
672,42,male,3,own,little,,10366,60,car


In [6]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [10]:
cat_cols = X.columns[X.dtypes == 'O']
num_cols = X.columns[X.dtypes == 'int64']

In [11]:
cat_cols

Index(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'], dtype='object')

In [12]:
num_cols

Index(['Age', 'Job', 'Credit amount', 'Duration'], dtype='object')

In [30]:
categories = [
    X[column].unique() for column in X[cat_cols]]
categories

[array(['male', 'female'], dtype=object),
 array(['own', 'rent', 'free'], dtype=object),
 array(['little', 'moderate', 'rich', 'quite rich', nan], dtype=object),
 array(['little', nan, 'moderate', 'rich'], dtype=object),
 array(['furniture/equipment', 'car', 'radio/TV', 'vacation/others',
        'education', 'business', 'domestic appliances', 'repairs'],
       dtype=object)]

In [29]:
for col in cat_cols:
    print('{}: {} {}'.format(col, '\n', X[col].unique()))

Sex: 
 ['male' 'female']
Housing: 
 ['own' 'rent' 'free']
Saving accounts: 
 ['little' 'moderate' 'rich' 'quite rich' nan]
Checking account: 
 ['little' nan 'moderate' 'rich']
Purpose: 
 ['furniture/equipment' 'car' 'radio/TV' 'vacation/others' 'education'
 'business' 'domestic appliances' 'repairs']


In [20]:
for cat in categories:
    cat[cat == np.nan] = 'missing'

In [39]:
cat_proc_nlin = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant',
                  fill_value='missing'),
    OrdinalEncoder(categories=categories)
    )

In [42]:
num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))

In [40]:
cat_proc_lin = make_pipeline(
    SimpleImputer(missing_values=np.nan,
                  strategy='constant',
                  fill_value='missing'),
    OneHotEncoder(categories=categories)
)

In [41]:
num_proc_lin = make_pipeline(
    StandardScaler()
)

In [43]:
# transformation to use for non-linear estimators
processor_nlin = make_column_transformer(
    (cat_proc_nlin, cat_cols),
    (num_proc_nlin, num_cols),
    remainder='passthrough')

In [44]:
# transformation to use for linear estimators
processor_lin = make_column_transformer(
    (cat_proc_lin, cat_cols),
    (num_proc_lin, num_cols),
    remainder='passthrough')

In [57]:
from sklearn.model_selection import StratifiedShuffleSplit

# Split the data into two parts with 1500 points in the test data
# This creates a generator
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=250, random_state=42)

# Get the index values from the generator
train_idx, test_idx = next(strat_shuff_split.split(X, y))

# Create the data sets
X_train = gcr_data.loc[train_idx, feature_cols]
y_train = gcr_data.loc[train_idx, 'Risk']

X_test = gcr_data.loc[test_idx, feature_cols]
y_test = gcr_data.loc[test_idx, 'Risk']

In [53]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

In [54]:
rf_pipeline = make_pipeline(processor_nlin,
                            RandomForestClassifier(random_state=42))

In [55]:
gradient_pipeline = make_pipeline(
    processor_nlin,
    HistGradientBoostingClassifier(random_state=0))

In [58]:
# Suppress warnings about too few trees from the early models
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [59]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the random forest estimator
# Note that the number of trees is not setup here
RF = RandomForestClassifier(oob_score=True, 
                            random_state=42, 
                            warm_start=True,
                            n_jobs=-1)

oob_list = list()

# Iterate through all of the possibilities for 
# number of trees
for n_trees in [15, 20, 30, 40, 50, 100, 150, 200, 300, 400]:
    
    # Use this to set the number of trees
    RF.set_params(n_estimators=n_trees)

    # Fit the model
    RF.fit(X_train, y_train)

    # Get the oob error
    oob_error = 1 - RF.oob_score_
    
    # Store it
    oob_list.append(pd.Series({'n_trees': n_trees, 'oob': oob_error}))

rf_oob_df = pd.concat(oob_list, axis=1).T.set_index('n_trees')

rf_oob_df

ValueError: could not convert string to float: 'female'