In [1]:
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline, make_union
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
credit_data = pd.read_csv('german_credit.csv')

In k-fold cross-validation, the original sample is randomly partitioned into k equal sized subsamples. Of the k subsamples, a single subsample is retained as the validation data for testing the model, and the remaining k − 1 subsamples are used as training data. The cross-validation process is then repeated k times, with each of the k subsamples used exactly once as the validation data. The k results can then be averaged to produce a single estimation. The advantage of this method over repeated random sub-sampling (see below) is that all observations are used for both training and validation, and each observation is used for validation exactly once. 10-fold cross-validation is commonly used, but in general k remains an unfixed parameter.

Source: https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation

In [4]:
lb_make = LabelEncoder()

credit_data2 = credit_data.copy()
credit_data2['AgeCat'] = pd.cut(credit_data2['Age (years)'], 4)
credit_data2['AgeCat'] = lb_make.fit_transform(credit_data2["AgeCat"].astype(str))

credit_data2['CredAmtCat'] = pd.cut(credit_data2['Credit Amount'],3)
credit_data2['CredAmtCat'] = lb_make.fit_transform(credit_data2["CredAmtCat"].astype(str))

credit_data2['CredDurCat'] = pd.cut(credit_data2['Duration of Credit (month)'],4)
credit_data2['CredDurCat'] = lb_make.fit_transform(credit_data2["CredDurCat"].astype(str))

In [5]:
credit_data2 = credit_data2.drop(["Age (years)", "Credit Amount", "Duration of Credit (month)"], axis=1)
credit_data2.describe()


Unnamed: 0,Creditability,Account Balance,Payment Status of Previous Credit,Purpose,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,Duration in Current address,...,Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,AgeCat,CredAmtCat,CredDurCat
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.7,2.577,2.545,2.828,2.105,3.384,2.973,2.682,1.145,2.845,...,2.675,1.928,1.407,2.904,1.155,1.404,1.037,0.673,1.097,0.772
std,0.458487,1.257638,1.08312,2.744439,1.580023,1.208306,1.118715,0.70808,0.477706,1.103718,...,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856,0.818988,0.354566,0.637513
min,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,1.0,2.0,...,3.0,2.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,0.0
50%,1.0,2.0,2.0,2.0,1.0,3.0,3.0,3.0,1.0,3.0,...,3.0,2.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
75%,1.0,4.0,4.0,3.0,3.0,5.0,4.0,3.0,1.0,4.0,...,3.0,2.0,2.0,3.0,1.0,2.0,1.0,1.0,1.0,1.0
max,1.0,4.0,4.0,10.0,5.0,5.0,4.0,4.0,3.0,4.0,...,3.0,3.0,4.0,4.0,2.0,2.0,2.0,3.0,2.0,3.0


In [8]:
X = credit_data2.loc[:, credit_data.columns != 'Creditability']
y = credit_data2["Creditability"]
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=1/3., random_state=42)

In [9]:

## function to select the columns

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)


In [10]:

pipeline = Pipeline(steps = [
        ("features", make_union(
                ColumnSelector(list(X)),
                )),
                ("model",RandomForestClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)

pipeline.score(X_validation, y_validation)

print("RF Score before CV: %s" % pipeline.score(X_validation, y_validation))

# get list of hyperparameters

hyperparameters = { 'model__max_depth': [50, 70,90],
                    'model__min_samples_leaf': [1,2,3]
                  }

clf = GridSearchCV(pipeline, hyperparameters, cv=10)

clf.fit(X_train, y_train)

print("RF Score after CV: %s" % clf.score(X_validation, y_validation))

RF Score before CV: 0.739520958084
RF Score after CV: 0.793413173653
