## Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score

from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


## Import Data

In [2]:
crx = pd.read_csv('resources/crx.data', header=None)

# add probable column names
crx.columns = [
    'sex', 'age', 'debt', 'married', 'bank_customer', 'education', 'ethnicity', 
    'year_employed', 'prior_default', 'employed', 'credit_score',
    'drivers_license', 'citizen', 'zip_code', 'income', 'approved',
]

# remove unhelpful features
crx = crx.drop(['zip_code'], axis=1)

# extract label
lab = (crx['approved'] == '+').to_numpy()
crx = crx.drop(['approved'], axis=1)

## Cleaning

### Impute Missing Values

In [9]:
# missing values saved as '?'
crx = crx.replace(['?'],np.NaN)
# fix column datatypes
crx['age'] = crx['age'].astype('float')
# keep track of which columns are missing data
for name, values in crx.iteritems():
    is_na = values.isna()
    if any(is_na):
        crx[f'{name}_is_na'] = is_na
# replace missing numeric values with column means
crx.fillna(crx.mean(), inplace=True)
# replace missing string values with column modes
for name, values in crx.iteritems():
    if values.dtype == 'object':
        crx[name] = values.fillna(values.value_counts().index[0])

### Encode Non-Numeric Features

In [11]:
# split numeric and non-numeric features
numeric = crx.select_dtypes(exclude='object')
string = crx.select_dtypes(include='object')
enc = OneHotEncoder()
string_enc = enc.fit_transform(string)
crx_enc = np.hstack((numeric.to_numpy(), string_enc.todense()))

### Normalise Inputs

In [13]:
scal = MinMaxScaler()
crx_enc = scal.fit_transform(crx_enc)

## Build Model

In [26]:
def build_model():
    model = Sequential([
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

## Train Model

### Fit Model

In [27]:
nn = KerasClassifier(build_fn=build_model, epochs=10, batch_size=32, verbose=0)
kfold = KFold(n_splits=30)

## Evaluate Performance

In [33]:
results = cross_val_score(nn, crx_enc, lab, cv=kfold)
acc = results.mean()
print(f"Model accuracy is {round(acc * 100, 2)}%")

Model accuracy is 84.78%
