# Give Me Some Credit: Neural Networks

## Loading

In [32]:
import pandas as pd
import os

In [33]:
data_folder = "data/"

In [34]:
test = pd.read_csv(os.path.join(data_folder, "cs-test.csv"))
train = pd.read_csv(os.path.join(data_folder, "cs-training.csv"))
sample = pd.read_csv(os.path.join(data_folder, "sampleEntry.csv"))

## Preprocessing

### Remove NaN

In [36]:
print("TRAIN -- Total NaN values:\n{}".format(train.isna().sum()))
train_clean = train.dropna()
print("Total NaN values:\n{}".format(train_clean.isna().sum()))

TRAIN -- Total NaN values:
Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64
Total NaN values:
Unnamed: 0                              0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEs

### Splitting inputs and ouput

In [113]:
x = train_clean[ list(train_clean.columns)[2:]]
y = train_clean[ list(train_clean.columns)[1]]

### Splitting Train Test sets

In [38]:
import numpy as np
msk = np.random.rand(len(x)) < 0.8

x_train = x[msk]
x_test = x[~msk]

y_train = y[msk]
y_test = y[~msk]

### Normalizing inputs

In [39]:
mean = x_train.mean(axis=0)
x_train -= mean
std = x_train.std(axis=0)
x_train /= std

x_test -= mean
x_test /= std

## Building the model

In [47]:
from keras import models
from keras import layers

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu',input_shape=(x_train.shape[1],)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
    return model

### Cross validation

In [49]:
import numpy as np

k = 10
num_val_samples = len(x_train) // k
num_epochs = 10
all_scores = []

for i in range(k):
    print('processing fold #', i)
    
    # prepare validation
    val_data = x_train[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = y_train[i * num_val_samples: (i+1) * num_val_samples]
    
    # prepare training
    partial_train_data = np.concatenate(
        [x_train[:i * num_val_samples],
        x_train[(i+1) * num_val_samples:]],
    axis=0)
    partial_train_targets = np.concatenate(
    [y_train[:i * num_val_samples],
    y_train[(i+1) * num_val_samples:]],
    axis=0)
    
    # build model
    model = build_model()
    
    # train model
    model.fit(partial_train_data, partial_train_targets,
             epochs = num_epochs, batch_size=1024, verbose=0)
    
    # evaluate model
    val_bin, val_acc = model.evaluate(x_test,y_test, verbose=0)
    all_scores.append(val_acc)

processing fold # 0
processing fold # 1
processing fold # 2
processing fold # 3
processing fold # 4
processing fold # 5
processing fold # 6
processing fold # 7
processing fold # 8
processing fold # 9


In [50]:
print('Average accuracy: {}'.format(np.mean(all_scores)))

Average accuracy: 0.9285933482426458


# Personnalization

### Exploration

In [53]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
Unnamed: 0                              150000 non-null int64
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(8)
memory usage: 13.7 MB


In [64]:
train.describe()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,43301.414527,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


### Data Engineering

In [94]:
data = train

In [95]:
def fill_na(data, *args):
    for i, arg in enumerate(args):
        data[arg] = data[arg].fillna(data[arg].mean())
    return data

In [96]:
def pipeline(data):
    
    data = fill_na(data,'MonthlyIncome','NumberOfDependents')
    
    return data

data = pipeline(data)

### NN Preparation

In [120]:
x = data[data.columns[2:]]
y = data[data.columns[1]]

In [128]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [129]:
from sklearn.preprocessing import normalize

x_train = normalize(X_train)
x_test = normalize(X_test)

### NN building

In [143]:
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu',input_shape=(x_train.shape[1],)))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
    return model

import numpy as np

k = 10
num_val_samples = len(x_train) // k
num_epochs = 10
all_scores = []

for i in range(k):
    print('processing fold #', i)
    
    # prepare validation
    val_data = x_train[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = y_train[i * num_val_samples: (i+1) * num_val_samples]
    
    # prepare training
    partial_train_data = np.concatenate(
        [x_train[:i * num_val_samples],
        x_train[(i+1) * num_val_samples:]],
    axis=0)
    partial_train_targets = np.concatenate(
    [y_train[:i * num_val_samples],
    y_train[(i+1) * num_val_samples:]],
    axis=0)
    
    # build model
    model = build_model()
    
    # train model
    model.fit(partial_train_data, partial_train_targets,
             epochs = num_epochs, batch_size=1024, verbose=0)
    
    # evaluate model
    val_bin, val_acc = model.evaluate(x_test,y_test, verbose=0)
    all_scores.append(val_acc)
    
print("Average accuracy : {}".format(np.mean(all_scores)))

processing fold # 0
processing fold # 1
processing fold # 2
processing fold # 3
processing fold # 4
processing fold # 5
processing fold # 6
processing fold # 7
processing fold # 8
processing fold # 9
Average accuracy : 0.9341434343530676


In [144]:
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu',input_shape=(x_train.shape[1],)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
    return model

import numpy as np

k = 10
num_val_samples = len(x_train) // k
num_epochs = 10
all_scores = []

for i in range(k):
    print('processing fold #', i)
    
    # prepare validation
    val_data = x_train[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = y_train[i * num_val_samples: (i+1) * num_val_samples]
    
    # prepare training
    partial_train_data = np.concatenate(
        [x_train[:i * num_val_samples],
        x_train[(i+1) * num_val_samples:]],
    axis=0)
    partial_train_targets = np.concatenate(
    [y_train[:i * num_val_samples],
    y_train[(i+1) * num_val_samples:]],
    axis=0)
    
    # build model
    model = build_model()
    
    # train model
    model.fit(partial_train_data, partial_train_targets,
             epochs = num_epochs, batch_size=1024, verbose=0)
    
    # evaluate model
    val_bin, val_acc = model.evaluate(x_test,y_test, verbose=0)
    all_scores.append(val_acc)
    
print("Average accuracy : {}".format(np.mean(all_scores)))

processing fold # 0
processing fold # 1
processing fold # 2
processing fold # 3
processing fold # 4
processing fold # 5
processing fold # 6
processing fold # 7
processing fold # 8
processing fold # 9
Average accuracy : 0.9342242424338755


In [140]:
from keras import models
from keras import layers

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu',input_shape=(x_train.shape[1],)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
    return model

In [141]:
import numpy as np

k = 10
num_val_samples = len(x_train) // k
num_epochs = 10
all_scores = []

for i in range(k):
    print('processing fold #', i)
    
    # prepare validation
    val_data = x_train[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = y_train[i * num_val_samples: (i+1) * num_val_samples]
    
    # prepare training
    partial_train_data = np.concatenate(
        [x_train[:i * num_val_samples],
        x_train[(i+1) * num_val_samples:]],
    axis=0)
    partial_train_targets = np.concatenate(
    [y_train[:i * num_val_samples],
    y_train[(i+1) * num_val_samples:]],
    axis=0)
    
    # build model
    model = build_model()
    
    # train model
    model.fit(partial_train_data, partial_train_targets,
             epochs = num_epochs, batch_size=1024, verbose=0)
    
    # evaluate model
    val_bin, val_acc = model.evaluate(x_test,y_test, verbose=0)
    all_scores.append(val_acc)

processing fold # 0
processing fold # 1
processing fold # 2
processing fold # 3
processing fold # 4
processing fold # 5
processing fold # 6
processing fold # 7
processing fold # 8
processing fold # 9


In [142]:
print("Average accuracy : {}".format(np.mean(all_scores)))

Average accuracy : 0.9327797979894312


In [147]:
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='tanh',input_shape=(x_train.shape[1],)))
    model.add(layers.Dense(64, activation='tanh'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
    return model

import numpy as np

k = 10
num_val_samples = len(x_train) // k
num_epochs = 10
all_scores = []

for i in range(k):
    print('processing fold #', i)
    
    # prepare validation
    val_data = x_train[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = y_train[i * num_val_samples: (i+1) * num_val_samples]
    
    # prepare training
    partial_train_data = np.concatenate(
        [x_train[:i * num_val_samples],
        x_train[(i+1) * num_val_samples:]],
    axis=0)
    partial_train_targets = np.concatenate(
    [y_train[:i * num_val_samples],
    y_train[(i+1) * num_val_samples:]],
    axis=0)
    
    # build model
    model = build_model()
    
    # train model
    model.fit(partial_train_data, partial_train_targets,
             epochs = num_epochs, batch_size=1024, verbose=0)
    
    # evaluate model
    val_bin, val_acc = model.evaluate(x_test,y_test, verbose=0)
    all_scores.append(val_acc)
    
print("Average accuracy : {}".format(np.mean(all_scores)))

processing fold # 0
processing fold # 1
processing fold # 2
processing fold # 3
processing fold # 4
processing fold # 5
processing fold # 6
processing fold # 7
processing fold # 8
processing fold # 9
Average accuracy : 0.9339050505146836
