In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

#Save this for when we need to submit data back to kaggle
test_df_ids = test_df.loc[0:,"PassengerId"]
#drop data
train_df = train_df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis = 1)
test_df = test_df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis = 1)

#Add random seed so we get reproducible results.
seed = 7
np.random.seed(seed)
train_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
train_df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
train_df.count(), test_df.count()

(Survived    891
 Pclass      891
 Sex         891
 Age         714
 SibSp       891
 Parch       891
 Fare        891
 Embarked    889
 dtype: int64, Pclass      418
 Sex         418
 Age         332
 SibSp       418
 Parch       418
 Fare        417
 Embarked    418
 dtype: int64)

fill the missing data with the mean value

In [18]:
missing_values = ["Age", "Fare"]
for i in missing_values:
    mean = train_df.loc[:, i].mean()
    train_df.loc[:, i].fillna(mean, inplace =True)
    test_df.loc[:, i].fillna(mean, inplace =True)
#Non int case for embarked, only 2 values missing here so we drop them
train_df = train_df.dropna(axis = 'index')

In [19]:
train_df["Age"].head(n=6)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
Name: Age, dtype: float64

In [20]:
train_df.describe()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,29.653446,0.524184,0.382452,32.096681,0.350956,0.649044,0.24072,0.206974,0.552306,0.188976,0.086614,0.724409
std,0.48626,12.968366,1.103705,0.806761,49.697504,0.477538,0.477538,0.427761,0.405365,0.497536,0.39171,0.281427,0.447063
min,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,22.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,29.699118,0.0,0.0,14.4542,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,1.0,35.0,1.0,0.0,31.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
train_df = pd.get_dummies(train_df, columns = ["Sex", "Pclass","Embarked"])
test_df = pd.get_dummies(test_df, columns = ["Sex", "Pclass","Embarked"])
train_df.head(n=5)

ValueError: labels ['Sex' 'Pclass' 'Embarked'] not contained in axis

In [22]:
train_df.count()

Survived      889
Age           889
SibSp         889
Parch         889
Fare          889
Sex_female    889
Sex_male      889
Pclass_1      889
Pclass_2      889
Pclass_3      889
Embarked_C    889
Embarked_Q    889
Embarked_S    889
dtype: int64

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

train, validate = train_test_split(train_df, test_size = 0.25, random_state = seed)
x_train = train.iloc[:,1:]
y_train = train.iloc[:,0]
x_val = validate.iloc[:,1:]
y_val = validate.iloc[:,0]
x_full = train_df.iloc[:,1:]
y_full = train_df.iloc[:,0]

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(test_df)
x_full = scaler.transform(x_full)

Time to make our ANN.
For the input layer we use the number of features, the hidden layer will be the mean of the input layer and the output layer.
For starters we will only apply one hidden layer as multiple layers increase training difficulty significantly and I fear we do not have enough data for that. 

Source: https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw

In [24]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline


#Lets make a function that changes our probabilitys into 0 or 1
def classifyBinary(predictions, requiredprobability):
    for i in range(len(predictions)):
        if predictions[i] > requiredprobability:
            predictions[i] = 1
        else:
            predictions[i]= 0
    return predictions


def test_accuracy(model):
    y_pred = model.predict(x_val)
    y_pred = classifyBinary(y_pred, 0.5)
    cm = confusion_matrix(y_pred, y_val)
    print(cm)
    print(1-(cm[0,1]+cm[1,0])/(cm[0,0]+cm[1,1]))
    
def Create_model():
    model = Sequential()
    #Create input layer and first hidden layer
    model.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 12))

    #Create output layer
    model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

    #Compile and fit the ANN
    model.compile(optimizer = 'adam', loss = "binary_crossentropy", metrics = ['accuracy'])
    #model.fit(x_train, y_train, epochs = 1000, batch_size = 666)
    return model

def create_model(Epochs, Batch_size):
    model = Sequential()
    #Create input layer and first hidden layer
    model.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 12))
    #Create output layer
    model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

    #Compile and fit the ANN
    model.compile(optimizer = 'adam', loss = "binary_crossentropy", metrics = ['accuracy'])
    model.fit(x_train, y_train, epochs = Epochs, batch_size = Batch_size, verbose = 0)
    return model

def validate_model(Epochs, Batch_size):
    classifier = KerasClassifier(build_fn=Create_model, epochs = Epochs, batch_size = Batch_size, verbose = 0)
    kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
    results = cross_val_score(classifier, x_train, y_train, cv=kfold)
    print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
def create_validate_model(Epochs, Batch_size):
    validate_model(Epochs, Batch_size)
    return create_model(Epochs, Batch_size)

In [11]:
model = create_validate_model(500, 50)
test_accuracy(model) #Test on test set

Results: 81.38% (1.70%)
[[117  27]
 [ 17  62]]
0.754189944134


Test set and cross-validation results differ significantly.

In [47]:
def predict (Model, x):
    predictions = Model.predict(x)
    for i in range(len(predictions)):
        if predictions[i] > 0.5:
            predictions[i] = 1
        else:
            predictions[i] = 0
    predictions = np.squeeze(np.asarray(predictions.astype(int)))
    return predictions

In [15]:
data_to_submit = pd.DataFrame({
    'PassengerId':test_df_ids,
    'Survived':predict(model, x_test)
})
data_to_submit.to_csv("./results/ANN_results.csv", index=False)


NameError: name 'predict' is not defined

Testing this out gives us a 77,9% accuracy on the kaggle test set, not bad for a first run.
Lets tune the network and see what happens.

In [18]:
measures = 1
result= []
Sum = 0
for i in range(measures):
    import time
    start = time.time()
    test = create_model(500, 50)
    finish = time.time()
    result = finish-start
    Sum = Sum+result
    

mean = Sum/measures
print('Total time elapsed: {time}' .format(time = Sum))
print('Average time elapsed per measure: {time}'.format(time = mean))

Total time elapsed: 20.36952233314514
Average time elapsed per measure: 20.36952233314514


Later I realized some improvements could possibly still be made by training on the full dataset, hence I trained the model once more but this this on the full dataset given.

In [28]:
def create_model(Epochs, Batch_size):
    model = Sequential()
    #Create input layer and first hidden layer
    model.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 12))
    #Create output layer
    model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

    #Compile and fit the ANN
    model.compile(optimizer = 'adam', loss = "binary_crossentropy", metrics = ['accuracy'])
    model.fit(x_full, y_full, epochs = Epochs, batch_size = Batch_size, verbose = 0)
    return model

def validate_model(Epochs, Batch_size):
    classifier = KerasClassifier(build_fn=Create_model, epochs = Epochs, batch_size = Batch_size, verbose = 0)
    kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
    results = cross_val_score(classifier, x_full, y_full, cv=kfold)
    print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    
def create_validate_model(Epochs, Batch_size):
    validate_model(Epochs, Batch_size)
    return create_model(Epochs, Batch_size)

In [31]:
fullmodel = create_validate_model(500,50)

Results: 82.34% (0.89%)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [44]:
def predict (Model, x):
    predictions = Model.predict(x)
    for i in range(len(predictions)):
        if predictions[i] > 0.5:
            predictions[i] = 1
        else:
            predictions[i] = 0
    predictions = np.squeeze(np.asarray(predictions.astype(int)))
    return predictions

In [48]:
data_to_submit = pd.DataFrame({
    'PassengerId':test_df_ids,
    'Survived':predict(fullmodel, x_test)
})
data_to_submit.to_csv("./results/full_ANN_results.csv", index=False)


Although cross validation improved the score on the kaggle test set is still the same: 77,9%