In [309]:
## Libraries
# General purpose libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
# Machine learning libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout

In [310]:
# Read the three CSV inputs: training set, test set and the
# submission template that the final predictions are written into.
train, test, sample = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

In [311]:
#Full name -> last name only
def decompose_name(full_name):
    """Return the last whitespace-separated word of a name.

    The input is converted with ``str()`` first. When it contains fewer
    than two words, the converted string is returned unchanged (it is not
    stripped).
    """
    text = str(full_name)
    words = text.split()
    return words[-1] if len(words) > 1 else text

In [312]:
#G/3/S -> G 3 S
def treat_cabin(cabin):
    """Split a cabin string such as ``'G/3/S'`` into its three parts.

    Parameters
    ----------
    cabin : str, None or NaN
        Cabin identifier made of exactly three '/'-separated fields, the
        placeholder ``'Nan_Cabin'``, or a missing value.

    Returns
    -------
    numpy.ndarray
        Array of three strings. Missing cabins (``'Nan_Cabin'``, ``None``,
        ``pd.NA`` or a float NaN) yield ``['Nan_Cabin_1', '0', 'Nan_Cabin_3']``.

    Raises
    ------
    ValueError
        If the value is present but does not have exactly three fields.
    """
    missing_parts = np.array(['Nan_Cabin_1', '0', 'Nan_Cabin_3'])

    # A raw missing value would otherwise be stringified to 'nan'/'None' and
    # fail the 3-way split below with an opaque unpacking error.
    if cabin is None or cabin is pd.NA:
        return missing_parts
    if isinstance(cabin, float) and np.isnan(cabin):
        return missing_parts

    my_string = str(cabin)
    if my_string == 'Nan_Cabin':
        return missing_parts

    parts = my_string.split('/')
    if len(parts) != 3:
        raise ValueError(
            "Cabin %r does not have exactly three '/'-separated fields" % (my_string,)
        )
    return np.array(parts)

In [313]:
def _fit_fill_values(reference):
    """Compute the per-column NaN replacement values from ``reference``."""
    fill_values = {'CryoSleep': 0, 'VIP': 0}
    # Numerical columns: NaN replaced by the column mean
    for label in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        fill_values[label] = reference[label].mean()
    # Categorical columns: NaN replaced by the most frequent value
    for label in ['HomePlanet', 'Destination']:
        fill_values[label] = reference[label].value_counts().idxmax()
    # Cabin and Name: NaN replaced by a string marking the missing value
    fill_values['Cabin'] = 'Nan_Cabin'
    fill_values['Name'] = 'Nan_Name'
    return fill_values


def _impute_and_derive(raw_features, fill_values):
    """Fill NaNs and replace Cabin / Name by cabin_1..3 / Last_Name columns."""
    frame = raw_features.copy()
    for column, value in fill_values.items():
        frame[column] = frame[column].fillna(value)

    # Decompose the cabins into 3 different features
    cabin_parts = [treat_cabin(cabin) for cabin in frame['Cabin']]
    frame['cabin_1'] = [parts[0] for parts in cabin_parts]
    frame['cabin_2'] = [parts[1] for parts in cabin_parts]
    frame['cabin_3'] = [parts[2] for parts in cabin_parts]
    frame = frame.drop(['Cabin'], axis=1)

    # Keep the last name only. A plain list is assigned positionally, so the
    # result does not depend on the frame's index labels.
    frame['Last_Name'] = [decompose_name(name) for name in frame['Name']]
    frame = frame.drop(['Name'], axis=1)
    return frame


def data_cleaning(dataset):
    """Turn the raw train or test frame into model-ready features and labels.

    Reads the module-level ``train`` and ``test`` DataFrames. Everything that
    is learned from data is fitted once and shared by both splits, so the two
    returned feature frames are directly comparable:

    * NaN replacement values (means / modes) come from ``train`` only;
    * last-name integer codes are fitted on the names of both splits;
    * one-hot columns cover the categories seen in either split.

    Parameters
    ----------
    dataset : str
        ``'train'`` selects the training split; any other value selects the
        test split.

    Returns
    -------
    (pandas.DataFrame, pandas.Series or numpy.ndarray)
        The cleaned features and the labels. For the training split the labels
        are ``Transported`` cast to int; for the test split they are an array
        of zeros with one entry per row.
    """
    #Split into features and label
    is_train = str(dataset) == 'train'
    if is_train:
        print('Training Dataset')
        train_labels = train['Transported'].astype(int)
    else:
        print('Testing Dataset')
        train_labels = np.zeros(test.shape[0])

    # Both splits are cleaned with statistics fitted on the training data.
    fill_values = _fit_fill_values(train)
    train_clean = _impute_and_derive(
        train.drop(['Transported', 'PassengerId'], axis=1), fill_values
    )
    test_clean = _impute_and_derive(test.drop(['PassengerId'], axis=1), fill_values)
    train_features = train_clean if is_train else test_clean

    #From name to unique int. Fitting on the names of both splits gives every
    #last name the same code in train and test.
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([train_clean['Last_Name'], test_clean['Last_Name']]))
    train_features['Last_Name_encoded'] = le.transform(train_features['Last_Name'])
    train_features = train_features.drop(['Last_Name'], axis=1)

    #One-hot encoding over a fixed category list, so both splits end up with
    #the same columns in the same order.
    features_one_hot = ['HomePlanet', 'Destination', 'cabin_1', 'cabin_3']
    for feature in features_one_hot:
        categories = sorted(set(train_clean[feature]) | set(test_clean[feature]))
        one_hot = pd.get_dummies(train_features[feature]).reindex(
            columns=categories, fill_value=0
        )
        # Replace the original column by its encoded columns
        train_features = train_features.drop(feature, axis=1)
        train_features = train_features.join(one_hot)

    #replace boolean by integer
    features_boolean = ['CryoSleep', 'VIP']
    for feature in features_boolean:
        train_features[feature] = train_features[feature].astype(int)

    return train_features, train_labels

In [314]:
# Clean the training split: returns the feature frame and the integer labels.
train_features, train_labels = data_cleaning('train')

Training Dataset


In [315]:
# Clean the test split; test_labels is only a zero-filled placeholder array.
test_features, test_labels = data_cleaning('test')

Testing Dataset


In [350]:
#Model for binary classification
# Fix TensorFlow's global seed so random ops such as weight initialisation
# are repeatable from one run of the notebook to the next.
tf.random.set_seed(42)

# Normalize features that use different scales and ranges; the layer learns
# its statistics from the training features.
normalizer = tf.keras.layers.Normalization(axis = -1)
normalizer.adapt(np.asarray(train_features).astype('float32'))
train_input_shape = train_features.shape[1]

#Define the model
model = Sequential()
# Declare the input width on the model itself: the first layer is the
# normalizer, so an input_shape argument on a later Dense layer is not the
# model's input specification.
model.add(tf.keras.Input(shape=(train_input_shape,)))
model.add(normalizer)
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
# Single sigmoid unit: one value in [0, 1] per row
model.add(Dense(1, activation='sigmoid'))

In [351]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_15 (Normaliza  (None, 28)               57        
 tion)                                                           
                                                                 
 dense_63 (Dense)            (None, 50)                1450      
                                                                 
 dense_64 (Dense)            (None, 25)                1275      
                                                                 
 dropout_6 (Dropout)         (None, 25)                0         
                                                                 
 dense_65 (Dense)            (None, 10)                260       
                                                                 
 dense_66 (Dense)            (None, 1)                 11        
                                                     

In [352]:
# Configure training: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [353]:
# Train on the cleaned training features (cast to float32); Keras holds out
# 20% of the rows for validation.
print("Training model...")
history = model.fit(
    x=np.asarray(train_features).astype('float32'),
    y=train_labels,
    batch_size=16,
    epochs=7,
    validation_split=0.2,
)
print("Training completed!")

Training model...
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Training completed!


In [354]:
# Run the trained model on the cleaned test features (cast to float32 first).
y_test = model.predict(
    np.asarray(test_features).astype('float32')
)
# Keep the raw model outputs in a DataFrame as well.
y_test_df = pd.DataFrame(data=y_test)

In [356]:
# Binarize the model outputs: values below 0.5 become 0, everything else 1.
y_test_thresh = (~(y_test < 0.5)).astype(int)

In [357]:
# Wrap the thresholded 0/1 predictions in a DataFrame.
y_test_tresh_df = pd.DataFrame(y_test_thresh)

In [359]:
# Write the predictions into the submission template.
# The flattened array is assigned positionally: pandas raises if its length
# differs from the template's, whereas assigning a DataFrame aligns on the
# index and would leave NaN for unmatched rows (which astype(bool) silently
# turns into True).
sample['Transported'] = np.asarray(y_test_thresh).ravel().astype(bool)
sample.to_csv('prediction_titanic.csv', index=False)