In [112]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers

#### Load the Data

In [113]:
# Read both Titanic CSV files from the shared data folder.
test_data = pd.read_csv('../Data/TitanicTest.csv')
train_data = pd.read_csv('../Data/TitanicTrain.csv')

# Bare last expression: renders a preview of the training frame.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [114]:
# (rows, columns) of the training frame.
train_data.shape

(891, 12)

In [115]:
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Identify Features

In [116]:
# Training side: `Survived` is the target; the inputs are every other column
# except the row identifier.
y = train_data['Survived']
X = train_data.drop(columns=['Survived', 'PassengerId'])

# Test side: keep the identifiers aside (they are needed for the submission
# file) and drop them from the model inputs.
X_test = test_data.drop(columns=['PassengerId'])
test_ids = test_data['PassengerId']

#### Separate Numerical and Categorical Features

In [117]:
# Columns that are scaled as continuous / count values.
features_num = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# Columns that are one-hot encoded as discrete categories.
features_cat = [
    'Pclass',
    'Sex',
    'Embarked',
]

#### Set Up the Preprocessor for Numerical and Categorical Features

In [118]:
# Numeric columns are standardised; categorical columns are one-hot encoded.
# `handle_unknown='ignore'` lets the encoder accept categories at transform
# time that were not present when it was fitted.
numeric_step = ('num', StandardScaler(), features_num)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), features_cat)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

#### Handle Missing Values

In [119]:
# Per-column medians of the labelled data; the same values are used to fill
# the test frame, so no statistic is ever derived from the test rows.
# NOTE(review): these medians are taken before the train/validation split
# further down, so validation rows contribute to the imputation values.
train_medians = X[features_num].median()

for frame in (X, X_test):
    # Numeric gaps -> training median of that column.
    frame[features_num] = frame[features_num].fillna(train_medians)
    # Categorical gaps -> an explicit 'Missing' category.
    frame[features_cat] = frame[features_cat].fillna('Missing')

#### Train-Validation Split

In [120]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the partition the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### Apply the Preprocessor

In [121]:
# Fit the scaler/encoder on the training split only, then reuse the fitted
# transformer on the validation and test rows.
# Note: this cell overwrites its own inputs, so re-running it requires
# re-running the cells that build X_train / X_valid / X_test first.
X_train = preprocessor.fit_transform(X_train)
X_valid, X_test = (preprocessor.transform(split) for split in (X_valid, X_test))

#### Create Model

In [122]:
# Seed the Python, NumPy and backend random generators so that the randomly
# initialised weights (and therefore the results) repeat after a kernel restart.
keras.utils.set_random_seed(0)

# Binary classifier: two hidden ReLU layers with dropout and a sigmoid output
# that yields a probability in [0, 1].
model = keras.Sequential([
    # Declare the input width with an explicit Input object; passing
    # `input_shape=` to the first layer makes Keras emit a UserWarning.
    keras.Input(shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])

  super().__init__(**kwargs)


#### Compile Model

In [123]:
# Adam optimiser with binary cross-entropy loss (matches the single sigmoid
# output); accuracy is reported alongside the loss during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#### Training the Model

In [124]:
# Train for 100 epochs in mini-batches of 32, scoring the held-out validation
# split after every epoch. The returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=100,
    verbose=1,
    validation_data=(X_valid, y_valid),
)


Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4576 - loss: 0.8779 - val_accuracy: 0.6480 - val_loss: 0.6797
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5582 - loss: 0.7070 - val_accuracy: 0.7207 - val_loss: 0.6328
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6495 - loss: 0.6394 - val_accuracy: 0.7654 - val_loss: 0.5918
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6684 - loss: 0.6515 - val_accuracy: 0.7933 - val_loss: 0.5573
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6720 - loss: 0.6227 - val_accuracy: 0.7933 - val_loss: 0.5271
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7173 - loss: 0.5677 - val_accuracy: 0.8156 - val_loss: 0.5003
Epoch 7/100
[1m23/23[0m [32m━━

#### Predict for Test Data

In [125]:
# The sigmoid output is one probability per test row; flatten the (n, 1)
# result to 1-D and threshold at 0.5 to obtain 0/1 labels.
probabilities = model.predict(X_test).flatten()
predictions = (probabilities > 0.5).astype(int)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


#### Output for Submission

In [126]:
# `predictions` already holds the thresholded 0/1 labels produced by the
# prediction cell above, so reuse it instead of running inference and the
# 0.5 threshold a second time.
output = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': predictions,
})
output

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [127]:
# Timestamp the file name so successive runs never overwrite an earlier
# submission.
timestr = time.strftime("%Y%m%d-%H%M%S")
submission_path = Path(f'../Submission/Titanicsubmission{timestr}.csv')

# DataFrame.to_csv does not create missing folders; make sure the target
# directory exists so a finished training run is not lost to an OSError.
submission_path.parent.mkdir(parents=True, exist_ok=True)

output.to_csv(submission_path, index=False)