In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Titanic survivor prediction model
Plan:
1. Data cleanup and EDA: 
    * Identify missing data: (NaN)
    * Check size of data
    * Check distribution of features

2. Feature engineering

3. Pre-process the data
    * Normalize and scale data
    * Create dummy variables

4. Model selection (Random Forest, ANN, XGBoost) 

5. Performance metrics (Accuracy, Precision, Recall, and F1 score with sklearn)

6. Hyperparameter tuning

7. Upload model with best score (average parameters of 10 runs)


# 1A. Exploratory Data Analysis
Load training data and explore:
* Size of the data set
* Types of data
* Incomplete or missing data
* Distribution

In [None]:
# Step 1 (pre-processing): read the training CSV and preview its first rows.
train_csv_path = '/kaggle/input/titanic/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()


**Size of data and check missing data.**

In [None]:
# Summarise the training frame (row count, column dtypes, non-null counts)
# so that columns with missing entries stand out.
df_train.info()

**Number of survivors vs total.**

In [None]:
# Number of training rows per value of the target column 'Survived'.
ax = sns.countplot(x='Survived', data=df_train)
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title("Passenger count by 'Survived'");

**Male vs Female passengers**

In [None]:
# Number of training rows per value of the 'Sex' column.
ax = sns.countplot(x='Sex', data=df_train)
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title("Passenger count by 'Sex'");

**Age Distribution of passengers**

In [None]:
# Histogram of the 'Age' column in the training data.
ax = sns.histplot(data=df_train, x='Age')
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title("Distribution of 'Age' (training data)");

**Distribution of survivors vs total**

In [None]:
# Age histogram with bars stacked by the value of 'Survived'.
ax = sns.histplot(data=df_train, x='Age', hue='Survived', multiple='stack')
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title("Distribution of 'Age', stacked by 'Survived'");

Distribution of fare vs passenger class. Removed outliers for now to enhance visibility.

In [None]:
# Fare per passenger class, split by 'Survived'.
# showfliers=False hides the outlier points to keep the boxes readable.
ax = sns.boxplot(
    x='Pclass',
    y='Fare',
    data=df_train,
    hue='Survived',
    showfliers=False,
    palette="Set3",
)
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title("'Fare' by 'Pclass' and 'Survived' (outliers hidden)");

In [None]:

# Step 1 (pre-processing): read the test CSV and summarise its columns
# (dtypes and non-null counts).
test_csv_path = '/kaggle/input/titanic/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.info()



# 1B Data cleanup - check for missing data
Create heatmap to show missing entries

In [None]:
# Step 1 (pre-processing): visualise where the test set has NaN entries.
# Each cell of the heatmap is one (row, column) entry of df_test.isna().
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_test.isna(), cmap='viridis', ax=ax)
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title('Missing values in the test set');

In [None]:
# Step 1 (pre-processing): display the test rows whose 'Fare' is NaN.
fare_is_missing = df_test['Fare'].isna()
df_test.loc[fare_is_missing]


In [None]:
# Impute missing test-set fares with the mean fare of the passenger's own class.
# The class is looked up per row rather than hard-coded, so this keeps working
# if a different (or additional) row is missing its fare.

# Index labels of the rows whose 'Fare' is NaN.
idx = df_test.index[df_test['Fare'].isna()]
print('Index: ', idx)

# Mean fare of each row's 'Pclass', aligned with df_test (mean() skips NaN).
class_avg_fare = df_test.groupby('Pclass')['Fare'].transform('mean')
avg = class_avg_fare.loc[idx]
print('avg: ', avg)

# Fill only the missing entries; the assignment aligns on the index labels.
df_test.loc[idx, 'Fare'] = avg

# Check results: show the imputed rows by label instead of a fixed position.
df_test.loc[idx]

In [None]:
# Re-print the test-frame summary: the non-null count of the 'Fare' column
# should now equal the total number of rows.
df_test.info()

In [None]:
# Visual cross-check: redraw the NaN heatmap of the test set after imputing 'Fare'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_test.isna(), cmap='viridis', ax=ax)
# Title the figure so it stands alone; the trailing ';' hides the text repr.
ax.set_title("Missing values in the test set (after 'Fare' imputation)");

# 2. Feature Engineering
Passengers who traveled alone had a lower chance of surviving. Create a feature (`IsAlone`) that captures this.

In [None]:
# Feature engineering: 'IsAlone' flags passengers with no SibSp and no Parch.
# It is 1 when both 'SibSp' and 'Parch' are 0, otherwise 0.
# The same rule is applied to both frames so train and test share the feature;
# the boolean mask is cast with a vectorized astype instead of a per-row lambda.
for frame in (df_train, df_test):
    frame['IsAlone'] = ((frame['SibSp'] == 0) & (frame['Parch'] == 0)).astype(int)

# Survival counts for passengers travelling alone vs. with family (training data).
sns.histplot(data=df_train, x='IsAlone', hue='Survived', multiple='stack');


# 3. Pre-process data
* Create dummy variables
* Scale and normalize the data

In [None]:
# Build the model input matrices: select features, index by PassengerId and
# one-hot encode the categorical columns identically for train and test.

# Columns taken from the raw frames ('PassengerId' becomes the index).
feature_cols = ['Pclass', 'Sex', 'PassengerId', 'Fare', 'Parch', 'SibSp', 'Embarked', 'IsAlone']
# Categorical columns that are replaced by dummy variables.
categorical_cols = ['Sex', 'Embarked']


def _encode_features(frame):
    """Return the feature matrix for `frame` with categorical columns one-hot encoded.

    Selects `feature_cols`, sets 'PassengerId' as the index, replaces
    `categorical_cols` with dummy columns and drops 'Sex_male', which is
    redundant next to 'Sex_female'.
    """
    selected = frame[feature_cols].set_index('PassengerId')
    dummies = pd.get_dummies(selected[categorical_cols]).drop(columns=['Sex_male'])
    return pd.concat([selected.drop(columns=categorical_cols), dummies], axis=1)


X_train = _encode_features(df_train)
# Force the test matrix to have exactly the training columns in the same order:
# a category absent from the test set becomes an all-zero column, and a category
# unseen in training is dropped, so the scaler and models get matching inputs.
X_test = _encode_features(df_test).reindex(columns=X_train.columns, fill_value=0)
X_test.head()


In [None]:
# Target for training: the 'Survived' column, kept as a one-column DataFrame
# (later cells read it via y_train.values). There is no y_test in this
# competition.
y_train = df_train[['Survived']]
# info must be *called*; a bare `y_train.info` only echoes the bound method.
y_train.info()

In [None]:
# Min-max scale the features. The scaler is fitted on the training matrix only
# and then applied unchanged to both the training and the test matrix.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))


# 4. Model Selection
* ANN
* Random Forest
* XG Boost
* Logistic Regression
* SVM

Start with a simple ANN using Tensorflow's Sequential API.

In [None]:
# ANN: fully connected Keras network with a sigmoid output for binary classification.
# Widths of the ReLU hidden layers, in order from input to output.
hidden_units = (9, 25, 90, 180, 360, 90)

model = Sequential(
    [Dense(units, activation='relu') for units in hidden_units]
    # (a Dropout(0.2) layer before the output was tried and is currently disabled)
    + [Dense(units=1, activation='sigmoid')]
)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.fit(x=X_train_scaled, y=y_train.values, epochs=1000, verbose=0)

# Sigmoid outputs for the test set, one value per passenger.
predictions = model.predict(X_test_scaled)

Random Forest.

In [None]:
# Random Forest Classifier: 2000 trees of depth <= 5, fixed seed for repeatability.
model_RF = RandomForestClassifier(n_estimators=2000, max_depth=5, random_state=1)
# ravel() turns the one-column target into the 1-D label vector sklearn expects.
model_RF.fit(X_train_scaled, y_train.values.ravel())
# Predict with the forest itself. This previously called `model.predict`,
# which silently returned the ANN's outputs instead.
predictions_RF = model_RF.predict(X_test_scaled)


XG Boost (Beta)

In [None]:
# XGBoost classifier. It is stored under its own name (like `model_RF`) so that
# the trained ANN held in `model` is not overwritten by this cell.
model_XG = xgb.XGBClassifier(seed=0, nthread=1, n_estimators=2500, use_label_encoder=False, verbosity=0)
# Pass a 1-D label vector, consistent with the Random Forest cell.
model_XG.fit(X_train_scaled, y_train.values.ravel(), verbose=False)
# Predicted class labels for the (scaled) test set.
predictions_XG = model_XG.predict(X_test_scaled)

In [None]:
# Inspect the raw ANN outputs: line plot over the test rows plus their range.
pred = predictions
flat_outputs = pred.ravel()
pd.DataFrame(flat_outputs).plot()
print(pred.max(), pred.min())

In [None]:
# Histogram of the raw model outputs. The array is flattened first so seaborn
# treats it as one variable instead of wide-form data with a spurious legend.
ax = sns.histplot(pred.ravel())
# Label the figure so it stands alone; the trailing ';' hides the text repr.
ax.set(title='Distribution of model outputs on the test set', xlabel='Model output');

In [None]:
# Convert the model outputs to hard labels: values >= 0.5 become 1.0, all others 0.0.
np_pred = (pred.ravel() >= 0.5).astype(float)
print(np_pred)
sns.histplot(np_pred)



# 5 Performance Metrics
Kaggle does not provide the target values for the test data, so performance is evaluated when the results are uploaded.

# 6 Hyper Parameter Tuning
Based on performance metrics, model parameters can be adjusted in step 4. For example:
* change number of layers or neurons per layer of ANN
* add dropout layer
* regularization
* train for more/fewer epochs
* ...

# 7 Create output file for Kaggle upload

In [None]:
# Create the Kaggle submission file: one row per test passenger with the
# thresholded prediction cast to an integer label.
survived_labels = np_pred.ravel().astype(int)
passenger_ids = df_test['PassengerId'].values

output = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': survived_labels})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")