# Table of Contents
1.  Load Data
1.  Feature Engineering
1.  Missing values
1.  Preprocess
1.  Processing
1.  Conclusion 
1.  Sources 

# 1. Load and check data
Load libraries and data

In [None]:
!pip install plotnine

In [None]:
import numpy as np
from numpy import nan, asarray
import pandas as pd
import seaborn as sns
import itertools
import re
import matplotlib.pyplot as plt
import plotly.express as px
from collections import Counter

import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split #this is just to split training 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

In [None]:
# Read the competition CSV files into the two working DataFrames.
train_csv_path = '../input/titanic/train.csv'
test_csv_path = '../input/titanic/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Keep an independent snapshot of the raw test set. The submission cell reads
# PassengerId from it, and that column is dropped from `test` later on.
# A plain `og_test = test` would only alias the same object, so columns added
# to `test` in place would also appear on the "original" frame.
og_test = test.copy()

# `train` is already a DataFrame (it comes from pd.read_csv), so no
# re-wrapping is needed before inspecting it.
train.info()

# 2. Feature Engineering:
Manipulate the data to reorganise the table.
Types of manipulation and why:
*  Name: the only useful substring in a name is the title (Mr., Miss, ...). The rest of the string is removed, otherwise the column would be too sparse to use.
*  SibSp and Parch: these two columns give the number of siblings and the number of parents/children travelling with the passenger. This data can be manipulated in different ways; for example, would siblings be more likely to survive if their parents or children were not there? In my case, I added the two columns together to show that people with smaller families are more likely to survive; ideally, having no family aboard would be optimal.
* I also dropped some columns, mainly because of the modifications made to the table.


In [None]:
# Adding title column.
# The title is the text between ", " and the first literal ". " in the name.
# The period must be escaped: an unescaped "." matches any character, which
# truncates multi-word titles (e.g. "the Countess" was captured as "th").
TITLE_PATTERN = re.compile(r', (.+?)\. ')


def extract_title(name):
    """Return the title embedded in a passenger name.

    Parameters
    ----------
    name : str
        Full passenger name containing ", <title>. ".

    Returns
    -------
    str
        The captured title, without the trailing period.

    Raises
    ------
    AttributeError
        If the name does not contain the expected ", <title>. " pattern.
    """
    return TITLE_PATTERN.search(name).group(1)


# Same extraction for both data sets, so they cannot drift apart.
train['Title'] = train['Name'].map(extract_title)
test['Title'] = test['Name'].map(extract_title)

In [None]:
# Narrow down uncommon titles: every title in `uncommon_titles` becomes
# 'rare', and the alternative spellings are folded into 'Miss' / 'Mrs'.
uncommon_titles = ['Dona', 'Lady', 'the Countess', 'Capt', 'Col', 'Don',
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

train['Title'] = train['Title'].replace(uncommon_titles, 'rare').replace(title_aliases)
test['Title'] = test['Title'].replace(uncommon_titles, 'rare').replace(title_aliases)


In [None]:
# Show which title labels are left in each data set.
print("Train titles: \n ",train['Title'].unique())
print("Test titles: \n", test['Title'].unique())

# Encode the titles as numbers for later processing; any label that is not
# a key of the mapping ends up as 0.
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Mrs": 3,
    "Master": 4,
    "rare": 5,
}

for frame in (train, test):
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)



In [None]:
# Count of passengers per encoded title, one panel per survival outcome.
# `factorplot` is the deprecated old name of `catplot`, and the plotted
# variable has to be passed by keyword (x=...) in current seaborn releases.
g = sns.catplot(x="Title", col="Survived", col_wrap=4,
                data=train[train.Survived.notnull()],
                kind="count", height=10.5, aspect=1);

In [None]:
# Scatter of fare against sex, coloured by the survival label.
scatter_options = dict(
    x="Fare",
    y="Sex",
    color="Survived",
    title="Fare of people based on gender",
)
fig = px.scatter(train, **scatter_options)
fig.show()

In [None]:
# Family = Parch + SibSp, computed the same way for both data sets.
for frame in (train, test):
    frame['Family'] = frame['Parch'] + frame['SibSp']

# The two source columns have been merged into Family, so drop them.
merged_columns = ['Parch', 'SibSp']
train = train.drop(columns=merged_columns)
test = test.drop(columns=merged_columns)

pd.DataFrame(train)

In [None]:
# More columns to drop: none of these are used as model inputs.
unused_columns = ['Ticket', 'Cabin', 'Name', 'PassengerId']

train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# 3. Missing values.
After removing most of the columns with missing values, only a handful of columns are still missing values, most notably Age.

**Fast and cheap** = drop all missing data or single imputation

**Good and cheap** = proper imputation or pattern submodel method (this is what I did)

**Good and fast** = gather data in a complete manner


-Start cleaning the data (names, age, embarked...)

In [None]:
# Number of missing values in each remaining training column.
train.isna().sum()

In [None]:
# Integer codes for the three embarkation ports.
port_codes = {'S': 0, 'C': 1, 'Q': 2}

# Training rows without a recorded port are assigned 'C' before encoding.
train['Embarked'] = train['Embarked'].fillna('C').map(port_codes).astype(int)

# NOTE(review): the test set is encoded without filling first, so astype(int)
# would raise if it ever contained a missing port — confirm it has none.
test['Embarked'] = test['Embarked'].map(port_codes).astype(int)

In [None]:
# There is one fare missing in test, so fill it with the median fare
# (Series.median() already ignores missing values).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is chained assignment, which does not update the
# DataFrame once pandas copy-on-write is active.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [None]:


# Change female and male to 1 and 0 for processing.
SEX_CODES = {'female': 1, 'male': 0}


def fill_missing_ages(df):
    """Fill missing ``Age`` values in place and cast the column to int.

    Every missing age is replaced by the median age (truncated to a whole
    number) of the passengers in ``df`` with the same ``Sex`` code and the
    same ``Pclass``. The medians are taken from ``df`` itself, so the train
    and test sets are each imputed from their own rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Age`` column, a ``Pclass`` column holding 1, 2 or 3,
        and a ``Sex`` column already encoded as 0/1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.

    Raises
    ------
    ValueError
        If a (Sex, Pclass) group has no known age at all (its median is NaN).
    """
    for sex in (0, 1):
        for pclass in (1, 2, 3):
            in_group = (df['Sex'] == sex) & (df['Pclass'] == pclass)
            # Series.median() skips the missing ages of the group.
            median_age = int(df.loc[in_group, 'Age'].median())
            df.loc[in_group & df['Age'].isnull(), 'Age'] = median_age

    df['Age'] = df['Age'].astype(int)
    return df


test['Sex'] = test['Sex'].map(SEX_CODES).astype(int)
fill_missing_ages(test)

#------train------
train['Sex'] = train['Sex'].map(SEX_CODES).astype(int)
fill_missing_ages(train)

train.head()

In [None]:
# Make groups of ages instead of unique ages per person:
#   0: <=16, 1: 17-32, 2: 33-48, 3: 49-64, 4: >64
# The last band previously had no assignment, so passengers older than 64
# kept their raw age instead of receiving code 4.
for frame in (test, train):
    # Five equal-width bands over the raw ages; must be computed before Age
    # is overwritten with the band codes.
    frame['AgeBand'] = pd.cut(frame['Age'], 5)

    # The codes 0-3 are all <= 16, so rows that were already recoded can
    # never match one of the later conditions.
    frame.loc[frame['Age'] <= 16, 'Age'] = 0
    frame.loc[(frame['Age'] > 16) & (frame['Age'] <= 32), 'Age'] = 1
    frame.loc[(frame['Age'] > 32) & (frame['Age'] <= 48), 'Age'] = 2
    frame.loc[(frame['Age'] > 48) & (frame['Age'] <= 64), 'Age'] = 3
    frame.loc[frame['Age'] > 64, 'Age'] = 4

# Mean survival per age band in the training set (shown as the cell output).
train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

In [None]:
# Remove the AgeBand column from both frames, then preview the result.
test, train = (frame.drop(columns='AgeBand') for frame in (test, train))
train.head()

In [None]:
# Two large factors of survival were age and class (besides, of course, gender).
# Their product is added as an extra feature so the neural net can weigh
# this combination.
train['Age*Class'] = train['Age'] * train['Pclass']
test['Age*Class'] = test['Age'] * test['Pclass']

train[['Age*Class', 'Age', 'Pclass']].head(10)

In [None]:
# Re-count the missing values per training column (expected: none left).
train.isna().sum()

# 4. Preprocess.
- When I was testing the accuracy, I split the training set into two parts, test and training. This is my final code, so I left the training set whole. But if you were to split it, this would be the ideal place.
- Building the model.


In [None]:
# Break up the training data into features and label.
# Columns are selected by name rather than by position, so the split stays
# correct even if the column order of `train` changes.
# NOTE(review): assumes every column of `train` other than 'Survived' is a
# model feature (the network below expects 8 inputs) — confirm.
target_column = 'Survived'
X_train = train.drop(columns=[target_column])
y_train = train[target_column]

# Preview the test features instead of printing the whole frame.
test.head()

In [None]:
# Feature Scaling: the scaler statistics come from the training features
# only and are then reused to transform the test features.
sc = StandardScaler()
scaled_train = sc.fit_transform(X_train)
scaled_test = sc.transform(test)

X_train = np.array(scaled_train)
X_test = np.array(scaled_test)

# Labels as a plain NumPy array for Keras.
y_train = np.array(y_train)

# 5. Processing

Here I used a sequential neural network because this is a classification problem, the data is not complicated, and there is not a lot of data. 



Rules I followed:
 *   If training is much better than the validation set, you are probably overfitting and you can use techniques like regularization.
 *   If training and validation are both low, you are probably underfitting and you can probably increase the capacity of your network and train more or longer.
 *   If there is an inflection point when training goes above the validation, you might be able to use early stopping.


In [None]:
# I tried other types of activation functions (sigmoid, tanh, softplus) and
# got worse performance; this was the best combination of layer count and
# activations that I tried.
# Dropout layers (0.5 after the first hidden layer, 0.25 after the second)
# were also tried; the original note says the score decreases with dropout.

# Seed NumPy and TensorFlow so weight initialisation and batch shuffling are
# repeatable from one Restart & Run All to the next.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 8 input features -> 16 -> 32 -> 2-way softmax.
model = Sequential([
    Dense(units=16, input_shape=(8,), activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=2, activation='softmax'),
])


In [None]:
# The hyper-parameters below (learning rate, batch size, epochs) were chosen
# by trial and error.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

trained = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=25,
    shuffle=True,
    verbose=2,
)

In [None]:
# model.fit was called without validation data, so the history only holds
# training metrics. This is the training accuracy averaged over all epochs,
# not a validation accuracy, and it is labelled accordingly.
mean_train_acc = np.mean(trained.history['accuracy'])
print("\n%s: %.2f%%" % ('mean_train_acc', mean_train_acc*100))

# Shape of the scaled test matrix.
test_shape = X_test.shape
print(test_shape)

In [None]:
# Class probabilities for every test passenger; the predicted label is the
# index of the larger of the two softmax outputs.
predictions = model.predict(x=X_test, batch_size=10, verbose=1)
rounded_predictions = np.argmax(predictions, axis=-1)

# Pair each prediction with its PassengerId by position. This avoids an
# index-based join and avoids shadowing the built-in `id`.
submission = pd.DataFrame({
    'PassengerId': og_test['PassengerId'].to_numpy(),
    'Survived': rounded_predictions,
})

submission.to_csv("NN_sol_titanic1.csv", index=False)

# Preview instead of printing the whole frame.
submission.head()

# 6. Conclusion
This is my first attempt at neural networks. I am aware that other types of models might have worked better, such as random forests. My score was 0.756 accuracy, which put me in the top 85% on the leaderboard. Feel free to critique my work. I am always happy to learn something new.

# 7. Sources 
These are all the sources I used to help me complete this project. I hope some of them may be useful to you.

Machine Learning A-Z™: Hands-On Python & R In Data Science 
https://www.udemy.com/course/machinelearning/

Keras with TensorFlow Course - Python Deep Learning and Neural Networks for Beginners Tutorial
https://www.youtube.com/watch?v=qFJeN9V1ZsI&t=131s

A Comprehensive Guide to types of Neural Networks
https://www.digitalvidya.com/blog/types-of-neural-networks/

Keras
https://keras.io/getting_started/

Difference Between a Batch and an Epoch in a Neural Network
https://machinelearningmastery.com/difference-between-a-batch-and-an-epoch/

Discover Feature Engineering, How to Engineer Features and How to Get Good at It
https://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/

How To Improve Deep Learning Performance
https://machinelearningmastery.com/improve-deep-learning-performance/

Titanic: Neural Network for Beginners
https://www.kaggle.com/jamesleslie/titanic-neural-network-for-beginners

Titanic: Machine Learning from Disaster
https://www.kaggle.com/gokultalele/titanic-machine-learning-from-disaster