# Titanic data analysis

In [396]:
import numpy as np
import pandas as pd

#
training = pd.read_csv('titanic_training.csv', sep=",")

## Display beginning of dataset

In [397]:

training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Display end of dataset

In [398]:
training.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## Correlations

In [399]:
cor = training.corr()
print(cor)

             PassengerId  Survived    Pclass       Age     SibSp     Parch  \
PassengerId     1.000000 -0.005007 -0.035144  0.036847 -0.057527 -0.001652   
Survived       -0.005007  1.000000 -0.338481 -0.077221 -0.035322  0.081629   
Pclass         -0.035144 -0.338481  1.000000 -0.369226  0.083081  0.018443   
Age             0.036847 -0.077221 -0.369226  1.000000 -0.308247 -0.189119   
SibSp          -0.057527 -0.035322  0.083081 -0.308247  1.000000  0.414838   
Parch          -0.001652  0.081629  0.018443 -0.189119  0.414838  1.000000   
Fare            0.012658  0.257307 -0.549500  0.096067  0.159651  0.216225   

                 Fare  
PassengerId  0.012658  
Survived     0.257307  
Pclass      -0.549500  
Age          0.096067  
SibSp        0.159651  
Parch        0.216225  
Fare         1.000000  


## Missing and Empty values

In [400]:
def get_missing(df):
    percent_missing = df.isnull().sum() / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_missing': percent_missing})
    missing_value_df.sort_values('percent_missing', inplace=True)
    return missing_value_df


print(get_missing(training))

             column_name  percent_missing
PassengerId  PassengerId         0.000000
Survived        Survived         0.000000
Pclass            Pclass         0.000000
Name                Name         0.000000
Sex                  Sex         0.000000
SibSp              SibSp         0.000000
Parch              Parch         0.000000
Ticket            Ticket         0.000000
Fare                Fare         0.000000
Embarked        Embarked         0.002245
Age                  Age         0.198653
Cabin              Cabin         0.771044


## Replace and Cleanup Dataset

In [401]:
# Map male to 0
training.loc[training['Sex'] == 'male', 'Sex'] = 0
# Map female to 1
training.loc[training['Sex'] == 'female', 'Sex'] = 1

# Map the port to S = 1 C = 2 Q = 3
training.loc[training['Embarked'] == 'S', 'Embarked'] = 1
training.loc[training['Embarked'] == 'C', 'Embarked'] = 2
training.loc[training['Embarked'] == 'Q', 'Embarked'] = 3

training.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,1
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,1
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,2
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,3


## Get all name titles

In [402]:
available_titles = training['Name'].str.split(', ').str[1]
available_titles = available_titles.str.split('.').str[0]
training["Title"] = available_titles
distinct_title = list(set(available_titles))

groupedTitles = {
    "married-female": ["Mrs", "Mme"],
    "not-married-female": ["Ms", "Miss", "Mlle"],
    "army": ["Col", "Major", "Capt"],
    "noble": ["Sir", "Master", "Lady", "Jonkheer", "the Countess", "Don"],
    "academic": ["Dr"],
    "chaplain": ["Rev"],
    "not-specified-male": ["Mr"]
}

count = 0
for category, values_to_replace in groupedTitles.items():
    training.loc[training['Title'].isin(values_to_replace), 'Title'] = count
    count = count + 1
print(training['Title'])
print(distinct_title)
print(len(distinct_title))
print(get_missing(training))

#training.head()
#print(training.astype({'Title': 'int64'}).dtypes)

0      6
1      0
2      1
3      0
4      6
      ..
886    5
887    1
888    1
889    6
890    6
Name: Title, Length: 891, dtype: object
['Don', 'Dr', 'Jonkheer', 'Mr', 'Master', 'Mme', 'Rev', 'Major', 'Sir', 'Ms', 'Col', 'Mlle', 'Lady', 'Capt', 'the Countess', 'Mrs', 'Miss']
17
             column_name  percent_missing
PassengerId  PassengerId         0.000000
Survived        Survived         0.000000
Pclass            Pclass         0.000000
Name                Name         0.000000
Sex                  Sex         0.000000
SibSp              SibSp         0.000000
Parch              Parch         0.000000
Ticket            Ticket         0.000000
Fare                Fare         0.000000
Title              Title         0.000000
Embarked        Embarked         0.002245
Age                  Age         0.198653
Cabin              Cabin         0.771044


# Summary Statistics

## Location parameters

In [403]:
import pandas as pd

numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
stats = {}
for item in numeric_columns:
    stats[item] = {
        "mean": training[item].mean(),
        "median": training[item].median(),
        "variance": training[item].var(),
        "min": training[item].min(),
        "max": training[item].max(),
    }

print(stats)

{'Age': {'mean': 29.69911764705882, 'median': 28.0, 'variance': 211.0191247463081, 'min': 0.42, 'max': 80.0}, 'SibSp': {'mean': 0.5230078563411896, 'median': 0.0, 'variance': 1.2160430774662894, 'min': 0, 'max': 8}, 'Parch': {'mean': 0.38159371492704824, 'median': 0.0, 'variance': 0.6497282437357467, 'min': 0, 'max': 6}, 'Fare': {'mean': 32.2042079685746, 'median': 14.4542, 'variance': 2469.436845743117, 'min': 0.0, 'max': 512.3292}}


## Datensatz vorbereiten

In [404]:
# Datensatz ausdünnen
new_training = training.drop(['Embarked', 'PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare', 'Age'], axis=1)
# Inputs
x_training = new_training.drop('Survived', axis=1)
# Outputs
y_training = new_training['Survived']

dim = len(x_training.columns)

## Print Training Dataset

In [405]:
x_training.head()
y_training.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Title
0,3,0,1,0,6
1,1,1,1,0,0
2,3,1,0,0,1
3,1,1,1,0,0
4,3,0,0,0,6


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Scaling

In [406]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_training = sc.fit_transform(x_training)

## Neurales Netz

In [407]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow import sigmoid
from tensorflow.keras import activations

# layers = Schichten im NN-Modell
# Dense = Neuronen im Modell
# Activation = Aktivierungregel (wie sigmoid)
# Dropout = overfitting verhindern

#Netz definieren
model = Sequential()
model.add(Dense(7, activation='relu', input_dim=dim))
model.add(Dense(5, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train NN

In [408]:
#Train
train = model.fit(x_training, y_training, batch_size=32, epochs=250, validation_split=0.2)
mean_val = np.mean(train.history['val_accuracy'])
max_val = np.max(train.history['val_accuracy'])
print(mean_val)
print(max_val)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78