# Titanic data analysis

In [51]:
import numpy as np
import pandas as pd

# Rotwein- und Weißweindaten einlesen
training = pd.read_csv('titanic_training.csv', sep=",")

## Display beginning of dataset

In [52]:

training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Display end of dataset

In [53]:
training.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## Correlations

In [56]:
cor = training.corr()
print(cor)

             PassengerId  Survived    Pclass       Age     SibSp     Parch  \
PassengerId     1.000000 -0.005007 -0.035144  0.036847 -0.057527 -0.001652   
Survived       -0.005007  1.000000 -0.338481 -0.077221 -0.035322  0.081629   
Pclass         -0.035144 -0.338481  1.000000 -0.369226  0.083081  0.018443   
Age             0.036847 -0.077221 -0.369226  1.000000 -0.308247 -0.189119   
SibSp          -0.057527 -0.035322  0.083081 -0.308247  1.000000  0.414838   
Parch          -0.001652  0.081629  0.018443 -0.189119  0.414838  1.000000   
Fare            0.012658  0.257307 -0.549500  0.096067  0.159651  0.216225   

                 Fare  
PassengerId  0.012658  
Survived     0.257307  
Pclass      -0.549500  
Age          0.096067  
SibSp        0.159651  
Parch        0.216225  
Fare         1.000000  


## Missing and Empty values

In [55]:
def get_missing(df):
    percent_missing = df.isnull().sum() / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
    missing_value_df.sort_values('percent_missing', inplace=True)
    return missing_value_df

print(get_missing(training))

             column_name  percent_missing
PassengerId  PassengerId         0.000000
Survived        Survived         0.000000
Pclass            Pclass         0.000000
Name                Name         0.000000
Sex                  Sex         0.000000
SibSp              SibSp         0.000000
Parch              Parch         0.000000
Ticket            Ticket         0.000000
Fare                Fare         0.000000
Embarked        Embarked         0.002245
Age                  Age         0.198653
Cabin              Cabin         0.771044


## Replace and Cleanup Dataset

In [15]:
# Map male to 0
training.loc[training['Sex'] == 'male', 'Sex'] = 0
# Map female to 1
training.loc[training['Sex'] == 'female', 'Sex'] = 1

# Map the port to S = 1 C = 2 Q = 3
training.loc[training['Embarked'] == 'S', 'Embarked'] = 1
training.loc[training['Embarked'] == 'C', 'Embarked'] = 2
training.loc[training['Embarked'] == 'Q', 'Embarked'] = 3

training.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,1
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,1
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,2
890,891,0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,3


## Get all name titles

In [50]:
available_titles = training['Name'].str.split(', ').str[1]
available_titles = available_titles.str.split('.').str[0]
training["Title"] = available_titles
distinct_title = list(set(available_titles))

groupedTitles = {
    "married-female": ["Mrs", "Mme"],
    "not-married-female": ["Ms", "Miss", "Mlle"],
    "army": ["Col", "Major", "Capt"],
    "noble": ["Sir", "Master", "Lady", "Jonkheer", "the Countess", "Don"],
    "academic": ["Dr"],
    "chaplain": ["Rev"],
    "not-specified-male": ["Mr"]
}

for category, values_to_replace in groupedTitles.items():
    training.loc[training['Title'].isin(values_to_replace) , 'Title'] = category


print(distinct_title)
print(len(distinct_title))
print(get_missing(training))

training.head()

['Major', 'Ms', 'Mr', 'Lady', 'Miss', 'Mrs', 'Mme', 'Col', 'Don', 'Mlle', 'Jonkheer', 'Sir', 'Rev', 'Capt', 'Master', 'the Countess', 'Dr']
17
             column_name  percent_missing
PassengerId  PassengerId         0.000000
Survived        Survived         0.000000
Pclass            Pclass         0.000000
Name                Name         0.000000
Sex                  Sex         0.000000
SibSp              SibSp         0.000000
Parch              Parch         0.000000
Ticket            Ticket         0.000000
Fare                Fare         0.000000
Title              Title         0.000000
Embarked        Embarked         0.002245
Age                  Age         0.198653
Cabin              Cabin         0.771044


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1,not-specified-male
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2,married-female
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1,not-married-female
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1,married-female
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1,not-specified-male


In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# layers = Schichten im NN-Modell
# Dense = Neuronen im Modell
# Activation = Aktivierungregel (wie sigmoid)
# Dropout = overfitting verhindern

#Netz definieren
model = Sequential()
model.add(Dense(5, activation=''))  # activation checken
# Schichten adden
model.add(Dropout(0.5))
model.add(Dense(1, activation=sigmoid()))  # sigmoid import

#Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  #adam checken

#Train
train = model.fit(training[....], training[survied], epochs=3, batch_size=32,
                  validation_split=0.2)  #epochs niedrig anfangen, validation_split 20 bis 30
val = np.mean(train.history['val_acc'])
print(val)

SyntaxError: invalid syntax (2982012744.py, line 19)