## Laden der Daten

In [520]:
import numpy as np
import pandas as pd

In [521]:
data = pd.read_csv("../../data/titanic.csv")

In [522]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_destination
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [523]:
data.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [524]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass              1309 non-null int64
survived            1309 non-null int64
name                1309 non-null object
sex                 1309 non-null object
age                 1046 non-null float64
sibsp               1309 non-null int64
parch               1309 non-null int64
ticket              1309 non-null object
fare                1308 non-null float64
cabin               295 non-null object
embarked            1307 non-null object
boat                486 non-null object
body                121 non-null float64
home_destination    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [525]:
data.isna().any()

pclass              False
survived            False
name                False
sex                 False
age                  True
sibsp               False
parch               False
ticket              False
fare                 True
cabin                True
embarked             True
boat                 True
body                 True
home_destination     True
dtype: bool

## Umgang mit fehlenden kategorischen Werten

In [526]:
# Fill Columns Embarked with the Mode
data.embarked.fillna("C", inplace=True)

In [527]:
# Drop unnecessary columns
data.drop(["body"],axis=1, inplace=True)
data.drop(["home_destination"],axis=1, inplace=True)
data.drop(["name"],axis=1, inplace=True)
data.drop(["ticket"],axis=1, inplace=True)

In [528]:
# Create New Feature for Column Cabin
data['deck'] = data["cabin"].apply(lambda s: s[0] if pd.notnull(s) else 'M')

cabin_binary = []

In [529]:
data['deck'] = data['deck'].replace(['A', 'B', 'C'], 'ABC')
data['deck'] = data['deck'].replace(['D', 'E'], 'DE')
data['deck'] = data['deck'].replace(['F', 'G'], 'FG')
data['deck'] = data['deck'].replace("T", 'M')

Zudem noch ein binäres Feature, ob ein Gast eine Kabine hatte oder nicht.

In [530]:
for index, row in data.iterrows():
    if row["cabin"] == "N":
        cabin_binary.append(0)
    else:
        cabin_binary.append(1)

In [531]:
data["cabin_binary"] = cabin_binary

Es wird aufgeteilt nach Personen, die in einer Kabine waren und Personen, die nicht in einer Kabine waren.
Keine Kabine = 0,
Kabine = 1

In [532]:
data.drop(["cabin"],axis=1, inplace=True)

In [533]:
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked,boat,deck,cabin_binary
0,1,1,female,29.0,0,0,211.3375,S,2.0,ABC,1
1,1,1,male,0.9167,1,2,151.55,S,11.0,ABC,1
2,1,0,female,2.0,1,2,151.55,S,,ABC,1
3,1,0,male,30.0,1,2,151.55,S,,ABC,1
4,1,0,female,25.0,1,2,151.55,S,,ABC,1


In [534]:
missing_value = data[(data.pclass == 3) & 
                     (data.embarked == "S") & 
                     (data.sex == "male")].fare.mean()
# den einzelnen fehlenden Wert durch den Durschnnit
data.fare.fillna(missing_value, inplace=True)

In [535]:
data['age'] = data.groupby(['sex', 'pclass'])['age'].apply(lambda x: x.fillna(x.median()))

### Erstellen von Altersgruppen

In [536]:
age_group = []

In [537]:
for index, row in data.iterrows():
    if row["age"] <= 6:
        age_group.append("Baby")
    elif row["age"] <= 18:
        age_group.append("Child")
    elif row["age"] <= 65:
        age_group.append("Adult")
    else:
        age_group.append("Senior")

In [538]:
data["age_group"] = age_group

### Erstellen von Feature Familiengröße

In [539]:
data['family_Size'] = data['sibsp'] + data['parch'] + 1

In [540]:
data.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'boat', 'deck', 'cabin_binary', 'age_group', 'family_Size'],
      dtype='object')

In [541]:
#boat - Lifeboat (if survived)
data["boat"] = data["boat"].apply(lambda s: s[0:2] if pd.notnull(s) else 'No')

In [542]:
data.boat.unique()

array(['2', '11', 'No', '3', '10', 'D', '4', '9', '6', 'B', '8', 'A', '5',
       '7', 'C', '14', '5 ', '13', '1', '15', '8 ', '12', '16', 'C '],
      dtype=object)

In [543]:
data['boat'] = data['boat'].replace('C ', 'C')
data['boat'] = data['boat'].replace('5 ', '5')
data['boat'] = data['boat'].replace('8 ', '8')

In [544]:
data.boat.unique()

array(['2', '11', 'No', '3', '10', 'D', '4', '9', '6', 'B', '8', 'A', '5',
       '7', 'C', '14', '13', '1', '15', '12', '16'], dtype=object)

In [545]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
pclass          1309 non-null int64
survived        1309 non-null int64
sex             1309 non-null object
age             1309 non-null float64
sibsp           1309 non-null int64
parch           1309 non-null int64
fare            1309 non-null float64
embarked        1309 non-null object
boat            1309 non-null object
deck            1309 non-null object
cabin_binary    1309 non-null int64
age_group       1309 non-null object
family_Size     1309 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 133.1+ KB


## Train Test Split

In [546]:
from sklearn.model_selection import train_test_split

In [547]:
input_features = [
       'pclass', 'sex',
        'age', 'sibsp', 'parch',
        'fare', 'deck',
        'embarked', 'boat','cabin_binary',
        'age_group', 'family_Size' 
]

output_features = [
    'survived'
]

X_train, X_test, y_train, y_test = train_test_split(
    data[input_features],
    data[output_features], random_state = 111
)

## Wichtige Sk Learn Pakete laden

In [548]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing

### Column Selector erstellen

In [549]:
class ColumnSelector:
    
    def __init__(self, select_numeric=True):
        self.select_numeric = select_numeric
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.select_numeric:
            return X.select_dtypes(include=["number"])
        elif not self.select_numeric:
            return X.select_dtypes(exclude=["number"])
        
class MyLEncoder():

    def transform(self, X, y=None, **fit_params):
        enc = preprocessing.OneHotEncoder()
        encc = enc.fit(X)
        enc_data = enc.transform(X)

        return enc_data

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [550]:
num_pipeline = Pipeline(steps=[
    ("select numbers", ColumnSelector(select_numeric=True)),
    ("scale data", MinMaxScaler())
])

In [551]:
cat_pipeline = Pipeline(steps=[
    ("select non numeric", ColumnSelector(select_numeric=False)),
    ("encode data", MyLEncoder())
])

In [552]:
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [553]:
full_pipeline.fit(X_train)
X_train_p = full_pipeline.transform(X_train)
X_test_p = full_pipeline.transform(X_test)

Output Feature muss nicht transformiert werden, da es bereits mit 0 und 1 markiert ist.

## Klassifikation Logistischer Regression

In [554]:
from sklearn.linear_model import LogisticRegression

In [565]:
log_model = LogisticRegression(solver = "newton-cg")
log_model.fit(X_train_p, y_train.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [566]:
log_prediction = log_model.predict(X_test_p)

In [568]:
log_model.score(X_test_p, y_test)

0.9725609756097561

## Decision Tree Classifier

In [569]:
from sklearn.tree import DecisionTreeClassifier

In [570]:
decision_tree = DecisionTreeClassifier(max_depth= 8, min_samples_leaf= 5)
decision_tree.fit(X_train_p, y_train)



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [571]:
decision_tree_predict = decision_tree.predict(X_test_p)

In [572]:
decision_tree.score(X_test_p, y_test)

0.9603658536585366

## Klassifikation Random Forest

In [573]:
from sklearn.ensemble import RandomForestClassifier

In [579]:
rf_model = RandomForestClassifier(max_depth=10, random_state= 191, n_estimators=15)
rf_model.fit(X_train_p, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=191,
                       verbose=0, warm_start=False)

In [580]:
rf_prediction = rf_model.predict(X_test_p)

In [581]:
rf_model.score(X_test_p, y_test)

0.9664634146341463

## Klassifikation SVM

In [582]:
from sklearn.svm import SVC

In [584]:
svm = SVC( gamma="auto")
svm.fit(X_train_p, y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [585]:
svm_prediction = svm.predict(X_test_p)

In [586]:
svm.score(X_test_p, y_test)

0.9725609756097561

Alle Klassisifikationsmodelle weisen eine sehr hohe Accuracy bei der Klassifikation ob Survived oder nicht auf.


(array([0.        , 0.01932367, 1.        ]),
 array([0.        , 0.95867769, 1.        ]),
 array([2, 1, 0]))