# Titanic Prediction of Survivors - Dataset: Kaggle

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, svm

# Load the two CSV files from the working directory:
# df     - data used below for training (it provides the "Survived" target),
# df_sub - data the final predictions are made for and written to the upload file.
df = pd.read_csv("train.csv")
df_sub = pd.read_csv("test.csv")


## Checking Correlations

In [None]:
# Correlate the numeric columns only. At this point the frame is still raw, so
# columns that are label-encoded later ("Name", "Embarked", "Sex") presumably
# hold text; pandas >= 2.0 raises on such columns instead of skipping them.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(corr)
# Correlation of every numeric column with "Age" and "Survived", ordered by "Survived".
print(corr[["Age", "Survived"]].sort_values(by="Survived"))

## Data Cleaning

In [None]:
# Column groups of the passenger table.
# NOTE(review): "Ticket" and "Cabin" are listed as numerical but presumably
# hold text - confirm against the CSV before relying on this list.
NUMERICAL = [
    "PassengerId",
    "Age",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
    "Survived",
    "Pclass",
]
# Columns that clean() label-encodes into integer codes.
CATEGORIAL = [
    "Name",
    "Embarked",
    "Sex",
]

In [None]:
def clean(dataframe):
    """Return a cleaned, label-encoded copy of a Titanic passenger table.

    Steps, in order:
      1. drop fully duplicated rows,
      2. drop rows whose "Sex" holds the stray numeric value 29.69911764705882,
      3. drop rows whose "Embarked" is not one of "S", "C", "Q" (this includes NaN),
      4. label-encode every column listed in ``CATEGORIAL``,
      5. fill missing "Age" and "Fare" with the mean of the row's "Pclass",
      6. drop the "Cabin" and "Name" columns.

    The frame passed in is not modified. As a side effect the per-class mean
    age is printed, one line per passenger class.

    Args:
        dataframe: table containing at least the columns "Sex", "Embarked",
            "Pclass", "Age", "Fare", "Cabin" and every column in ``CATEGORIAL``.

    Returns:
        A new DataFrame with the rows and columns described above removed.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Boolean masks instead of drop-by-index-label: they remove exactly the
    # offending rows even when the index contains repeated labels.
    dataframe = dataframe.drop_duplicates()

    # One invalid data point carries the mean age in the "Sex" column.
    dataframe = dataframe[dataframe["Sex"] != 29.69911764705882]

    # Rows without a valid port of embarkation are discarded; the copy makes
    # the column assignments below operate on an independent frame.
    dataframe = dataframe[dataframe["Embarked"].isin(["S", "C", "Q"])].copy()

    # Encode the categorical columns as integer codes.
    for cat in CATEGORIAL:
        encoder = preprocessing.LabelEncoder()
        dataframe[cat] = encoder.fit_transform(dataframe[cat])

    # Missing ages (and fares) get the mean of the passenger class, because
    # the class shows the highest correlation with age.
    for pclass in dataframe["Pclass"].unique():
        in_class = dataframe["Pclass"] == pclass
        dataframe.loc[dataframe["Age"].isnull() & in_class, "Age"] = dataframe.loc[in_class, "Age"].mean()
        print(pclass, dataframe.loc[in_class, "Age"].mean())
        dataframe.loc[dataframe["Fare"].isnull() & in_class, "Fare"] = dataframe.loc[in_class, "Fare"].mean()

    # "Cabin" is present for only about a quarter of the rows and "Name" is
    # not used as a feature. drop() returns a new frame, so its result has to
    # be the value that is handed back.
    return dataframe.drop(columns=["Cabin", "Name"])

# Run both tables through the same cleaning function and rebind the names to
# the cleaned results.
df = clean(df)
df_sub = clean(df_sub)

## Feature "Sex" might be important?

In [None]:
# Passenger counts per (encoded) sex, split by survival outcome. Colouring by
# "Survived" rather than by "Sex" itself is what lets the plot show whether
# the feature separates survivors from non-survivors.
df_survived = df[["Sex", "Survived"]].copy()
sns.histplot(data=df_survived, x="Sex", hue="Survived", multiple="dodge", bins=2)

## SVM with train_test-split

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Feature matrix and target taken from the cleaned training frame.
X = df.loc[:, ["Sex", "Age", "Fare", "Pclass", "Embarked"]].copy()
y = df.loc[:, "Survived"].copy()

# Keep 30 % of the rows aside for evaluation; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

# Support vector classifier with a linear kernel, fitted on the training part.
clf = svm.SVC(C=1, kernel='linear', random_state=20)
clf.fit(X_train, y_train)

# Accuracy on the rows that were kept aside.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

## SVM with Cross Val

In [None]:
# Support vector classifier with a linear kernel, scored by cross-validation
# with five folds on the complete feature matrix.
clf = svm.SVC(C=1, kernel='linear', random_state=20)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
# Average of the five fold scores.
print(scores.mean())

## Random Forest with train_test-split

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (depth limited to 3, Gini criterion), fitted on the
# training part of the split created earlier.
clf = RandomForestClassifier(
    criterion="gini",
    max_depth=3,
    random_state=20,
)
clf.fit(X_train, y_train)

# Accuracy on the held-out part of the split.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

## Random Forest with Cross Val

In [None]:
# Same shallow random forest, scored by cross-validation with five folds on
# the complete feature matrix.
clf = RandomForestClassifier(
    criterion="gini",
    max_depth=3,
    random_state=20,
)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
# Average of the five fold scores.
print(scores.mean())

## Stick to the best result for the test data: SVM with train_test-split + write the upload CSV

In [None]:
# Train on every cleaned training row this time (no hold-out split).
X_train = df.loc[:, ["Sex", "Age", "Fare", "Pclass", "Embarked"]].copy()
y_train = df.loc[:, "Survived"].copy()

# The same feature columns, taken from the cleaned submission frame.
X_test = df_sub.loc[:, ["Sex", "Age", "Fare", "Pclass", "Embarked"]].copy()

# Linear-kernel support vector classifier with the settings used above.
clf = svm.SVC(C=1, kernel='linear', random_state=20)
clf.fit(X_train, y_train)

# Predicted labels for the submission rows.
y_pred = clf.predict(X_test)
print(y_pred)

In [None]:
# Two-column upload table: the passenger ids of the submission frame next to
# the predicted labels, written without the index column.
df_rdy = pd.DataFrame(
    {
        "PassengerId": df_sub["PassengerId"].copy(),
        "Survived": y_pred,
    }
)
df_rdy.to_csv("submit.csv", index=False)
