In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Examine Data

In [2]:
# Load the training data (one row per passenger) and preview the first rows.
train = pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Column dtypes: Name, Sex, Ticket, Cabin and Embarked are object (string) columns
# that are encoded or dropped further down.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
# Missing values per column; in the recorded output only Age, Cabin and Embarked are incomplete.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# NOTE(review): .str.count("NaN") counts occurrences of the literal text "NaN" inside each
# cabin string; it does not count missing values. Missing cabins propagate as NaN and present
# cabins give 0.0 (see the output). The number of missing cabins is train["Cabin"].isna().sum().
train["Cabin"].str.count("NaN")

0      NaN
1      0.0
2      NaN
3      0.0
4      NaN
      ... 
886    NaN
887    0.0
888    NaN
889    0.0
890    NaN
Name: Cabin, Length: 891, dtype: float64

In [6]:
# Summary statistics. describe() only summarises numeric columns by default, so the
# object-typed Sex and Embarked columns are left out of the resulting table.
train[["Pclass","Sex","Age","SibSp", "Parch","Embarked"]].describe()

Unnamed: 0,Pclass,Age,SibSp,Parch
count,891.0,714.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594
std,0.836071,14.526497,1.102743,0.806057
min,1.0,0.42,0.0,0.0
25%,2.0,20.125,0.0,0.0
50%,3.0,28.0,0.0,0.0
75%,3.0,38.0,1.0,0.0
max,3.0,80.0,8.0,6.0


In [7]:
 #sanity check
# Both columns must hold 891 distinct values (NaN would count as a value of its own).
assert train["PassengerId"].nunique(dropna=False) == 891
assert train["Name"].nunique(dropna=False) == 891

In [8]:
# Number of distinct values per column; a missing value counts as one distinct value.
print("different tickets: ", train["Ticket"].nunique(dropna=False))
print("different fares:   ", train["Fare"].nunique(dropna=False))

# apparently, cabin assignment was only known for a few passengers, from a document found on a victim after recovery
print("different cabins:  ", train["Cabin"].nunique(dropna=False))

different tickets:  681
different fares:    248
different cabins:   148


In [9]:
train["Survived"].sum() # 342 of the 891 passengers survived -> 549 victims

342

## Preprocess Data

In [94]:
# make categorical features 
# trying out different methods
from sklearn.preprocessing import LabelEncoder  
le = LabelEncoder()

def preprocess_inplace(df):
    """Replace the string columns Embarked, Sex, Cabin and Ticket of ``df`` with integer codes.

    The frame is modified in place and nothing is returned. Three different
    encoders are used side by side: the module-level ``LabelEncoder``
    instance ``le`` for Embarked, pandas category codes for Sex and Ticket,
    and ``pd.factorize`` for Cabin.

    NOTE(review): every encoding is derived from ``df`` itself on each call,
    so codes produced for two different frames (e.g. train and test) are not
    guaranteed to refer to the same original values.
    """
    embarked_codes = le.fit_transform(df["Embarked"])
    df["Embarked"] = embarked_codes

    sex_as_category = df["Sex"].astype("category")
    df["Sex"] = sex_as_category.cat.codes

    # factorize returns (codes, uniques); only the codes are kept.
    cabin_codes, _cabin_uniques = pd.factorize(df["Cabin"])
    df["Cabin"] = cabin_codes

    # this one I like most
    ticket_as_category = df["Ticket"].astype("category")
    df["Ticket"] = ticket_as_category.cat.codes
# Encode the training frame in place; the function returns nothing.
preprocess_inplace(train)

In [91]:
# Preview after encoding: Sex, Ticket, Cabin and Embarked now hold integer codes (missing Cabin shows as -1).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,523,7.25,-1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,596,71.2833,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,669,7.925,-1,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,49,53.1,1,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,472,8.05,-1,2


In [12]:
#sns.pairplot(train)
#plt.show()
# no immediate dependency between Survived and any other column is apparent

### Text Feature

In [102]:



# Bag-of-words over the passenger names, presumably to gauge how many features a text embedding would add.
vectorizer = CountVectorizer()
text = train["Name"]

# X is not used again in the cells shown in this notebook.
X = vectorizer.fit_transform(text)
# Vocabulary size. This is not the last expression of the cell, so its value is not displayed.
len(vectorizer.get_feature_names_out())

def embbed_text(df):
    """Return a new frame equal to ``df`` without the free-text ``Name`` column.

    No text embedding is performed yet; the column is simply discarded.
    The frame passed in is not modified. Raises ``KeyError`` when ``df``
    has no ``Name`` column.
    """
    # TODO (if feature explosion can be handled somehow) implement embedding
    without_name = df.drop(labels=["Name"], axis="columns")
    return without_name
    
# Rebinds `train` to a frame without Name; running this line a second time raises KeyError
# because the column is already gone.
train = embbed_text(train)

# Solve Task

In [26]:
# work with validation set
random_state = 42
y_in = train["Survived"]
# NOTE(review): X_in keeps PassengerId, which the sanity check earlier shows is unique per row;
# it identifies rows rather than describing passengers, so consider dropping it from the features.
X_in = train.drop(columns="Survived")
# An integer test_size is an absolute row count: 50 rows are held out for validation.
X_train, X_valid, y_train, y_valid= train_test_split(X_in,y_in , test_size=50, random_state=random_state)

In [27]:
# Preview the training features (Survived removed, PassengerId still present).
X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
448,449,3,0,5.0,2,1,194,19.2583,-1,0
820,821,1,0,52.0,1,1,67,93.5,138,2
360,361,3,1,40.0,1,4,337,27.9,-1,2
802,803,1,1,11.0,1,2,33,120.0,72,2
280,281,3,1,65.0,0,0,290,7.75,-1,1


In [28]:
# Neither kind of error is preferred over the other, so accuracy is the only metric we care about.
def evaluate_performance(y_true, y_pred):
    """Build a text report with the confusion matrix and the accuracy of ``y_pred``.

    Both metrics are computed against ``y_true``. The report is returned as
    a string; nothing is printed here.
    """
    report_parts = [
        "confusion matrix:\n",
        str(confusion_matrix(y_true, y_pred)),
        "\nmain metric (Accuracy): ",
        str(accuracy_score(y_true, y_pred)),
    ]
    return "".join(report_parts)

In [82]:
def xgclass(X_train, y_train, X_valid, y_valid = None, tree_method= "hist", n_estimators=2, max_depth=2,
            eta=0.3, objective='binary:logistic'):
    """Fit an XGBoost classifier on the training data and predict labels for ``X_valid``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``XGBClassifier.fit``.
    X_valid
        Features to predict on.
    y_valid
        Optional true labels for ``X_valid``. When given, the report built by
        ``evaluate_performance`` is printed. Pass ``None`` (e.g. for an
        unlabeled test set) to skip the evaluation.
    tree_method, n_estimators, max_depth, eta, objective
        Forwarded unchanged to ``XGBClassifier``.

    Returns
    -------
    The predictions returned by ``predict(X_valid)``.

    Besides the optional evaluation report, the per-feature importances of
    the fitted model are always printed.
    """
    bst = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, eta=eta, objective=objective,
                        enable_categorical=True, tree_method=tree_method)
    # fit model
    bst.fit(X_train, y_train)
    # make predictions
    preds = bst.predict(X_valid)

    model_identifier = "tree_method: " + tree_method

    # Compare against None explicitly: the truth value of a pandas Series (or any
    # multi-element array) is ambiguous and `if y_valid:` raises ValueError for it.
    if y_valid is not None:
        print(model_identifier, "\n" + evaluate_performance(y_valid, preds))

    importances = bst.feature_importances_
    # Take the labels from the training frame instead of a hard-coded list, so they
    # cannot drift out of sync with the columns; fall back to positional names for
    # inputs without column labels (e.g. numpy arrays).
    columns = getattr(X_train, "columns", None)
    if columns is None:
        feature_names = [f"f{position}" for position in range(len(importances))]
    else:
        feature_names = [str(column) for column in columns]
    feature_importance = [f"{name} {score:.3f}" for name, score in zip(feature_names, importances)]
    print(feature_importance)
    return preds

In [61]:
# dataset is imbalanced towards having more victims -> trivial baseline is majority class classifier
# (predict class 0, i.e. "did not survive", for every validation row)
y_base_line = np.zeros(len(y_valid))
print("baseline majority class classifier\n" + evaluate_performance(y_valid, y_base_line))

baseline majority class classifier
confusion matrix:
[[32  0]
 [18  0]]
main metric (Accuracy): 0.64


In [78]:
# Depth-3 trees with eta=1; n_estimators is not passed, so xgclass's default of 2 trees applies.
# NOTE(review): this cell's execution count (78) is lower than that of the cell defining
# xgclass (82), so the recorded output may stem from an earlier version of the function —
# re-run top to bottom to confirm.
xgclass(X_train, y_train, X_valid, y_valid, "hist", max_depth = 3, eta=1, objective='binary:logistic')

tree_method: hist 
confusion matrix:
[[30  2]
 [ 5 13]]
main metric (Accuracy): 0.86
['PassengerId 0.016', 'Pclass 0.158', 'Sex 0.553', 'Age 0.044', 'SibSp 0.000', 'Parch 0.000', 'Ticket 0.054', 'Fare 0.050', 'Cabin 0.125', 'Embarked 0.000']


array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0])

In [79]:
# Same run with deeper trees (max_depth=5); the recorded accuracy on the 50 validation rows stays at 0.86.
xgclass(X_train, y_train, X_valid, y_valid, "hist", max_depth = 5, eta=1, objective='binary:logistic')

tree_method: hist 
confusion matrix:
[[31  1]
 [ 6 12]]
main metric (Accuracy): 0.86
['PassengerId 0.009', 'Pclass 0.161', 'Sex 0.611', 'Age 0.027', 'SibSp 0.042', 'Parch 0.006', 'Ticket 0.021', 'Fare 0.022', 'Cabin 0.081', 'Embarked 0.018']


array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

# Create Submission

In [129]:
test_raw = pd.read_csv("data/test.csv")
test = test_raw.copy()
preprocess_inplace(test)
test = embbed_text(test)

# preprocess_inplace derives the Ticket and Cabin codes from whichever frame it is given, so
# codes computed on the test file alone do not refer to the same tickets/cabins as the codes
# the model is trained on. Re-derive both columns from the categories of the training file.
# `train` has already been encoded in place, hence the raw file is read again. Values never
# seen in training get -1, the code missing cabins already have.
train_raw = pd.read_csv("data/train.csv")
# Same scheme as `.astype('category').cat.codes` on the training tickets (sorted categories).
ticket_categories = train_raw["Ticket"].astype("category").cat.categories
test["Ticket"] = pd.Categorical(test_raw["Ticket"], categories=ticket_categories).codes
# Same scheme as `pd.factorize` on the training cabins (order of first appearance).
cabin_categories = pd.factorize(train_raw["Cabin"])[1]
test["Cabin"] = pd.Categorical(test_raw["Cabin"], categories=cabin_categories).codes.astype("int64")
# NOTE(review): Sex and Embarked keep the codes assigned by preprocess_inplace; they only match
# the training codes if the test file contains the same set of non-missing values — confirm.

# y_valid is None: the test file has no labels, so no evaluation report is printed.
pred = xgclass(X_in, y_in, test, None, "hist", max_depth = 3, eta=1, objective='binary:logistic')
print(len(pred))
# Build the two-column submission as a new frame instead of inserting into a column slice of `test`.
submission = test[["PassengerId"]].assign(Survived=list(pred))
submission.to_csv("result/submission.csv", index=False)
submission.head()

['PassengerId 0.018', 'Pclass 0.120', 'Sex 0.574', 'Age 0.052', 'SibSp 0.000', 'Parch 0.000', 'Ticket 0.051', 'Fare 0.054', 'Cabin 0.131', 'Embarked 0.000']
418


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
