### In this notebook I use a blending (stacking) approach to classify Titanic passengers. The base models are gradient boosting, random forest, extra trees and AdaBoost. The mixer (meta-model) is logistic regression.

##### Due to the small amount of data, this approach performs worse than the other ensemble algorithms.

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
import sklearn
import xgboost

In [3]:
# Load the Titanic training data from a CSV file in the working directory.
titanic_train = pd.read_csv("titanic_train.csv")
# Bare expression so the notebook renders the frame as this cell's output.
titanic_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
def preprocessing(data):
    """Derive title and deck features and drop the raw identifier columns.

    The input frame is not modified: the new columns are added to a copy,
    so the function can be applied repeatedly to the same raw DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``Name``, ``Cabin``,
        ``PassengerId`` and ``Ticket``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Salutation`` and ``Deck`` appended and
        ``PassengerId``, ``Ticket``, ``Name`` and ``Cabin`` removed.
    """
    # First capitalised word that is followed by a period, e.g. "Mr." or "Miss.".
    salutation = data["Name"].str.extract(pat="([A-Z][a-z]*\\.)", expand=False)
    # First character of the cabin string; stays missing when Cabin is missing.
    deck = data["Cabin"].str.extract(pat="(^.)", expand=False)

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    return (
        data
        .assign(Salutation=salutation, Deck=deck)
        .drop(columns=["PassengerId", "Ticket", "Name", "Cabin"])
    )

In [5]:
# Replace the raw frame with its engineered version (Salutation/Deck added,
# identifier columns dropped). The name is rebound, so this cell expects the
# raw frame loaded from CSV as its input.
titanic_train = titanic_train.pipe(preprocessing)

In [6]:
# "Survived" is the target; every remaining column is a feature.
X = titanic_train.drop(columns="Survived")
y = titanic_train["Survived"]

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; the seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=50,
)

In [9]:
# Split the feature names by dtype so each group can get its own
# preprocessing pipeline.

# integer and float columns
numerical_features = X.select_dtypes(include=["int", "float"]).columns

# object-dtype columns
categorical_features = X.select_dtypes(include="object").columns

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric columns: fill missing values with the column mean, then apply
# StandardScaler.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns: fill missing values with a constant placeholder, then
# one-hot encode. Categories not seen during fit are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [13]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its matching pipeline.
data_transformer = ColumnTransformer(
    transformers=[
        ("numerical", numerical_transformer, numerical_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

In [14]:
# Fit the column transformer on the training split only, then reuse the
# already-fitted transformer on the hold-out split.
X_train_transform = data_transformer.fit_transform(X_train)
X_test_transform = data_transformer.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# All of these estimators are randomised. Without a fixed seed the scores
# printed below, and everything later built on these fitted models, change on
# every run. The value matches the one used for train_test_split.
SEED = 50

rf_clf = RandomForestClassifier(random_state=SEED)
et_clf = ExtraTreesClassifier(random_state=SEED)
gb_clf = GradientBoostingClassifier(random_state=SEED)
ab_clf = AdaBoostClassifier(random_state=SEED)
xgb_clf = XGBClassifier(random_state=SEED)

# Fit every base model on the training split and report its accuracy and F1
# on the hold-out split.
for clf in [rf_clf, et_clf, gb_clf, ab_clf, xgb_clf]:
    clf.fit(X_train_transform, y_train)
    pred = clf.predict(X_test_transform)

    model_name = clf.__class__.__name__
    print(model_name, "accuracy_score:", accuracy_score(y_test, pred))
    print(model_name, "f1_score:", f1_score(y_test, pred), "\n")

RandomForestClassifier accuracy_score: 0.7982062780269058
RandomForestClassifier f1_score: 0.7398843930635839 

ExtraTreesClassifier accuracy_score: 0.8026905829596412
ExtraTreesClassifier f1_score: 0.7411764705882352 

GradientBoostingClassifier accuracy_score: 0.8071748878923767
GradientBoostingClassifier f1_score: 0.7570621468926554 

AdaBoostClassifier accuracy_score: 0.7937219730941704
AdaBoostClassifier f1_score: 0.7553191489361704 

XGBClassifier accuracy_score: 0.7892376681614349
XGBClassifier f1_score: 0.7218934911242604 



In [59]:
# Training features for the mixer: the class probabilities each base model
# predicts on the hold-out split, placed side by side (one column block per
# model, in the order listed).
X_train_mixer = pd.concat(
    [
        pd.DataFrame(model.predict_proba(X_test_transform))
        for model in (gb_clf, rf_clf, et_clf, ab_clf)
    ],
    axis=1,
)

In [60]:
# LogisticRegression is not imported by any earlier cell, so on a fresh
# kernel this cell raised NameError. Import it here, next to its first use.
from sklearn.linear_model import LogisticRegression

# Mixer (meta-model): learns how to combine the base models' probabilities.
mixer_clf = LogisticRegression()

# The mixer's features were predicted on the hold-out split, so its target is
# the hold-out labels (y_test), not y_train.
mixer_clf.fit(X_train_mixer, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [61]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the mixer on the hold-out meta-features, scored
# with the estimator's default scorer.
cross_val_score(
    estimator=mixer_clf,
    X=X_train_mixer,
    y=y_test,
    cv=5,
)

array([0.86666667, 0.82222222, 0.82222222, 0.75      , 0.86363636])

# Classifying the Kaggle test dataset

In [34]:
titanic_test = pd.read_csv("titanic_test.csv")

# Keep the passenger ids now: preprocessing drops this column, and they are
# needed later as the index of the submission.
PassengerId = titanic_test["PassengerId"]

In [35]:
# Apply the same feature engineering as for the training data, then the
# already-fitted column transformer (transform only, no refit).
titanic_test = data_transformer.transform(preprocessing(titanic_test))

In [37]:
# Mixer inputs for the Kaggle test set: class probabilities from each base
# model, side by side, in the same model order used to train the mixer.
X_test_mixer = pd.concat(
    [
        pd.DataFrame(model.predict_proba(titanic_test))
        for model in (gb_clf, rf_clf, et_clf, ab_clf)
    ],
    axis=1,
)

In [38]:
# Final class predictions from the mixer, computed from the stacked
# base-model probabilities for the Kaggle test set.
gender_submission = mixer_clf.predict(X_test_mixer)

In [39]:
# Wrap the predicted labels in a one-column frame indexed by the passenger
# ids saved before preprocessing.
gender_submission = pd.DataFrame(
    data={"Survived": gender_submission},
    index=PassengerId,
)
gender_submission

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [40]:
# Write the submission file to the working directory. The frame's index holds
# the passenger ids and is written alongside the "Survived" column.
gender_submission.to_csv("gender_submission.csv")