In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

data_train = data_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
data_test = data_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

embarked_map = {'S': 1, 'C': 2, 'Q': 3}
sex_map = {'male': -1, 'female': 1}

data_train['Embarked'] = data_train['Embarked'].map(embarked_map)
data_test['Embarked'] = data_test['Embarked'].map(embarked_map)

data_train['Sex'] = data_train['Sex'].map(sex_map)
data_test['Sex'] = data_test['Sex'].map(sex_map)

data_train['Age'] = pd.to_numeric(data_train['Age'], errors='coerce')
data_test['Age'] = pd.to_numeric(data_test['Age'], errors='coerce')

data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())

data_train['Embarked'] = data_train['Embarked'].fillna(1)
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())

In [4]:
X_train, y_train = data_train.drop('Survived', axis=1), data_train.Survived
X_test, y_test = data_test, pd.read_csv('gender_submission.csv').Survived

In [26]:
estimators = [
    ('RandomForest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('BaggingDecisionTree', BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100)),
    ('BaggingSVC', BaggingClassifier(base_estimator=SVC(C=5), n_estimators=100)),
    ('BaggingKNN', BaggingClassifier(base_estimator=KNeighborsClassifier(), n_estimators=100)),
]

In [27]:
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

clf.fit(X_train, y_train)

StackingClassifier(estimators=[('RandomForest', RandomForestClassifier()),
                               ('AdaBoost', AdaBoostClassifier()),
                               ('BaggingDecisionTree',
                                BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                                  n_estimators=100)),
                               ('BaggingSVC',
                                BaggingClassifier(base_estimator=SVC(C=5),
                                                  n_estimators=100)),
                               ('BaggingKNN',
                                BaggingClassifier(base_estimator=KNeighborsClassifier(),
                                                  n_estimators=100))],
                   final_estimator=LogisticRegression())

In [28]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print("Accuracy train: ", train_acc)
print("Accuracy test: ", test_acc)

Accuracy train:  0.9809203142536476
Accuracy test:  0.8133971291866029
