In [47]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [49]:
to_drop = ["PassengerId", "Name", "Cabin", "Ticket"]

df_train.drop(to_drop,axis=1, inplace= True)
df_test.drop(to_drop,axis=1, inplace= True)

In [50]:
df_train=df_train.dropna(subset=["Embarked"])
df_test=df_test.dropna(subset=["Embarked"])

In [51]:
df_train = pd.get_dummies(df_train,
                        columns=["Sex", "Embarked"],
                        drop_first=True)

df_test = pd.get_dummies(df_test,
                        columns=["Sex", "Embarked"],
                        drop_first=True)

In [52]:
labels = df_train[['Survived']]
labels.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [53]:
features = df_train[df_train.columns]
features.drop(['Survived'],
            axis=1,
            inplace=True)

#features.head()

In [54]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()

filled_features = imputer.fit_transform(features)
filled_test = imputer.transform(df_test)

In [55]:
filled_features = pd.DataFrame(filled_features, columns = features.columns)
filled_test = pd.DataFrame(filled_test, columns = df_test.columns)

In [61]:
filled_features.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3.0,22.0,1.0,0.0,7.25,1.0,0.0,1.0
1,1.0,38.0,1.0,0.0,71.2833,0.0,0.0,0.0
2,3.0,26.0,0.0,0.0,7.925,0.0,0.0,1.0
3,1.0,35.0,1.0,0.0,53.1,0.0,0.0,1.0
4,3.0,35.0,0.0,0.0,8.05,1.0,0.0,1.0


In [62]:
filled_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3.0,34.5,0.0,0.0,7.8292,1.0,1.0,0.0
1,3.0,47.0,1.0,0.0,7.0,0.0,0.0,1.0
2,2.0,62.0,0.0,0.0,9.6875,1.0,1.0,0.0
3,3.0,27.0,0.0,0.0,8.6625,1.0,0.0,1.0
4,3.0,22.0,1.0,1.0,12.2875,0.0,0.0,1.0


In [60]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(filled_features),
                        columns=features.columns)
scaled_test = pd.DataFrame(scaler.transform(filled_test),
                        columns=features.columns)

In [63]:
scaled_features.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.825209,-0.585051,0.43135,-0.474326,-0.50024,0.735342,-0.307941,0.616794
1,-1.572211,0.594799,0.43135,-0.474326,0.788947,-1.359911,-0.307941,-1.621287
2,0.825209,-0.290088,-0.475199,-0.474326,-0.48665,-1.359911,-0.307941,0.616794
3,-1.572211,0.373577,0.43135,-0.474326,0.422861,-1.359911,-0.307941,0.616794
4,0.825209,0.373577,-0.475199,-0.474326,-0.484133,0.735342,-0.307941,0.616794


In [64]:
scaled_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.825209,0.336707,-0.475199,-0.474326,-0.488579,0.735342,3.247377,-1.621287
1,0.825209,1.258464,0.43135,-0.474326,-0.505273,-1.359911,-0.307941,0.616794
2,-0.373501,2.364573,-0.475199,-0.474326,-0.451165,0.735342,3.247377,-1.621287
3,0.825209,-0.216348,-0.475199,-0.474326,-0.471802,0.735342,-0.307941,0.616794
4,0.825209,-0.585051,0.43135,0.765897,-0.398819,-1.359911,-0.307941,0.616794


In [65]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(scaled_features,
                                labels)

Using TensorFlow backend.


In [66]:
X_res = pd.DataFrame(X_res,
                    columns=features.columns)
X_res.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.825209,-0.585051,0.43135,-0.474326,-0.50024,0.735342,-0.307941,0.616794
1,-1.572211,0.594799,0.43135,-0.474326,0.788947,-1.359911,-0.307941,-1.621287
2,0.825209,-0.290088,-0.475199,-0.474326,-0.48665,-1.359911,-0.307941,0.616794
3,-1.572211,0.373577,0.43135,-0.474326,0.422861,-1.359911,-0.307941,0.616794
4,0.825209,0.373577,-0.475199,-0.474326,-0.484133,0.735342,-0.307941,0.616794


In [68]:
y_res = pd.DataFrame(y_res,
                    columns=labels.columns)
y_res.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [69]:
y_res['Survived'].value_counts()

1    549
0    549
Name: Survived, dtype: int64

In [70]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

In [71]:
svc = SVC(probability=True)
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()
ada = AdaBoostClassifier()
xgb = xgb.XGBClassifier(n_estimators=300)

In [72]:
X_train = X_res
Y_train = y_res

X_test = scaled_test

In [73]:
svc.fit(X_train,
        Y_train)
knn.fit(X_train,
        Y_train)
rfc.fit(X_train,
        Y_train)
ada.fit(X_train,
        Y_train)
xgb.fit(X_train,
        Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [76]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators=[('svc', svc), ('knn', knn), ('rfc', rfc), ('ada', ada), ('xgb', xgb)]
stack = StackingClassifier(estimators=estimators,
                        final_estimator=LogisticRegression())

In [77]:
stack.fit(X_train,
        Y_train)

StackingClassifier(cv=None,
                   estimators=[('svc',
                                SVC(C=1.0, break_ties=False, cache_size=200,
                                    class_weight=None, coef0=0.0,
                                    decision_function_shape='ovr', degree=3,
                                    gamma='scale', kernel='rbf', max_iter=-1,
                                    probability=True, random_state=None,
                                    shrinking=True, tol=0.001, verbose=False)),
                               ('knn',
                                KNeighborsClassifier(algorithm='auto',
                                                     leaf_size=30,
                                                     metric='minkowski',
                                                     met...
                                              seed=None, silent=None,
                                              subsample=1, verbosity=1))],
                   final_es

In [78]:
stack_predictions = stack.predict(X_test)

In [81]:
# save predictions to csv

In [83]:
dd = pd.DataFrame({'Survived': stack_predictions }, index=[892+i for i in range(418)])
dd.index.name='PassengerId'
print(dd.head())
dd.to_csv('stack_predictions.csv')

KeyError: "None of ['PassengerId'] are in the columns"