In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

In [4]:
le = LabelEncoder()
pt = PowerTransformer()
ss = StandardScaler()

In [5]:
train = pd.read_csv("train_titanic.csv")
train_target = train["Survived"]
train["SSP"] = train["SibSp"] + train["Parch"]
ticket_table = dict(train["Ticket"].value_counts())
train["Alone"] = train["Ticket"].map(ticket_table)
train["Alone"] = np.where(train["Alone"]==1, 1, 0)
train = train.drop(["PassengerId", "Survived", "SibSp", "Ticket", "Parch", "Name"], axis=1)
test = pd.read_csv("test_titanic.csv")
test["SSP"] = test["SibSp"] + test["Parch"]
test_id = test["PassengerId"]
test["Alone"] = test["Ticket"].map(ticket_table)
test["Alone"] = np.where(test["Alone"]==1, 1, 0)
test = test.drop(["PassengerId", "SibSp", "Parch", "Ticket", "Name"], axis=1)

In [6]:
train["Embarked"].fillna("S", inplace=True)

In [7]:
for col in train.columns:
    if train[col].dtype != object:
        train[col].fillna(train[col].median(), inplace=True)
        test[col].fillna(train[col].median(), inplace=True)
        train[col] = ss.fit_transform(train[col].values.reshape(-1,1))
        test[col] = ss.transform(test[col].values.reshape(-1,1))
    else:
        train[col].fillna("None", inplace=True)
        test[col].fillna("None", inplace=True)

In [8]:
for col in train.columns:
    if train[col].dtype != object:
        if np.abs(train[col].skew()) > 1:
            train[col] = pt.fit_transform(train[col].values.reshape(-1,1))
            test[col] = pt.transform(test[col].values.reshape(-1,1))
        else:
            pass
    else:
        pass

In [9]:
train["Cabin"] = train["Cabin"].astype(str).str[0]
test["Cabin"] = test["Cabin"].astype(str).str[0]

In [10]:
for col in train.columns:
    if train[col].dtype == object:
        train[col] = le.fit_transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train, train_target, test_size=0.2, random_state=6)

In [12]:
rfc = RandomForestClassifier(max_depth=9, n_estimators=200)
gbc = GradientBoostingClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
abc = AdaBoostClassifier()
bc = BaggingClassifier()
lgbmc = LGBMClassifier()
xgbc = XGBClassifier(objective="binary:logistic", eval_metric="error", n_estimators=500, max_depth=2, learning_rate=0.1)
xgbrfc = XGBRFClassifier(objective="binary:logistic", eval_metric="error", max_depth=5, n_estimators=200, learning_rate=0.001)
cbc = CatBoostClassifier(verbose=0)

In [59]:
models = []
models.append(("RFC", rfc))
models.append(("GBC", gbc))
models.append(("ABC", abc))
models.append(("BC", bc))
models.append(("LGBMC", lgbmc))
models.append(("XGBC", xgbc))
models.append(("XGBRFC", xgbrfc))
models.append(("CBC", cbc))

In [60]:
names = []
results = []

In [61]:
for name, model in models:
    kfold = KFold(n_splits=25, shuffle=True)
    result = cross_val_score(model, train, train_target, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(result)
    print("Estimator: {} | Mean: {} | SD: {} | Min: {} | Max: {}".format(name, result.mean().round(3), result.std().round(3), result.min().round(3), result.max().round(3)))

Estimator: RFC | Mean: 0.821 | SD: 0.053 | Min: 0.722 | Max: 0.889
Estimator: GBC | Mean: 0.818 | SD: 0.041 | Min: 0.694 | Max: 0.917
Estimator: ABC | Mean: 0.812 | SD: 0.074 | Min: 0.657 | Max: 0.914
Estimator: BC | Mean: 0.815 | SD: 0.061 | Min: 0.657 | Max: 0.943
Estimator: LGBMC | Mean: 0.833 | SD: 0.06 | Min: 0.694 | Max: 0.944
Estimator: XGBC | Mean: 0.834 | SD: 0.062 | Min: 0.694 | Max: 0.944
Estimator: XGBRFC | Mean: 0.823 | SD: 0.037 | Min: 0.722 | Max: 0.889
Estimator: CBC | Mean: 0.829 | SD: 0.06 | Min: 0.743 | Max: 0.971


In [40]:
param_grid = {"n_estimators":[100,200,300,400,500], "max_depth":[1,2,3,4,5], "learning_rate":[0.001,0.01,0.05,0.1,0.5]}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)

GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
                         'max_depth': [1, 2, 3, 4, 5],
                         'n_estimators': [100, 200, 300, 400, 500]})

In [41]:
grid.best_params_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}

In [13]:
vc = VotingClassifier(estimators=[("XGBRFC", xgbrfc), ("GBC", gbc), ("RFC", rfc)])

In [14]:
vc.fit(train, train_target)

VotingClassifier(estimators=[('XGBRFC',
                              XGBRFClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bytree=None,
                                              eval_metric='error', gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=0.001,
                                              max_delta_step=None, max_depth=5,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=200, n_jobs=None,
                                   

In [15]:
predictions = vc.predict(test)

In [16]:
submission = pd.DataFrame({"PassengerId":test_id, "Survived":predictions})

In [17]:
submission.to_csv("Titanic_112521_IV.csv", index=False)