In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

In [2]:
le = LabelEncoder()
pt = PowerTransformer()
ss = StandardScaler()

In [3]:
# A few matters of preprocessing.
# We can see SibSP and Parch are related, so we combine them into SSP and delete them.
# Families or groups traveling together could do so under a single ticket, so we create a dictionary of these value countst.
# Finally, we create a feature on whether a person was alone based on how many time their ticket appears.
train = pd.read_csv("train_titanic.csv")
train_target = train["Survived"]
train["SSP"] = train["SibSp"] + train["Parch"]
ticket_table = dict(train["Ticket"].value_counts())
train["Alone"] = train["Ticket"].map(ticket_table)
train["Alone"] = np.where(train["Alone"]==1, 1, 0)
train = train.drop(["PassengerId", "Survived", "SibSp", "Ticket", "Parch", "Name"], axis=1)
test = pd.read_csv("test_titanic.csv")
test["SSP"] = test["SibSp"] + test["Parch"]
test_id = test["PassengerId"]
test["Alone"] = test["Ticket"].map(ticket_table)
test["Alone"] = np.where(test["Alone"]==1, 1, 0)
test = test.drop(["PassengerId", "SibSp", "Parch", "Ticket", "Name"], axis=1)

In [4]:
# Two people in Train had no embarkment location, so we assign them to Southampton.
train["Embarked"].fillna("S", inplace=True)

In [5]:
# Mean imputation for non-objects; "None" imputed for objects.
for col in train.columns:
    if train[col].dtype != object:
        train[col].fillna(train[col].median(), inplace=True)
        test[col].fillna(train[col].median(), inplace=True)
        train[col] = ss.fit_transform(train[col].values.reshape(-1,1))
        test[col] = ss.transform(test[col].values.reshape(-1,1))
    else:
        train[col].fillna("None", inplace=True)
        test[col].fillna("None", inplace=True)

In [6]:
# If the skew of a given column is greater than 1 (absolute value), we use PowerTransformer to adjust.
for col in train.columns:
    if train[col].dtype != object:
        if np.abs(train[col].skew()) > 1:
            train[col] = pt.fit_transform(train[col].values.reshape(-1,1))
            test[col] = pt.transform(test[col].values.reshape(-1,1))
        else:
            pass
    else:
        pass

In [7]:
# Cabins are most indicative when looking at the first letter, so we extract it from the string.
# If there is no cabin, we just call it "N."
train["Cabin"] = train["Cabin"].astype(str).str[0]
test["Cabin"] = test["Cabin"].astype(str).str[0]

In [8]:
# Label encoding.
for col in train.columns:
    if train[col].dtype == object:
        train[col] = le.fit_transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))

In [9]:
# To iteratively compare train vs. test scores, we do a split, although we will just use the entire train set for fitting.
X_train, X_test, y_train, y_test = train_test_split(train, train_target, test_size=0.2, random_state=6)

In [10]:
# A number of different models to evaluate. Some (high performers) have been iteratively adjusted based on the grid below.
rfc = RandomForestClassifier(max_depth=9, n_estimators=200)
gbc = GradientBoostingClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
abc = AdaBoostClassifier()
bc = BaggingClassifier()
lgbmc = LGBMClassifier()
xgbc = XGBClassifier(objective="binary:logistic", eval_metric="error", n_estimators=500, max_depth=2, learning_rate=0.1)
xgbrfc = XGBRFClassifier(objective="binary:logistic", eval_metric="error", max_depth=5, n_estimators=200, learning_rate=0.001)
cbc = CatBoostClassifier(verbose=0)

In [11]:
models = []
models.append(("RFC", rfc))
models.append(("GBC", gbc))
models.append(("ABC", abc))
models.append(("BC", bc))
models.append(("LGBMC", lgbmc))
models.append(("XGBC", xgbc))
models.append(("XGBRFC", xgbrfc))
models.append(("CBC", cbc))

In [12]:
names = []
results = []

In [13]:
for name, model in models:
    kfold = KFold(n_splits=25, shuffle=True)
    result = cross_val_score(model, train, train_target, cv=kfold, scoring="accuracy")
    names.append(name)
    results.append(result)
    print("Estimator: {} | Mean: {} | SD: {} | Min: {} | Max: {}".format(name, result.mean().round(3), result.std().round(3), result.min().round(3), result.max().round(3)))

Estimator: RFC | Mean: 0.82 | SD: 0.068 | Min: 0.667 | Max: 0.943
Estimator: GBC | Mean: 0.817 | SD: 0.07 | Min: 0.611 | Max: 0.943
Estimator: ABC | Mean: 0.801 | SD: 0.075 | Min: 0.611 | Max: 0.917
Estimator: BC | Mean: 0.807 | SD: 0.067 | Min: 0.629 | Max: 0.917
Estimator: LGBMC | Mean: 0.822 | SD: 0.071 | Min: 0.629 | Max: 0.943
Estimator: XGBC | Mean: 0.832 | SD: 0.065 | Min: 0.667 | Max: 1.0
Estimator: XGBRFC | Mean: 0.819 | SD: 0.064 | Min: 0.694 | Max: 0.917
Estimator: CBC | Mean: 0.823 | SD: 0.05 | Min: 0.722 | Max: 0.917


In [16]:
# Here's the last grid.
param_grid = {"n_estimators":[100,200,300,400,500], "max_depth":[1,2,3,4,5], "learning_rate":[0.001,0.01,0.05,0.1,0.5]}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)

GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
                         'max_depth': [1, 2, 3, 4, 5],
                         'n_estimators': [100, 200, 300, 400, 500]})

In [17]:
# Results of the last one.
grid.best_params_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}

In [18]:
# After a few iterations, landed on a voting classifier with XGBoost's RFC and Scikit-learn's GBC and RFC.
vc = VotingClassifier(estimators=[("XGBRFC", xgbrfc), ("GBC", gbc), ("RFC", rfc)])

In [19]:
vc.fit(train, train_target)

VotingClassifier(estimators=[('XGBRFC',
                              XGBRFClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bytree=None,
                                              eval_metric='error', gamma=None,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=0.001,
                                              max_delta_step=None, max_depth=5,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=200, n_jobs=None,
                                   

In [20]:
predictions = vc.predict(test)

In [21]:
submission = pd.DataFrame({"PassengerId":test_id, "Survived":predictions})

In [22]:
submission.to_csv("Titanic_112521_IV.csv", index=False)