In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
import sys
from sklearn_utils.preprocessing import ColumnSelector, ColumnDropper, DTypeTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import clone
import numpy as np

In [2]:
def prepend_pipeline_steps(search_dict, name):
    res = {f"{name}__{key}": val for key, val in search_dict.items()}
    return res

In [3]:
data_dir = Path(".").absolute().parent / "data"
data = pd.read_csv(data_dir / "train.csv")
X = data[[c for c in data.columns if c != "Survived"]]
y = data[["Survived"]]
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Preprocessing

In [4]:
NUM_COLS = ["Fare", "Age"]
NOM_COLS = ["Sex", "Embarked"]
ORD_COLS = ["Pclass"]
ALL_COLS = NUM_COLS + NOM_COLS + ORD_COLS 

# select columns
c_selector = ColumnSelector(ALL_COLS)

# nominal_pipeline
nom_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore")),
])

# ordinal_pipeline
ord_pipe = Pipeline([
    ("encode", OrdinalEncoder(categories=[[1, 2, 3,]])),
    ("impute", SimpleImputer(strategy="most_frequent")),
])

# numeric pipeline
num_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("impute", SimpleImputer(strategy="median")),
])

preprocessor = ColumnTransformer(
    [("nom", nom_pipe, NOM_COLS),
    ("ord", ord_pipe, ORD_COLS),
    ("num", num_pipe, NUM_COLS)],
)

pp_pipe = Pipeline([
    ("col_select", c_selector),
    ("preprocess", preprocessor),
    ])

# Model Selection and Hyperparmeter Tuning

In [5]:
posible_models = {
    "logistic_regression": {
        "model": LogisticRegression(solver="liblinear"),
        "params": {
            "C": [0.1, 1, 10],
            "penalty": ["l1", "l2"]
        }
    },
    "svm": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10, 100],
            "kernel": ["linear" ,"poly", "rbf"]
        }
    },
    "naive_bayes": {
        "model": GaussianNB(),
        "params": {}
    },
    "random_forrest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [50, 100, 200, 500],
            "max_depth": [None, 16, 32]
        }
    },
    "adaboost": {
        "model": AdaBoostClassifier(),
        "params": {
            "n_estimators": [30, 50, 100, 200],
            "learning_rate": [0.1, 1]
        }
    },
}

scores = []
for mod_name, mod_params in posible_models.items():
    pipe = clone(pp_pipe)
    pipe.steps.append(["clf", mod_params["model"]])
    mod_params = prepend_pipeline_steps(
        mod_params["params"], "clf",
    )
    pipe = GridSearchCV(pipe, mod_params)
    pipe.fit(X_train, np.ravel(y_train))
    scores.append({
        "model": mod_name,
        "best_score": pipe.best_score_,
        "best_params": pipe.best_params_,
    })
scores

[{'model': 'logistic_regression',
  'best_score': 0.7980136909437773,
  'best_params': {'clf__C': 10, 'clf__penalty': 'l1'}},
 {'model': 'svm',
  'best_score': 0.8338345864661655,
  'best_params': {'clf__C': 1, 'clf__kernel': 'rbf'}},
 {'model': 'naive_bayes', 'best_score': 0.7740769835035349, 'best_params': {}},
 {'model': 'random_forrest',
  'best_score': 0.8128268432274716,
  'best_params': {'clf__max_depth': 32, 'clf__n_estimators': 100}},
 {'model': 'adaboost',
  'best_score': 0.8189541016720906,
  'best_params': {'clf__learning_rate': 1, 'clf__n_estimators': 200}}]

## Ensembling best models together

In [6]:
estimators = [
    ("logreg", LogisticRegression(C=0.1, penalty= 'l2')),
    ("svm", SVC(C=1, kernel="rbf")),
    ("naive_bayes", GaussianNB()),
    ("rforrest", RandomForestClassifier(max_depth=16)),
    ("adaboost", AdaBoostClassifier(n_estimators=200))
]

In [7]:
voting_clf = clone(pp_pipe)
voting_clf.steps.append(["clf", VotingClassifier(estimators, n_jobs=-1)])
voting_clf.fit(X_train, np.ravel(y_train))
voting_score = voting_clf.score(X_test, y_test)
voting_score

0.8026905829596412

In [8]:
stacking_clf = clone(pp_pipe)
stacking_clf.steps.append(["clf", StackingClassifier(estimators, n_jobs=-1)])
stacking_clf.fit(X_train, np.ravel(y_train))
stacking_score = stacking_clf.score(X_test, np.ravel(y_test))
stacking_score

0.7937219730941704

## Final Model Training

In [9]:
test_data = pd.read_csv(data_dir / "test.csv")
stacking_clf = clone(pp_pipe)
stacking_clf.steps.append(["clf", StackingClassifier(estimators, n_jobs=-1)])
stacking_clf.fit(X, np.ravel(y))

predictions = stacking_clf.predict(test_data)
df_out = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Survived": predictions})
df_out.to_csv(data_dir / "pred.csv", index=False)