In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, Imputer, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
import lightgbm
import sys
sys.path.insert(0,'../scripts/')
from data_loader import load_data, extract_X_y, splitting
from sklearn_pandas import DataFrameMapper
from pipeline_utils import AddTwoCategoricalVariables, ColumnExtractor, DropColumns, FeatureNormalizer, FillNa, Numerical2Categorical, SexBinarizer, OneHotEncoding, FillNa2
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Raw CSV inputs, each paired with the .pkl path handed to load_data alongside it.
train_path, train_path_pkl = '../data/train.csv', '../data/train.pkl'
test_path, test_path_pkl = '../data/test.csv', '../data/test.pkl'

In [3]:
# Load the training set via the project helper. The second argument is a .pkl
# path -- presumably a cached copy of the parsed CSV; confirm in data_loader.
titanic_data = load_data(train_path, train_path_pkl)

INFO:data_loader:
Timings: 'load_data'  112.45 ms


In [4]:
# Load the test set with the same helper and path pairing as the training set.
# NOTE(review): test_data is not used by any cell visible in this section.
test_data = load_data(test_path, test_path_pkl)

INFO:data_loader:
Timings: 'load_data'  72.22 ms


### Splitting

In [5]:
# Separate the loaded training data into X and y. Judging by the names these
# are features and target; which column becomes y is decided inside
# data_loader.extract_X_y -- confirm there.
X, y = extract_X_y(titanic_data)

INFO:data_loader:
Timings: 'extract_X_y'  59.56 ms


In [6]:
# Produce train/validation partitions of X and y. Split ratio, shuffling and
# seeding are decided inside data_loader.splitting -- TODO confirm it is seeded
# so the split is reproducible.
# NOTE(review): the cross-validation cell below scores on the full X, y, not on
# X_train / X_val.
X_train, X_val, y_train, y_val = splitting(X,y)

INFO:data_loader:
Timings: 'splitting'  19.33 ms


### Feature Engineering

In [25]:
# Edges and labels passed to Numerical2Categorical for the "Age" column.
# Presumably each label names the interval between two consecutive edges, so
# the label count is tied to the edge count here -- confirm in pipeline_utils.
age_range = [0, 15, 35, 50, 80]
age_label = list(range(len(age_range) - 1))

In [46]:
def cross_val_accuracy(features=None, target=None, cv=5, scoring="roc_auc"):
    """Cross-validate the feature-engineering + XGBoost pipeline.

    Despite the function name, the default metric is ROC AUC, not accuracy.

    Parameters
    ----------
    features : DataFrame-like, optional
        Input data containing the columns "Fare", "Pclass", "Sex", "Age",
        "SibSp" and "Parch". Defaults to the notebook-level ``X``.
    target : array-like, optional
        Labels aligned with ``features``. Defaults to the notebook-level ``y``.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``.
    scoring : str, default "roc_auc"
        Forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(mean_score, pipeline)`` -- the mean of the per-fold scores and the
        pipeline object built here.
        NOTE: ``cross_val_score`` is expected to fit clones of the estimator,
        so the returned pipeline should be treated as unfitted; call
        ``pipeline.fit(...)`` before predicting with it.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if features is None:
        features = X
    if target is None:
        target = y

    # Column-wise preprocessing; df_out=True keeps a DataFrame so the later
    # steps can address columns by name.
    mapper = DataFrameMapper([
        ("Fare", [FillNa2("mean"), MinMaxScaler()], {'alias': 'Fare'}),
        ("Pclass", [FillNa2("mean"), MinMaxScaler()], {'alias': 'Pclass'}),
        # NOTE(review): "Sex" is filled with strategy "mean" before
        # SexBinarizer runs -- confirm FillNa2 copes with a non-numeric column.
        ("Sex", FillNa2("mean"), {'alias': "Sex"}),
        ("Age", FillNa2("mean"), {'alias': "Age"}),
        ("SibSp", [FillNa2("mean"), MinMaxScaler()], {'alias': 'SibSp'}),
        ("Parch", [FillNa2("mean"), MinMaxScaler()], {'alias': 'Parch'}),
    ], df_out=True)

    # The derived column names used below ("Age_cat", "Age_cat_Sex",
    # "Sex_Pclass") are presumably created by the preceding project
    # transformers -- verify against pipeline_utils.
    # NOTE(review): whatever column "add_age_sex_class" produces is not listed
    # in the OneHotEncoding step -- confirm that is intended.
    pipeline = Pipeline([
        ("mapper", mapper),
        ("sex_binarizer", SexBinarizer()),
        ("num2cat", Numerical2Categorical("Age", age_range, age_label)),
        ("add_age_sex", AddTwoCategoricalVariables("Age_cat", "Sex")),
        ("add_sex_class", AddTwoCategoricalVariables("Sex", "Pclass")),
        ("add_age_sex_class", AddTwoCategoricalVariables("Age_cat_Sex", "Pclass")),
        ("one_hot_encoding", OneHotEncoding(["Age_cat_Sex", "Sex_Pclass"])),
        ("drop_columns", DropColumns(["Age_cat"])),
        ('xgb', XGBClassifier())
    ])

    scores = cross_val_score(pipeline, features, target, cv=cv, scoring=scoring)
    mean_score = scores.mean()  # computed once; used for both print and return
    print("cross-validation score: {0:0.4f}".format(mean_score))
    return mean_score, pipeline

In [47]:
# NOTE: `scores` receives the mean cross-validation score (a single number),
# not the per-fold array; `pipeline` is the pipeline object built inside
# cross_val_accuracy.
scores, pipeline = cross_val_accuracy()

cross-validation score: 0.8616
