In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, Imputer, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
import lightgbm
import sys
sys.path.insert(0,'../scripts/')
from data_loader import load_data, extract_X_y, splitting
from sklearn_pandas import DataFrameMapper
from pipeline_utils import AddTwoCategoricalVariables, ColumnExtractor, DropColumns, FeatureNormalizer, FillNa, Numerical2Categorical, SexBinarizer, OneHotEncoding, FillNa2
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Locations handed to load_data: for each dataset, the CSV file and a .pkl
# file that share the same stem under ../data/.
train_path, train_path_pkl = ('../data/train' + ext for ext in ('.csv', '.pkl'))
test_path, test_path_pkl = ('../data/test' + ext for ext in ('.csv', '.pkl'))

In [3]:
# Load the labelled training data via the project helper, which receives both
# the CSV path and the .pkl path (presumably a cached copy -- confirm in data_loader).
titanic_data = load_data(train_path, train_path_pkl)

INFO:data_loader:
Timings: 'load_data'  3.47 ms


In [4]:
# Load the test data the same way as the training data (CSV path plus .pkl path).
test_data = load_data(test_path, test_path_pkl)

INFO:data_loader:
Timings: 'load_data'  3.80 ms


### Splitting

In [5]:
# Separate the loaded training frame into X and y with the project helper
# (presumably feature columns and the target column -- confirm in data_loader).
X, y = extract_X_y(titanic_data)

INFO:data_loader:
Timings: 'extract_X_y'  1.97 ms


In [6]:
# Split X/y into training and validation parts. The split ratio and any random
# seed are decided inside `splitting` and are not visible in this notebook.
X_train, X_val, y_train, y_val = splitting(X,y)

INFO:data_loader:
Timings: 'splitting'  3.49 ms


### Feature Engineering

In [29]:
# Kept for reference -- column list for the ColumnExtractor-based pipeline variant:
# feature_columns = ["Fare", "Pclass", "Sex", "Age", "SibSp", "Parch"]

# Passed to Numerical2Categorical("Age", ...): presumably the age bin edges,
# with one integer label per interval between consecutive edges.
age_range = [0, 15, 35, 50, 80]
age_label = list(range(len(age_range) - 1))

# Columns handed to FeatureNormalizer at the end of the pipeline.
normalize_features = [
    "Fare",
    "SibSp",
    "Parch",
]

In [44]:
def cross_val_accuracy(scoring="roc_auc", cv=5):
    """Build the DataFrameMapper-based pipeline and cross-validate it on the training split.

    The pipeline selects columns with a ``DataFrameMapper``, applies the
    project transformers from ``pipeline_utils`` and ends in a
    ``LogisticRegression`` classifier. It is scored with ``cross_val_score``
    on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    scoring : str, default "roc_auc"
        Metric name forwarded to ``cross_val_score``.
    cv : int, default 5
        Number of folds forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(mean_score, pipeline)``: the mean of the fold scores and the
        pipeline object that was scored.
    """
    # NOTE(review): the numeric columns are listed twice -- once as a group
    # through FillNa2 and once as untouched pass-throughs. The later steps refer
    # to 'Age', 'Pclass', ... by name, which presumably resolves to the
    # pass-through (non-imputed) copies; confirm the mapper's output column names.
    mapper = DataFrameMapper([
        (["Fare", "Pclass", "Age", "SibSp", "Parch"], FillNa2("mean")),
        ("Sex", None),
        ("Age", None),
        ("Pclass", None),
        ("SibSp", None),
        ("Parch", None),
        ("Fare", None),
    ], df_out=True)

    # NOTE(review): ColumnExtractor instances are passed to
    # AddTwoCategoricalVariables below, whereas the ColumnExtractor-based
    # definition of this function passes plain column names; confirm which form
    # AddTwoCategoricalVariables accepts.
    pipeline = Pipeline([
        ("mapper", mapper),
        ("sex_binarizer", SexBinarizer()),
        ("num2cat", Numerical2Categorical("Age", age_range, age_label)),
        ("add_age_sex", AddTwoCategoricalVariables("Age_cat", ColumnExtractor("Sex"))),
        ("add_sex_class", AddTwoCategoricalVariables(ColumnExtractor("Sex"), "Pclass")),
        ("add_age_sex_class", AddTwoCategoricalVariables("Age_cat_Sex", ColumnExtractor("Pclass"))),
        ("one_hot_encoding", OneHotEncoding(["Age_cat_Sex", "Sex_Pclass"])),
        ("drop_columns", DropColumns(["Age_cat"])),
        ("feature_normalizer", FeatureNormalizer(normalize_features)),
        # The final step is a logistic regression, so it is named "clf" rather than "xgb".
        ("clf", LogisticRegression()),
    ])

    # The pipeline ends in a classifier, so it is evaluated with cross-validation
    # (which supplies y) instead of calling fit_transform on it.
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring=scoring)
    mean_score = fold_scores.mean()
    print("cross-validation score: {0:0.4f}".format(mean_score))
    return mean_score, pipeline

In [None]:
def cross_val_accuracy(feature_columns=None):
    """Cross-validate the ColumnExtractor-based logistic-regression pipeline.

    NOTE: this definition has the same name as the mapper-based
    ``cross_val_accuracy`` defined in an earlier cell and replaces it once this
    cell is run.

    Parameters
    ----------
    feature_columns : list of str, optional
        Columns selected by the first pipeline step. Defaults to
        ``["Fare", "Pclass", "Sex", "Age", "SibSp", "Parch"]``.

    Returns
    -------
    tuple
        ``(mean_score, pipeline)``: the mean 5-fold accuracy on the
        notebook-level ``X_train`` / ``y_train`` and the pipeline object that
        was scored.
    """
    if feature_columns is None:
        # Default column set, so the function does not depend on a
        # notebook-level global having been defined by another cell.
        feature_columns = ["Fare", "Pclass", "Sex", "Age", "SibSp", "Parch"]

    pipeline = Pipeline([
        ("column_extractor", ColumnExtractor(feature_columns)),
        ("fill_na", FillNa("mean")),
        ("sex_binarizer", SexBinarizer()),
        ("num2cat", Numerical2Categorical("Age", age_range, age_label)),
        ("add_age_sex", AddTwoCategoricalVariables("Age_cat", "Sex")),
        ("add_sex_class", AddTwoCategoricalVariables("Sex", "Pclass")),
        ("add_age_sex_class", AddTwoCategoricalVariables("Age_cat_Sex", "Pclass")),
        ("one_hot_encoding", OneHotEncoding(["Age_cat_Sex", "Sex_Pclass"])),
        ("drop_columns", DropColumns(["Age_cat"])),
        ("feature_normalizer", FeatureNormalizer(normalize_features)),
        ("clf", LogisticRegression()),
    ])

    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="accuracy")
    mean_score = fold_scores.mean()
    print("cross-validation score: {0:0.4f}".format(mean_score))
    return mean_score, pipeline

In [45]:
# cross_val_accuracy returns the mean fold score together with the pipeline,
# so `scores` holds a single number here (not the per-fold array).
scores, pipeline = cross_val_accuracy()

KeyError: 'the label [Sex] is not in the [columns]'

In [None]:
# Column-wise preprocessing for the second modelling attempt. Each entry is
# (column selector, transformer[, options]); a transformer of None passes the
# column through unchanged.
# - 'Sex' and 'Ticket' are encoded with LabelBinarizer.
# - 'Age' and 'Fare' go through Imputer with its default settings.
# - NOTE(review): the 'Some variable' alias on 'SibSp' looks like a placeholder
#   output name -- confirm or remove.
# - default=False: per the sklearn_pandas docs, columns not listed here are
#   dropped from the output.
mapper = DataFrameMapper([
    ('Pclass', None),
    ('Sex', LabelBinarizer()),
    (['Age'], [Imputer()]),
    ('SibSp', None, {'alias': 'Some variable'}),
    (['Ticket'], [LabelBinarizer()]),
    (['Fare'], Imputer())
    ], default=False)

In [None]:
# Second modelling attempt: DataFrameMapper features -> StandardScaler -> LogisticRegression.
# NOTE: this rebinds `pipeline`, replacing the object previously assigned from
# cross_val_accuracy().
pipeline = Pipeline([
    ('feature_mapper', mapper),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

In [None]:
# Preview the preprocessed features. The last pipeline step is a classifier,
# not a transformer, so only the steps before it are fitted and applied here.
# The result is wrapped in a DataFrame because the scaler returns a NumPy
# array, which has no .head().
pd.DataFrame(Pipeline(pipeline.steps[:-1]).fit_transform(X_train, y_train)).head()

In [None]:
# 5-fold cross-validated ROC AUC of `pipeline`; rebinds `scores` to the per-fold array.
# NOTE(review): this scores on the full X, y, whereas cross_val_accuracy uses
# X_train, y_train -- presumably the held-out X_val rows take part in the folds
# here; confirm this is intended.
scores = cross_val_score(pipeline, X, y, cv=5, scoring="roc_auc")

In [None]:
# Average of the per-fold scores held in `scores`.
scores.mean()

In [None]:
# Show only the first rows instead of dumping the whole training frame.
X_train.head()