In [16]:
import numpy as np
import pandas as pd

%matplotlib inline

# Import data

In [2]:
# Load the census training data from the working directory.
train = pd.read_csv("census.csv")

# Continuous (numerical) feature columns.
num_cols = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Categorical feature columns.
cat_cols = [
    'workclass',
    'education_level',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Columns that additionally get a log transform in the pipeline.
log_transform_cols = [
    'capital-loss',
    'capital-gain',
]

# Functions used for pipeline

In [3]:
# Select the categorical columns.
def get_cat_cols(X):
    """Return the subset of ``X`` made of the columns named in ``cat_cols``."""
    categorical_subset = X[cat_cols]
    return categorical_subset

# Select the numerical columns.
def get_num_cols(X):
    """Return the subset of ``X`` made of the columns named in ``num_cols``."""
    numerical_subset = X[num_cols]
    return numerical_subset

# Select the columns that need a log transform.
def get_log_transform_cols(X):
    """Return the subset of ``X`` made of the columns named in ``log_transform_cols``."""
    log_subset = X[log_transform_cols]
    return log_subset

# One-hot encode the categorical variables.
def get_dummies(X):
    """One-hot encode ``X`` with ``pandas.get_dummies`` and return the result.

    The encoding is derived solely from the values present in ``X``, so two
    frames containing different sets of categories yield different columns.
    """
    encoded = pd.get_dummies(X)
    return encoded

# Imputer for empty values in categorical variables.
# Note: a brand-new CategoricalImputer is fitted on every column each time this
# function runs, so whatever was learned from the training data is NOT reused
# when other data (e.g. the test sample) is transformed. This is not optimal;
# TODO: find a way to fit once on train and only transform afterwards.
def cat_imputer(X):
    """Apply ``CategoricalImputer().fit_transform`` to every column of ``X``."""
    def _impute_column(column):
        # Fit and transform in a single step on this column alone.
        return CategoricalImputer().fit_transform(column)

    return X.apply(_impute_column)

# Pipeline steps

In [17]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Imputer

# Log-transform branch: pick the columns listed in log_transform_cols, fill
# missing values with the column mean, then apply log(1 + x).
log_transform_steps = [
    ('get_log_transform_cols',
     FunctionTransformer(get_log_transform_cols, validate=False)),
    ('imputer', Imputer(strategy='mean')),
    ('log_transform', FunctionTransformer(np.log1p)),
]
log_transform_pipeline = Pipeline(steps=log_transform_steps)

# Numerical branch: pick the columns listed in num_cols, fill missing values
# with the column mean, then rescale with a min-max scaler.
num_cols_steps = [
    ('get_num_cols',
     FunctionTransformer(get_num_cols, validate=False)),
    ('imputer', Imputer(strategy='mean')),
    ('min_max_scaler', MinMaxScaler()),
]
num_cols_pipeline = Pipeline(steps=num_cols_steps)

# Categorical branch: pick the columns listed in cat_cols, impute missing
# values with the sklearn_pandas CategoricalImputer (via cat_imputer), then
# one-hot encode with the pandas get_dummies function.
cat_cols_steps = [
    ('get_cat_cols',
     FunctionTransformer(get_cat_cols, validate=False)),
    ('imputer',
     FunctionTransformer(cat_imputer, validate=False)),
    ('get_dummies',
     FunctionTransformer(get_dummies, validate=False)),
]
cat_cols_pipeline = Pipeline(steps=cat_cols_steps)

# Combine pipeline steps

In [13]:
# The three branch pipelines, each applied to the same input by the union.
branch_pipelines = [
    ('log_transform', log_transform_pipeline),
    ('num_cols', num_cols_pipeline),
    ('cat_cols', cat_cols_pipeline),
]
steps_ = FeatureUnion(transformer_list=branch_pipelines)

# The full pipeline wraps the union so all three branches run as one step.
full_pipeline = Pipeline(steps=[('steps_', steps_)])

# Apply pipeline on training set

In [20]:
from sklearn.model_selection import train_test_split
from sklearn_pandas import CategoricalImputer

# Binarize the target variable: '<=50K' -> 0, '>50K' -> 1.
income_to_label = {'<=50K': 0, '>50K': 1}
y = train['income'].map(income_to_label)

# Fit the preprocessing pipeline on the entire labelled data set and transform
# it. The submission data is later passed through transform() only, not
# fit_transform().
# NOTE(review): fitting before the split means the hold-out rows also influence
# the imputer/scaler statistics — confirm this is acceptable for evaluation.
X = full_pipeline.fit_transform(train)

# Hold out part of the data for evaluation. A fixed random_state makes the
# split, and therefore the reported scores, reproducible across kernel
# restarts; 42 matches the seed used for the classifier.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train data using GradientBoosting

The parameter values used below are taken from Udacity project 1.

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyperparameters for the gradient-boosting classifier; the seed is fixed so
# training is repeatable.
gb_params = {
    'learning_rate': 0.1,
    'n_estimators': 400,
    'random_state': 42,
}
model_GB = GradientBoostingClassifier(**gb_params)

# Fit on the training split; as the last expression, the fitted estimator is
# displayed as the cell output.
model_GB.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

# Score model

In [24]:
from sklearn.metrics import roc_auc_score

# Predicted probability of the positive class (second column of predict_proba)
# for both splits.
probs_train = model_GB.predict_proba(X_train)[:, 1]
probs_test = model_GB.predict_proba(X_test)[:, 1]

# ROC AUC on each split.
auc_train = roc_auc_score(y_train, probs_train)
auc_test = roc_auc_score(y_test, probs_test)

print("score train: {}".format(auc_train))
print("score test: {}".format(auc_test))

score train: 0.9375546674803292
score test: 0.9231381860251591


# Apply model on submission data

In [41]:
# Load the submission data.
submission_data = pd.read_csv("test_census.csv")

# Run it through the pipeline with transform() (not fit_transform()).
X_sub = full_pipeline.transform(submission_data)

# Copy the first column into an 'id' column (the original column is kept).
submission_data['id'] = submission_data.iloc[:, 0]

# Predicted probability of the positive class for every submission row.
income_probs = model_GB.predict_proba(X_sub)[:, 1]

# Convert the probabilities into discrete 0/1 labels using a 0.5 threshold.
submission_data['income'] = (income_probs >= 0.5).astype('int64')

# Write the id/prediction pairs to the output file.
submission_columns = ['id', 'income']
submission_data[submission_columns].to_csv("submission_GB.csv", index=False)

# save model

In [43]:
from sklearn.externals import joblib

# Path of the file the fitted classifier is serialized to.
filename = 'gradboost.joblib.pkl'

# Dump the model with compression level 9; binding the result to `_` keeps the
# return value from being echoed as the cell output.
_ = joblib.dump(
    model_GB,
    filename,
    compress=9,
)

# load model

In [44]:
# Read the object stored at `filename` back from disk.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
load_model = joblib.load(filename)
# Bare last expression so the notebook displays the loaded estimator.
load_model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)