<a id='sec0'></a>
# Outputting a file for 1st submission
1. <a href='#sec1'>Import Modules and Data</a><br>
<br>
2. <a href='#sec2'>Digitize both train and test sets</a><br>
<br>
3. <a href='#sec3'>Create subset of feature spaces</a>
    - Use features from F001 univariate selection<br>
<br>
4. <a href='#sec4'>Resample train set</a>
    - Tomek Link majority
    - Undersample majority, oversample minority<br>
<br>  
5. <a href='#sec5'>Train XGBoost, make a prediction, save result</a><br>

<a id='sec1'></a>
# Import Modules and Data
(<a href='#sec0'>back to top</a>)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

%matplotlib inline

In [2]:
from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)

In [38]:
train = pd.read_csv('train.csv', header=0)
test = pd.read_csv('test.csv', header=0)

In [39]:
print('Train Set Shape: ', train.shape)
print(' Test Set Shape: ', test.shape)

Train Set Shape:  (595212, 59)
 Test Set Shape:  (892816, 58)


<a id='sec2'></a>
# Digtizing train and test sets
(<a href='#sec0'>back to top</a>)

In [5]:
# Get different kinds of features
all_fs = train.columns[2:]
binary_fs = sorted([f for f in all_fs if '_bin' in f])
categorical_fs = sorted([f for f in all_fs if '_cat' in f])
other_fs = sorted([f for f in all_fs
            if f not in binary_fs
            if f not in categorical_fs])

print("# total of features: %8d" % len(all_fs))
print("# of binary features: %7d" % len(binary_fs))
print("# of categorical features: %1d" % len(categorical_fs))
print("# of other features: %8d" % len(other_fs))

# total of features:       57
# of binary features:      17
# of categorical features: 14
# of other features:       26


In [6]:
# Keep Binary Features as they are
train_binaries = train[binary_fs]
test_binaries = test[binary_fs]

print('Train Set Binaries Shape: ', train_binaries.shape)
print(' Test Set Binaries Shape: ', test_binaries.shape)

Train Set Binaries Shape:  (595212, 17)
 Test Set Binaries Shape:  (892816, 17)


In [7]:
# Encode Categorical Features
train_categoricals = []
test_categoricals = []

for i, fs in enumerate(categorical_fs):
    train_categoricals.append(encode_my_categorical_labels(train[fs]))
    test_categoricals.append(encode_my_categorical_labels(test[fs]))

train_categoricals = pd.concat(train_categoricals, axis=1)
test_categoricals = pd.concat(test_categoricals, axis=1)

print('Train Set Categoricals Shape: ', train_categoricals.shape)
print(' Test Set Categoricals Shape: ', test_categoricals.shape)

Train Set Categoricals Shape:  (595212, 184)
 Test Set Categoricals Shape:  (892816, 184)


In [8]:
# Digitize Ordinal/Continuous Features
train_others = train[other_fs]
train_others.loc[:, 'ps_car_12'] = bin_myFeature(train_others.loc[:, 'ps_car_12'], 0, 1.5, bins=20)[1]
train_others.loc[:, 'ps_car_13'] = bin_myFeature(train_others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
train_others.loc[:, 'ps_car_13'] = bin_myFeature(train_others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
train_others.loc[:, 'ps_car_14'] = bin_myFeature(train_others.loc[:, 'ps_car_14'], 0, 4, bins=40)[1]
train_others.loc[:, 'ps_reg_03'] = bin_myFeature(train_others.loc[:, 'ps_reg_03'], 0, 5, bins=50)[1]

test_others = test[other_fs]
test_others.loc[:, 'ps_car_12'] = bin_myFeature(test_others.loc[:, 'ps_car_12'], 0, 1.5, bins=20)[1]
test_others.loc[:, 'ps_car_13'] = bin_myFeature(test_others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
test_others.loc[:, 'ps_car_13'] = bin_myFeature(test_others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
test_others.loc[:, 'ps_car_14'] = bin_myFeature(test_others.loc[:, 'ps_car_14'], 0, 4, bins=40)[1]
test_others.loc[:, 'ps_reg_03'] = bin_myFeature(test_others.loc[:, 'ps_reg_03'], 0, 5, bins=50)[1]

print('Train Set Ordinal/Continuous Shape: ', train_others.shape)
print(' Test Set Ordinal/Continuous Shape: ', test_others.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Train Set Ordinal/Continuous Shape:  (595212, 26)
 Test Set Ordinal/Continuous Shape:  (892816, 26)


In [9]:
# Re-combined different types of features
trainset_digitized = pd.concat([train_others, train_binaries, train_categoricals, train.target], axis=1)
testset_digitized = pd.concat([test_others, test_binaries, test_categoricals], axis=1)

print('Digitized Train Set Shape: ', trainset_digitized.shape)
print('Digitized  Test Set Shape: ', testset_digitized.shape)

Digitized Train Set Shape:  (595212, 228)
Digitized  Test Set Shape:  (892816, 227)


In [10]:
# Save the datasets
if True:
    trainset_digitized.to_csv('./data/digitized_trainset1.csv', index=False)
    testset_digitized.to_csv('./data/digitized_testset1.csv', index=False)

<a id='sec3'></a>
# Selecting Features
(<a href='#sec0'>back to top</a>)

In [13]:
# Get a list of features selected after univariate selection and RFE
rfe = pd.read_csv('./data/rfe_features.csv')
rfe_features = list(rfe.columns)

In [14]:
trainset_ready = trainset_digitized[rfe_features]
testset_ready = testset_digitized[rfe_features]

In [15]:
print('Final Train Set Shape: ', trainset_ready.shape)
print('Final  Test Set Shape: ', testset_ready.shape)

Final Train Set Shape:  (595212, 108)
Final  Test Set Shape:  (892816, 108)


<a id='sec4'></a>
# Resample Train Set
(<a href='#sec0'>back to top</a>)

In [16]:
X = np.array(trainset_ready)
y = np.array(trainset_digitized.iloc[:, -1])

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (595212, 108)
y shape:  (595212,)


In [19]:
%%time
# Tomek Links to denoise majority
if True:
    tl = TomekLinks(n_jobs=8, ratio='majority')
    X_tl, y_tl = tl.fit_sample(X, y)

    print('Before tomek links: size of X: ', X.shape)
    print('After  tomek links: size of X: ', X_tl.shape)
    print('Before tomek links: class0/class1 = %d / %d' % (len(y)-np.sum(y), np.sum(y)))
    print('After  tomek links: class0/class1 = %d / %d' % (len(y_tl)-np.sum(y_tl), np.sum(y_tl)))

Before tomek links: size of X:  (595212, 108)
After  tomek links: size of X:  (587812, 108)
Before tomek links: class0/class1 = 573518 / 21694
After  tomek links: class0/class1 = 566118 / 21694
CPU times: user 1h 44min 15s, sys: 611 ms, total: 1h 44min 16s
Wall time: 13min 42s


In [68]:
# Set targets for the number of each class
# Here, set class0:class1 = 1:1
num_class1 = np.sum(y_tl)
num_class1_to_resample = 5 * num_class1
num_class0_to_resample = int(3 * num_class1_to_resample)

# First, randomly undersample the majority
rus = RandomUnderSampler(ratio={0: num_class0_to_resample , 1: num_class1})
X_tlrus, y_tlrus = rus.fit_sample(X_tl, y_tl)

# Then use SMOTE to oversample the minority
smote = SMOTE(ratio={0: num_class0_to_resample , 1: num_class1_to_resample}, n_jobs=4)
X_res, y_res = smote.fit_sample(X_tlrus, y_tlrus)

In [69]:
# Print Resampling Results
print('Before Resampling: size of X: ', X_tl.shape)
print('After  Resampling: size of X: ', X_res.shape)
print('Before Resampling: class0/class1 =%7d/%6d' % (len(y_tl)-np.sum(y_tl), np.sum(y_tl)))
print('After  Resampling: class0/class1 =%7d/%6d' % (len(y_res)-np.sum(y_res), np.sum(y_res)))

Before Resampling: size of X:  (587812, 108)
After  Resampling: size of X:  (433880, 108)
Before Resampling: class0/class1 = 566118/ 21694
After  Resampling: class0/class1 = 325410/108470


<a id='sec5'></a>
# Train XGBoost, predict probabilities, save to a file
(<a href='#sec0'>back to top</a>)

In [70]:
# Train XGBoost
clf = XGBClassifier(gamma=9, subsample=0.85, max_depth=11, 
                    min_child_weight=4, learning_rate=0.05, 
                    n_estimators=200, n_jobs=8)

clf.fit(X_res, y_res)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=9, learning_rate=0.05, max_delta_step=0,
       max_depth=11, min_child_weight=4, missing=None, n_estimators=200,
       n_jobs=8, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.85)

In [71]:
X_test = np.array(testset_ready)
y_ids = np.array(test.id)

In [72]:
X_test.shape, y_ids.shape

((892816, 108), (892816,))

In [73]:
y_probas = clf.predict_proba(X_test)

In [74]:
y_probas.shape

(892816, 2)

In [75]:
prediction_result = pd.DataFrame({'id': y_ids, 'target': y_probas[:, 1]})
prediction_result = prediction_result[['id', 'target']]

In [76]:
prediction_result.head(10)

Unnamed: 0,id,target
0,0,0.052074
1,1,0.039917
2,2,0.045769
3,3,0.028076
4,4,0.070076
5,5,0.086788
6,6,0.026964
7,8,0.105791
8,10,0.077828
9,11,0.0841


In [77]:
prediction_result.to_csv('./porto_seguro_submission_RH4.csv', index=False, header=True)