<a id='sec0'></a>
# Outputting a file for 1st submission
1. <a href='#sec1'>Import Modules and Data</a><br>
<br>
2. <a href='#sec2'>Digitize both train and test sets</a><br>
<br>
3. <a href='#sec3'>Create subset of feature spaces</a>
    - Use features from F001 univariate selection<br>
<br>
4. <a href='#sec4'>Resample train set</a>
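    - Tomek Links to denoise the majority class (currently disabled in the code)
    - Randomly undersample the majority class, then oversample the minority class with SMOTE<br>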
<br>  
5. <a href='#sec5'>Train XGBoost, make a prediction, save result</a><br>

<a id='sec1'></a>
# Import Modules and Data
(<a href='#sec0'>back to top</a>)

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from xgboost import XGBClassifier

%matplotlib inline

In [2]:
from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)

In [3]:
def _read(path):
    """Read one CSV file, taking the column names from its first row."""
    return pd.read_csv(path, header=0)


# *_ori frames supply the `target` labels and the `id` column used later on;
# the *_probas frames are the feature tables the model is trained/evaluated on.
train_ori = _read('train.csv')
test_ori = _read('test.csv')
train = _read('./data/train_probas.csv')
test = _read('./data/test_probas.csv')

In [4]:
# Report (rows, columns) of both feature tables.
for caption, frame in (('Train Set Shape: ', train), (' Test Set Shape: ', test)):
    print(caption, frame.shape)

Train Set Shape:  (595212, 31)
 Test Set Shape:  (892816, 31)


In [5]:
# Overall frequency of the target in the training labels:
# (sum of the target column) / (number of training rows).
targets = train_ori['target']
num_samples = targets.shape[0]
num_target = targets.sum()
freq_target = num_target / num_samples
freq_target

0.036447517859182946

In [6]:
# Fill missing values in the last five test columns with the overall target
# frequency of the train set (freq_target).
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): the columns are selected by position (-5:) — confirm they are
# still the intended ones if the column order of the test file changes.
test.iloc[:, -5:] = test.iloc[:, -5:].replace({np.nan: freq_target})

In [7]:
# Start from every column of the train feature table and discard the ones that
# are not fed to the model.
used_fs = list(train.columns)
for dropped in ('calc_bin_proba',
                'ps_calc_01_proba',
                'ps_calc_02_proba',
                'ps_calc_03_proba',
                'ps_calc_04_proba',
                'ps_calc_06_proba',
                'ps_calc_08_proba',
                'ps_calc_09_proba'):
    # list.remove raises ValueError if the column is missing from the table
    used_fs.remove(dropped)

In [8]:
# Display the feature names that remain after the removals above.
used_fs

['ps_calc_05_proba',
 'ps_calc_07_proba',
 'ps_calc_10_proba',
 'ps_calc_11_proba',
 'ps_calc_12_proba',
 'ps_calc_13_proba',
 'ps_calc_14_proba',
 'ps_car_11_proba',
 'ps_car_12_proba',
 'ps_car_13_proba',
 'ps_car_14_proba',
 'ps_car_15_proba',
 'ps_ind_01_proba',
 'ps_ind_03_proba',
 'ps_ind_14_proba',
 'ps_ind_15_proba',
 'ps_reg_01_proba',
 'ps_reg_02_proba',
 'ps_reg_03_proba',
 'ind_bin_proba',
 'car_cat_proba1',
 'car_cat_proba2',
 'ind_cat_proba']

In [9]:
# Keep only the selected feature columns, in the same order, in both tables.
train = train.loc[:, used_fs]
test = test.loc[:, used_fs]

In [10]:
# Check the shapes of both tables after the column selection.
train.shape, test.shape

((595212, 23), (892816, 23))

<a id='sec4'></a>
# Resample Train Set
(<a href='#sec0'>back to top</a>)

In [11]:
# Feature matrix from the selected train columns; labels from the `target`
# column of the original train file.
X, y = np.array(train), np.array(train_ori['target'])

for caption, arr in (('X shape: ', X), ('y shape: ', y)):
    print(caption, arr.shape)

X shape:  (595212, 23)
y shape:  (595212,)


In [12]:
%%time
# Tomek Links to denoise majority
# NOTE: this step is switched off (`if False`), so nothing below runs and
# X_tl / y_tl are never created. The resampling cell that follows reads X and y
# directly, so enabling this branch alone would not change the training data.
if False:
    tl = TomekLinks(n_jobs=8, ratio='majority')
    X_tl, y_tl = tl.fit_sample(X, y)

    # Report the sample count and the class0/class1 counts before and after
    print('Before tomek links: size of X: ', X.shape)
    print('After  tomek links: size of X: ', X_tl.shape)
    print('Before tomek links: class0/class1 = %d / %d' % (len(y)-np.sum(y), np.sum(y)))
    print('After  tomek links: class0/class1 = %d / %d' % (len(y_tl)-np.sum(y_tl), np.sum(y_tl)))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


In [13]:
# Set targets for the number of samples in each class.
# Class 1 (minority) is doubled by SMOTE and class 0 (majority) is cut down to
# that same count, so the final balance is class0:class1 = 1:1.
#
# Both samplers are stochastic. A fixed seed makes the resampled train set, and
# therefore the trained model and the submission file, reproducible on re-run.
RESAMPLE_SEED = 42

num_class1 = np.sum(y)
num_class1_to_resample = 2 * num_class1
num_class0_to_resample = int(1 * num_class1_to_resample)

# NOTE(review): newer imbalanced-learn releases rename `ratio` to
# `sampling_strategy` and `fit_sample` to `fit_resample`; the calls below keep
# the API this notebook was written against — update them when upgrading.

# First, randomly undersample the majority
rus = RandomUnderSampler(ratio={0: num_class0_to_resample, 1: num_class1},
                         random_state=RESAMPLE_SEED)
X_tlrus, y_tlrus = rus.fit_sample(X, y)

# Then use SMOTE to oversample the minority
smote = SMOTE(ratio={0: num_class0_to_resample, 1: num_class1_to_resample},
              random_state=RESAMPLE_SEED, n_jobs=4)
X_res, y_res = smote.fit_sample(X_tlrus, y_tlrus)

In [15]:
# Print Resampling Results
print('Before Resampling: size of X: ', X.shape)
print('After  Resampling: size of X: ', X_res.shape)
print('Before Resampling: class0/class1 =%7d/%6d' % (len(y)-np.sum(y), np.sum(y)))
print('After  Resampling: class0/class1 =%7d/%6d' % (len(y_res)-np.sum(y_res), np.sum(y_res)))

Before Resampling: size of X:  (595212, 23)
After  Resampling: size of X:  (86776, 23)
Before Resampling: class0/class1 = 573518/ 21694
After  Resampling: class0/class1 =  43388/ 43388


<a id='sec5'></a>
# Train XGBoost, predict probabilities, save to a file
(<a href='#sec0'>back to top</a>)

In [17]:
%%time
# Train XGBoost on the resampled train set.
# The fit is chained into the assignment so the cell displays nothing besides
# the timing, as before.
clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=12,
    min_child_weight=4,
    gamma=14,
    subsample=0.85,
    n_jobs=8,
).fit(X_res, y_res)

CPU times: user 3min 15s, sys: 215 ms, total: 3min 15s
Wall time: 24.9 s


In [18]:
# Test features as a plain array, plus the `id` column of the original test
# file, which labels each prediction row in the submission.
X_test, y_ids = np.array(test), np.array(test_ori['id'])

In [19]:
# Check that the feature matrix and the id vector have the same number of rows.
X_test.shape, y_ids.shape

((892816, 23), (892816,))

In [20]:
# Predicted class probabilities for every test row; column 1 is what gets
# written out as `target` further down.
y_probas = clf.predict_proba(X_test)

In [21]:
# Check the shape of the probability array (one row per test sample).
y_probas.shape

(892816, 2)

In [22]:
# Submission table: one row per test id with the second probability column as
# `target`; `columns=` pins the column order to id, target.
prediction_result = pd.DataFrame(
    {'id': y_ids, 'target': y_probas[:, 1]},
    columns=['id', 'target'],
)

In [23]:
# Preview the first 10 rows of the submission table.
prediction_result.head(10)

Unnamed: 0,id,target
0,0,0.260449
1,1,0.302411
2,2,0.224499
3,3,0.163185
4,4,0.371811
5,5,0.331246
6,6,0.147253
7,8,0.287701
8,10,0.462865
9,11,0.374953


In [24]:
# Write the submission CSV (header row, no index column).
# The output directory is created first so that a missing folder cannot make
# the write fail after the whole pipeline has already run.
submission_path = './submissions/porto_seguro_submission_RH9.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
prediction_result.to_csv(submission_path, index=False, header=True)