# Preliminary Classification
1. Import Data
2. Encode binary and categorical data
3. Select features based on previous analysis
4. Try RandomForest and XGBoost

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
%matplotlib inline

In [2]:
from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)

# Import Training Data

In [3]:
# Read the training table; the first row of the CSV holds the column names.
train = pd.read_csv('train.csv', header=0)

# Columns from position 2 onwards are treated as features and grouped by the
# tag embedded in their name ('_bin', '_cat', or neither).
all_fs = train.columns[2:]
binary_fs = sorted(name for name in all_fs if '_bin' in name)
categorical_fs = sorted(name for name in all_fs if '_cat' in name)
other_fs = sorted(name for name in all_fs
                  if not (name in binary_fs or name in categorical_fs))

# Report how many features fall into each group.
for template, group in (("# total of features: %8d", all_fs),
                        ("# of binary features: %7d", binary_fs),
                        ("# of categorical features: %1d", categorical_fs),
                        ("# of other features: %8d", other_fs)):
    print(template % len(group))

# total of features:       57
# of binary features:      17
# of categorical features: 14
# of other features:       26


# Selecting Feature Space

### Encode categorical features

In [4]:
# Encode every categorical column with the project helper, then place the
# resulting frames side by side (column-wise) in a single DataFrame.
encoded_frames = [encode_my_categorical_labels(train[name]) for name in categorical_fs]
categoricals_encoded = pd.concat(encoded_frames, axis=1)

In [5]:
# Display the column labels of the combined encoded frame.
categoricals_encoded.columns

Index(['ps_car_01_cat_NaN', 'ps_car_01_cat_0', 'ps_car_01_cat_1',
       'ps_car_01_cat_2', 'ps_car_01_cat_3', 'ps_car_01_cat_4',
       'ps_car_01_cat_5', 'ps_car_01_cat_6', 'ps_car_01_cat_7',
       'ps_car_01_cat_8',
       ...
       'ps_ind_04_cat_0', 'ps_ind_04_cat_1', 'ps_ind_05_cat_NaN',
       'ps_ind_05_cat_0', 'ps_ind_05_cat_1', 'ps_ind_05_cat_2',
       'ps_ind_05_cat_3', 'ps_ind_05_cat_4', 'ps_ind_05_cat_5',
       'ps_ind_05_cat_6'],
      dtype='object', length=184)

### Define select features based on nb02-nb03 analysis

In [6]:
# Lists of feature labels used to build the candidate feature sets.
# The order of the labels in each list fixes the column order of the frames
# sliced from them, so keep it stable.

# Binary columns taken directly from `train` (5 labels).
select_binary_fs = ['ps_ind_07_bin', 'ps_ind_10_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_17_bin']

# Encoded-categorical columns, in four nested sets of growing size:
# fs1 (8 labels) is contained in fs2 (20), which is contained in fs3 (39),
# which is contained in fs4 (61).
# NOTE(review): presumably each step loosens a selection threshold from the
# nb02-nb03 analysis -- confirm against those notebooks.
select_categorical_fs1 = ['ps_car_04_cat_5', 'ps_car_04_cat_7', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 
                          'ps_car_06_cat_8', 'ps_car_11_cat_18', 'ps_car_11_cat_41', 'ps_ind_05_cat_2'] 

select_categorical_fs2 = ['ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_04_cat_9', 
                          'ps_car_06_cat_17', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 
                          'ps_car_09_cat_1', 'ps_car_11_cat_18', 'ps_car_11_cat_21', 'ps_car_11_cat_4', 
                          'ps_car_11_cat_41', 'ps_car_11_cat_58', 'ps_car_11_cat_63', 'ps_car_11_cat_75', 
                          'ps_car_11_cat_93', 'ps_car_11_cat_97', 'ps_ind_05_cat_2', 'ps_ind_05_cat_6'] 

select_categorical_fs3 = ['ps_car_01_cat_9', 'ps_car_02_cat_0', 'ps_car_03_cat_1', 'ps_car_04_cat_3', 
                          'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_04_cat_9', 
                          'ps_car_06_cat_13', 'ps_car_06_cat_15', 'ps_car_06_cat_17', 'ps_car_06_cat_2', 
                          'ps_car_06_cat_5', 'ps_car_06_cat_8', 'ps_car_06_cat_9', 'ps_car_07_cat_0', 
                          'ps_car_09_cat_1', 'ps_car_09_cat_4', 'ps_car_11_cat_100', 'ps_car_11_cat_18', 
                          'ps_car_11_cat_21', 'ps_car_11_cat_3', 'ps_car_11_cat_33', 'ps_car_11_cat_4', 
                          'ps_car_11_cat_41', 'ps_car_11_cat_55', 'ps_car_11_cat_56', 'ps_car_11_cat_58', 
                          'ps_car_11_cat_61', 'ps_car_11_cat_63', 'ps_car_11_cat_69', 'ps_car_11_cat_71', 
                          'ps_car_11_cat_72', 'ps_car_11_cat_75', 'ps_car_11_cat_93', 'ps_car_11_cat_97', 
                          'ps_ind_05_cat_2', 'ps_ind_05_cat_4', 'ps_ind_05_cat_6']

select_categorical_fs4 = ['ps_car_01_cat_0', 'ps_car_01_cat_1', 'ps_car_01_cat_11', 'ps_car_01_cat_9', 
                          'ps_car_02_cat_0', 'ps_car_03_cat_1', 'ps_car_04_cat_1', 'ps_car_04_cat_2', 
                          'ps_car_04_cat_3', 'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 
                          'ps_car_04_cat_8', 'ps_car_04_cat_9', 'ps_car_06_cat_10', 'ps_car_06_cat_12', 
                          'ps_car_06_cat_13', 'ps_car_06_cat_15', 'ps_car_06_cat_16', 'ps_car_06_cat_17', 
                          'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 'ps_car_06_cat_9', 
                          'ps_car_07_cat_0', 'ps_car_08_cat_0', 'ps_car_09_cat_1', 'ps_car_09_cat_4', 
                          'ps_car_11_cat_100', 'ps_car_11_cat_104', 'ps_car_11_cat_13', 'ps_car_11_cat_17', 
                          'ps_car_11_cat_18', 'ps_car_11_cat_20', 'ps_car_11_cat_21', 'ps_car_11_cat_3', 
                          'ps_car_11_cat_33', 'ps_car_11_cat_4', 'ps_car_11_cat_41', 'ps_car_11_cat_45', 
                          'ps_car_11_cat_50', 'ps_car_11_cat_55', 'ps_car_11_cat_56', 'ps_car_11_cat_58', 
                          'ps_car_11_cat_61', 'ps_car_11_cat_63', 'ps_car_11_cat_69', 'ps_car_11_cat_71', 
                          'ps_car_11_cat_72', 'ps_car_11_cat_75', 'ps_car_11_cat_79', 'ps_car_11_cat_89', 
                          'ps_car_11_cat_90', 'ps_car_11_cat_93', 'ps_car_11_cat_94', 'ps_car_11_cat_97', 
                          'ps_ind_05_cat_1', 'ps_ind_05_cat_2', 'ps_ind_05_cat_4', 'ps_ind_05_cat_5', 
                          'ps_ind_05_cat_6']

# A separate set of 12 encoded-categorical labels; none of them appears in
# fs1-fs4.
# NOTE(review): "neg" presumably marks categories negatively associated with
# the target -- confirm against the nb02-nb03 analysis.
select_categorical_neg_fs = ['ps_car_02_cat_1', 'ps_car_04_cat_0', 'ps_car_04_cat_4', 'ps_car_07_cat_1', 
                          'ps_car_11_cat_19', 'ps_car_11_cat_32', 'ps_car_11_cat_39', 'ps_car_11_cat_43', 
                          'ps_car_11_cat_57', 'ps_car_11_cat_7', 'ps_car_11_cat_99', 'ps_ind_05_cat_0']

# Columns that are neither '_bin' nor '_cat', taken directly from `train`
# (10 labels).
select_other_fs = ['ps_calc_05', 'ps_calc_07', 'ps_calc_13', 'ps_calc_14',
                   'ps_car_12', 'ps_car_13', 'ps_ind_03', 'ps_ind_14', 'ps_reg_02', 'ps_reg_03']

In [7]:
# Slice the raw table and the encoded-categorical table down to the chosen
# columns, one frame per label list.
select_binaries = train.loc[:, select_binary_fs]
select_others = train.loc[:, select_other_fs]

(select_cats1,
 select_cats2,
 select_cats3,
 select_cats4,
 select_cats_neg) = (
    categoricals_encoded.loc[:, labels]
    for labels in (select_categorical_fs1,
                   select_categorical_fs2,
                   select_categorical_fs3,
                   select_categorical_fs4,
                   select_categorical_neg_fs)
)

In [8]:
def _with_base_features(*categorical_frames):
    """Build one candidate feature set.

    Columns are laid out as: `select_others`, `select_binaries`, each of
    `categorical_frames` in the order given, and finally `train.target`
    as the last column.
    """
    pieces = [select_others, select_binaries, *categorical_frames, train.target]
    return pd.concat(pieces, axis=1)


# Sets 1-4 use the four categorical selections on their own;
# sets 5-8 repeat them with the "neg" categorical columns appended.
select_feature1 = _with_base_features(select_cats1)
select_feature2 = _with_base_features(select_cats2)
select_feature3 = _with_base_features(select_cats3)
select_feature4 = _with_base_features(select_cats4)
select_feature5 = _with_base_features(select_cats1, select_cats_neg)
select_feature6 = _with_base_features(select_cats2, select_cats_neg)
select_feature7 = _with_base_features(select_cats3, select_cats_neg)
select_feature8 = _with_base_features(select_cats4, select_cats_neg)

In [9]:
# Report the (rows, columns) shape of every combined feature set.
combined_sets = (select_feature1, select_feature2, select_feature3, select_feature4,
                 select_feature5, select_feature6, select_feature7, select_feature8)
for number, frame in enumerate(combined_sets, start=1):
    print('selection%d feature space:' % number, frame.shape)

selection1 feature space: (595212, 24)
selection2 feature space: (595212, 36)
selection3 feature space: (595212, 55)
selection4 feature space: (595212, 77)
selection5 feature space: (595212, 36)
selection6 feature space: (595212, 48)
selection7 feature space: (595212, 67)
selection8 feature space: (595212, 89)


# Prepare Data for Classification

In [12]:
def drop_rows_with_missing(frame, missing_marker=-1):
    """Return a copy of `frame` without the rows that contain missing data.

    Every cell equal to `missing_marker` (in any column) is first turned into
    NaN, then every row holding at least one NaN is dropped.

    `np.nan` is used instead of the capitalised `np.NaN` alias because that
    alias was removed in NumPy 2.0 and raises AttributeError there.
    """
    return frame.replace({missing_marker: np.nan}).dropna()


# Remove missing data entries
selection1 = drop_rows_with_missing(select_feature1)
selection2 = drop_rows_with_missing(select_feature2)
selection3 = drop_rows_with_missing(select_feature3)
selection4 = drop_rows_with_missing(select_feature4)
selection5 = drop_rows_with_missing(select_feature5)
selection6 = drop_rows_with_missing(select_feature6)
selection7 = drop_rows_with_missing(select_feature7)
selection8 = drop_rows_with_missing(select_feature8)

# Print the sizes of the pruned feature sets
pruned_sets = (selection1, selection2, selection3, selection4,
               selection5, selection6, selection7, selection8)
for number, frame in enumerate(pruned_sets, start=1):
    print('selection%d feature space:' % number, frame.shape)

selection1 feature space: (487439, 24)
selection2 feature space: (487439, 36)
selection3 feature space: (487439, 55)
selection4 feature space: (487439, 77)
selection5 feature space: (487439, 36)
selection6 feature space: (487439, 48)
selection7 feature space: (487439, 67)
selection8 feature space: (487439, 89)


In [20]:
# Compare the class proportions of the raw target with those that remain
# after the rows with missing data were pruned.
pre_total = train.target.shape[0]
pre_class0 = int((train.target == 0).sum())
pre_class1 = pre_total - pre_class0

post_total = selection1.shape[0]
post_class0 = int((selection1.target == 0).sum())
post_class1 = post_total - post_class0

pre_shares = (100*pre_class0/pre_total, 100*pre_class1/pre_total)
post_shares = (100*post_class0/post_total, 100*post_class1/post_total)

print('Before: class0/class1 = %4.2f%%/%4.2f%%' % pre_shares)
print('After : class0/class1 = %4.2f%%/%4.2f%%' % post_shares)

Before: class0/class1 = 96.36%/3.64%
After : class0/class1 = 96.18%/3.82%


Pruning the entries that contain NaN (or -1) values barely changes the distribution of classes (class1 goes from 3.64% to 3.82%). **Note:** if any problem arises later, make sure the pruning was not biased against class1 (there are many class0 entries, so a bias against class0 shouldn't be a big issue).

In [21]:
# Pruned feature sets in evaluation order; each frame keeps the target as
# its last column.
selections = [
    selection1,
    selection2,
    selection3,
    selection4,
    selection5,
    selection6,
    selection7,
    selection8,
]

# RandomForestClassifier

In [34]:
%%time
# Fit and evaluate one random forest per feature selection.
rstate = 55   # shared seed for the train/test split and the forest
reports = []  # classification report of every selection, in order
for number, selection in enumerate(selections, start=1):
    # The target is the last column of every selection; the rest are features.
    X = selection.iloc[:, :-1]
    y = selection.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rstate)
    # Seed the forest as well: without random_state each run grows different
    # trees, so the reports below could not be reproduced.
    rfc = RandomForestClassifier(max_depth=10, n_estimators=100, n_jobs=8,
                                 random_state=rstate)
    rfc.fit(X_train, y_train)

    y_pred = rfc.predict(X_test)
    report = classification_report(y_test, y_pred, digits=4)
    reports.append(report)  # keep the text so selections can be compared later
    print('======== selection %d ========' % number)
    print(report)

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488

CPU times: user 9min 32s, sys: 1.62 s, total: 9min 33s
Wall time: 1min 21s


  'precision', 'predicted', average, warn_for)


# Gradient Boost Tree

In [36]:
%%time
# Fit and evaluate one gradient-boosted tree model per feature selection.
rstate = 55   # shared seed for the train/test split and the booster
reports = []  # classification report of every selection, in order
for number, selection in enumerate(selections, start=1):
    # The target is the last column of every selection; the rest are features.
    X = selection.iloc[:, :-1]
    y = selection.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rstate)
    # Pass the seed explicitly so the run does not depend on the library default.
    clf = XGBClassifier(max_depth=10, n_estimators=100, n_jobs=8,
                        random_state=rstate)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, digits=4)
    reports.append(report)  # keep the text so selections can be compared later
    print('======== selection %d ========' % number)
    print(report)

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     1.0000    0.0003    0.0005      3726

avg / total     0.9633    0.9618    0.9431     97488

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.5000    0.0003    0.0005      3726

avg / total     0.9441    0.9618    0.9431     97488

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.5000    0.0005    0.0011      3726

avg / total     0.9441    0.9618    0.9431     97488

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000 

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488

             precision    recall  f1-score   support

          0     0.9618    1.0000    0.9805     93762
          1     0.0000    0.0000    0.0000      3726

avg / total     0.9250    0.9618    0.9430     97488

CPU times: user 24min, sys: 2.71 s, total: 24min 3s
Wall time: 3min 10s


## Conclusion
Clearly, both classifiers perform poorly given the huge imbalance in the class distribution (XGBoost marginally better?): they predict class0 for virtually every test entry. Next, try SMOTE (synthetic minority over-sampling) on class1 and random under-sampling (RUS, possibly with Tomek links), separately or simultaneously, to raise the fraction of class1 during training. Testing should still be done on the imbalanced set. Another approach would be to go for anomaly detection instead of classification, but I'd need to learn how to do so first.