# Preliminary Classification
1. Import Data
2. Encode binary and categorical data
3. select features based on previous analysis
4. Try RandomForest and XGBoost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
%matplotlib inline

In [3]:
from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)

# Import Training Data

In [4]:
train = pd.read_csv('train.csv', header=0)

all_fs = train.columns[2:]
binary_fs = sorted([f for f in all_fs if '_bin' in f])
categorical_fs = sorted([f for f in all_fs if '_cat' in f])
other_fs = sorted([f for f in all_fs
            if f not in binary_fs
            if f not in categorical_fs])

print("# total of features: %8d" % len(all_fs))
print("# of binary features: %7d" % len(binary_fs))
print("# of categorical features: %1d" % len(categorical_fs))
print("# of other features: %8d" % len(other_fs))

# total of features:       57
# of binary features:      17
# of categorical features: 14
# of other features:       26


# Selecting Feature Space

### Encode categorical features

In [5]:
categoricals_encoded = []
for i, fs in enumerate(categorical_fs):
    categoricals_encoded.append(encode_my_categorical_labels(train[fs]))
categoricals_encoded = pd.concat(categoricals_encoded, axis=1)

In [6]:
categoricals_encoded.columns

Index(['ps_car_01_cat_NaN', 'ps_car_01_cat_0', 'ps_car_01_cat_1',
       'ps_car_01_cat_2', 'ps_car_01_cat_3', 'ps_car_01_cat_4',
       'ps_car_01_cat_5', 'ps_car_01_cat_6', 'ps_car_01_cat_7',
       'ps_car_01_cat_8',
       ...
       'ps_ind_04_cat_0', 'ps_ind_04_cat_1', 'ps_ind_05_cat_NaN',
       'ps_ind_05_cat_0', 'ps_ind_05_cat_1', 'ps_ind_05_cat_2',
       'ps_ind_05_cat_3', 'ps_ind_05_cat_4', 'ps_ind_05_cat_5',
       'ps_ind_05_cat_6'],
      dtype='object', length=184)

### Define select features based on nb02-nb03 analysis

In [7]:
# Lists of feature labels
select_binary_fs = ['ps_ind_07_bin', 'ps_ind_10_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_17_bin']

select_categorical_fs1 = ['ps_car_04_cat_5', 'ps_car_04_cat_7', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 
                          'ps_car_06_cat_8', 'ps_car_11_cat_18', 'ps_car_11_cat_41', 'ps_ind_05_cat_2'] 

select_categorical_fs2 = ['ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_04_cat_9', 
                          'ps_car_06_cat_17', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 
                          'ps_car_09_cat_1', 'ps_car_11_cat_18', 'ps_car_11_cat_21', 'ps_car_11_cat_4', 
                          'ps_car_11_cat_41', 'ps_car_11_cat_58', 'ps_car_11_cat_63', 'ps_car_11_cat_75', 
                          'ps_car_11_cat_93', 'ps_car_11_cat_97', 'ps_ind_05_cat_2', 'ps_ind_05_cat_6'] 

select_other_fs = ['ps_calc_05', 'ps_calc_07', 'ps_calc_13', 'ps_calc_14',
                   'ps_car_12', 'ps_car_13', 'ps_ind_03', 'ps_ind_14', 'ps_reg_02', 'ps_reg_03']

In [8]:
# Create DataFrames based on the above lists
select_binaries = train[select_binary_fs]

select_cats1 = categoricals_encoded[select_categorical_fs1]
select_cats2 = categoricals_encoded[select_categorical_fs2]

In [16]:
select_others = train[select_other_fs]
select_others.loc[:, 'ps_car_12'] = bin_myFeature(select_others.loc[:, 'ps_car_12'], 0, 1.5, bins=20)[1]
select_others.loc[:, 'ps_car_13'] = bin_myFeature(select_others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
select_others.loc[:, 'ps_reg_03'] = bin_myFeature(select_others.loc[:, 'ps_reg_03'], 0, 5, bins=50)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [19]:
if False:
    
# Combined the above DataFrames to create different sets of features
    select_feature9 = pd.concat([select_others, select_binaries, select_cats1, train.target], axis=1)
    select_feature10 = pd.concat([select_others, select_binaries, select_cats2, train.target], axis=1)
    
    selection9 = select_feature9.replace({-1:np.NaN}).dropna()
    selection10 = select_feature10.replace({-1:np.NaN}).dropna()
    
    selection9.to_csv('./data/select_train_data9.csv', index=False)
    selection10.to_csv('./data/select_train_data10.csv', index=False)
    
    # print the sizes of the feature space
    print('selection9  feature space:', selection9.shape)
    print('selection10 feature space:', selection10.shape)

selection9  feature space: (595212, 24)
selection10 feature space: (595212, 36)
