# Preliminary Classification
1. Import Data
2. Encode binary and categorical data
3. select features based on previous analysis
4. Try RandomForest and XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
%matplotlib inline

In [2]:
from feature_processing import (create_contingency_table,
                                calculate_conditional_prob_bin,
                                encode_my_categorical_labels,
                                calculate_conditional_prob_cat,
                                estimate_cond_prob_density,
                                bin_myFeature)

# Import Training Data

In [3]:
train = pd.read_csv('train.csv', header=0)

all_fs = train.columns[2:]
binary_fs = sorted([f for f in all_fs if '_bin' in f])
categorical_fs = sorted([f for f in all_fs if '_cat' in f])
other_fs = sorted([f for f in all_fs
            if f not in binary_fs
            if f not in categorical_fs])

print("# total of features: %8d" % len(all_fs))
print("# of binary features: %7d" % len(binary_fs))
print("# of categorical features: %1d" % len(categorical_fs))
print("# of other features: %8d" % len(other_fs))

# total of features:       57
# of binary features:      17
# of categorical features: 14
# of other features:       26


# Selecting Feature Space

### Encode categorical features

In [4]:
categoricals_encoded = []
for i, fs in enumerate(categorical_fs):
    categoricals_encoded.append(encode_my_categorical_labels(train[fs]))
categoricals_encoded = pd.concat(categoricals_encoded, axis=1)

In [5]:
binaries = train[binary_fs]

In [7]:
others = train[other_fs]
others.loc[:, 'ps_car_12'] = bin_myFeature(others.loc[:, 'ps_car_12'], 0, 1.5, bins=20)[1]
others.loc[:, 'ps_car_13'] = bin_myFeature(others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
others.loc[:, 'ps_car_13'] = bin_myFeature(others.loc[:, 'ps_car_13'], 0, 4, bins=50)[1]
others.loc[:, 'ps_car_14'] = bin_myFeature(others.loc[:, 'ps_car_14'], 0, 4, bins=40)[1]
others.loc[:, 'ps_reg_03'] = bin_myFeature(others.loc[:, 'ps_reg_03'], 0, 5, bins=50)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
for col in others.columns:
    print('%s: # unique values=%d' % (col, len(others[col].unique())))

ps_calc_01: # unique values=10
ps_calc_02: # unique values=10
ps_calc_03: # unique values=10
ps_calc_04: # unique values=6
ps_calc_05: # unique values=7
ps_calc_06: # unique values=11
ps_calc_07: # unique values=10
ps_calc_08: # unique values=11
ps_calc_09: # unique values=8
ps_calc_10: # unique values=26
ps_calc_11: # unique values=20
ps_calc_12: # unique values=11
ps_calc_13: # unique values=14
ps_calc_14: # unique values=24
ps_car_11: # unique values=5
ps_car_12: # unique values=12
ps_car_13: # unique values=42
ps_car_14: # unique values=7
ps_car_15: # unique values=15
ps_ind_01: # unique values=8
ps_ind_03: # unique values=12
ps_ind_14: # unique values=5
ps_ind_15: # unique values=14
ps_reg_01: # unique values=10
ps_reg_02: # unique values=19
ps_reg_03: # unique values=35


In [9]:
trainset_digitized = pd.concat([others, binaries, categoricals_encoded, train.target], axis=1)

In [10]:
trainset_digitized.head(10)

Unnamed: 0,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,...,ps_ind_04_cat_1,ps_ind_05_cat_NaN,ps_ind_05_cat_0,ps_ind_05_cat_1,ps_ind_05_cat_2,ps_ind_05_cat_3,ps_ind_05_cat_4,ps_ind_05_cat_5,ps_ind_05_cat_6,target
0,0.6,0.5,0.2,3,1,10,1,10,1,5,...,1,0,1,0,0,0,0,0,0,0
1,0.3,0.1,0.3,2,1,9,5,8,1,7,...,0,0,1,0,0,0,0,0,0,0
2,0.5,0.7,0.1,2,2,9,1,8,2,7,...,1,0,1,0,0,0,0,0,0,0
3,0.6,0.9,0.1,2,4,7,1,8,4,2,...,0,0,1,0,0,0,0,0,0,0
4,0.4,0.6,0.0,2,2,6,3,10,2,12,...,1,0,1,0,0,0,0,0,0,0
5,0.7,0.8,0.4,3,1,8,2,11,3,8,...,0,0,1,0,0,0,0,0,0,0
6,0.2,0.6,0.5,2,2,8,1,8,3,10,...,1,0,1,0,0,0,0,0,0,0
7,0.1,0.5,0.1,1,2,7,1,6,1,13,...,0,0,1,0,0,0,0,0,0,0
8,0.9,0.8,0.6,3,1,7,3,9,4,11,...,1,0,1,0,0,0,0,0,0,0
9,0.7,0.8,0.8,2,2,8,2,9,1,11,...,0,0,1,0,0,0,0,0,0,1


In [11]:
if True:
    
# Combined the above DataFrames to create different sets of features
    trainset_digitized.to_csv('./data/trainset_digitized.csv', index=False)
    
    # print the sizes of the feature space
    print('trainset_digitized  feature space:', trainset_digitized.shape)

trainset_digitized  feature space: (595212, 228)
