In [1]:
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
from scipy.io import arff
import matplotlib.pyplot as plt
import itertools


import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score

from sklearn.pipeline import make_pipeline

In [2]:
# Training variables (at the top for quick access).
NUM_TRIALS = 3  # repetitions of the shuffle/split/search cycle per classifier and data set
FOLDS = 5  # n_splits used for both the outer and the inner KFold
DATA_SIZE = 5000  # passed as training_size to get_training_test_sets; the remaining rows form the test set

In [3]:
#  Helpers
class Set:
    """Plain container that keeps a feature matrix and its targets together.

    Attributes are stored exactly as given:
      X -- the features
      y -- the targets
    """

    def __init__(self, X, y):
        self.X, self.y = X, y
        
def get_training_test_sets(data, training_size=1000, pred_col='y', random_state=None):
    """Shuffle ``data`` and split it into a training and a testing ``Set``.

    The target column is separated from the feature columns, then the first
    ``training_size`` shuffled rows become the training set and every
    remaining row becomes the testing set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the target column.
    training_size : int, default 1000
        Number of shuffled rows assigned to the training set.
    pred_col : str, default 'y'
        Name of the target column.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        original unseeded shuffle; pass a seed for a reproducible split.

    Returns
    -------
    (Set, Set)
        ``(training, testing)``; ``X`` is a 2-D array of features and ``y``
        a 1-D array of targets.
    """
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    targets = shuffled[pred_col].to_numpy()
    # Keyword form: the positional ``axis`` argument of ``drop`` was
    # deprecated and later removed from pandas.
    features = shuffled.drop(columns=pred_col).to_numpy()

    training = Set(features[:training_size], targets[:training_size])
    testing = Set(features[training_size:], targets[training_size:])

    return training, testing

In [4]:
def clean_data(raw_data,
               column_names=None,
               binary_cols=None,
               one_hot_cols=None,
               continuous_cols=None,
              ):
    """Label, encode and standardise a raw data frame.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Input frame. It is never modified; all work happens on a copy.
    column_names : list of str, optional
        New labels for the columns. When omitted, no processing is done and
        ``raw_data`` itself is returned.
    binary_cols : list, optional
        Columns to turn into 0/1 integers. Each entry is either a column
        name, in which case the first value found in that column maps to 0
        and everything else to 1, or a ``(column, value)`` tuple, in which
        case ``value`` maps to 0 and everything else to 1.
    one_hot_cols : list of str, optional
        Columns expanded with ``pd.get_dummies``.
    continuous_cols : list of str, optional
        Columns standardised with ``StandardScaler``. They are moved to the
        front of the returned frame; the other columns follow in their
        existing order.

    Returns
    -------
    pandas.DataFrame
        The processed frame, or ``raw_data`` when ``column_names`` is None.
    """
    if column_names is None:
        print('No columns names, returning raw data.')
        return raw_data

    # Copy first: relabelling and binarising would otherwise leak into the
    # caller's frame and make a second call behave differently.
    final_data = raw_data.copy()
    final_data.columns = column_names

    if binary_cols is not None:
        for col in binary_cols:
            if isinstance(col, tuple):
                ind, match = col[0], col[1]
            else:
                ind = col
                # No explicit reference value: the first observed value is class 0.
                match = final_data[col].unique()[0]
            final_data[ind] = (final_data[ind] != match).astype(int)

    if one_hot_cols is not None:
        # pandas >= 2.0 emits bool indicator columns by default; pin the
        # legacy uint8 dtype so the frame stays purely numeric.
        final_data = pd.get_dummies(final_data, columns=one_hot_cols, dtype=np.uint8)

    if final_data.isna().values.any():
        print('Warning!: missing data')

    if continuous_cols is not None:
        continuous = list(continuous_cols)
        continuous_lookup = set(continuous)
        not_continuous = [c for c in final_data.columns if c not in continuous_lookup]

        # ColumnTransformer outputs the transformed columns first and the
        # passthrough remainder after them, so put the frame in that order.
        reordered_cols = continuous + not_continuous
        final_data = final_data[reordered_cols]

        # Normalize
        # NOTE(review): the scaler is fitted on every row passed in, so a
        # later train/test split shares scaling statistics across both parts.
        ct = ColumnTransformer([
            ('continuous', StandardScaler(), continuous)
        ], remainder='passthrough')

        scaled = ct.fit_transform(final_data)
        final_data = pd.DataFrame(scaled, columns=reordered_cols)

    return final_data

In [5]:
# Import data sets (none of the CSV files carries a header row).
adults_raw = pd.read_csv('data/adults/adult.data', header=None)

# Only the first element of loadarff's result is turned into a frame.
eye_arff = arff.loadarff('data/eeg_eye/EEG_Eye_State.arff')
eyes_raw = pd.DataFrame(eye_arff[0])

covertype_raw = pd.read_csv('data/covertype/covtype.data', header=None)


# --- Adult: pre-processing spec ---
adult_process_params = dict(
    column_names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'y',
    ],
    binary_cols=['sex', 'y'],
    one_hot_cols=[
        'workclass', 'education', 'marital_status', 'occupation',
        'relationship', 'race', 'native_country',
    ],
    continuous_cols=[
        'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
        'hours_per_week',
    ],
)

# --- EEG eye state: every column except the target is continuous ---
_eeg_channels = [
    'AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1',
    'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4',
]
eyes_process_params = dict(
    column_names=_eeg_channels + ['y'],
    binary_cols=['y'],
    continuous_cols=list(_eeg_channels),
)

# --- Cover type: 10 continuous columns, 4 wilderness flags, 40 soil flags, target ---
_cov_continuous = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
_wilderness_areas = ['Wilderness_Area_{}'.format(k) for k in range(4)]
_soil_types = ['Soil_Type{}'.format(k) for k in range(40)]

covtyp_process_params = dict(
    column_names=_cov_continuous + _wilderness_areas + _soil_types + ['y'],
    # The target is binarised against cover type 2 (lodgepole pine, per the original note).
    binary_cols=[('y', 2)],
    continuous_cols=list(_cov_continuous),
)

# Run every raw frame through the shared cleaning routine.
adults = clean_data(adults_raw, **adult_process_params)
eyes = clean_data(eyes_raw, **eyes_process_params)
pines = clean_data(covertype_raw, **covtyp_process_params)

all_data = [adults, eyes, pines]

In [6]:
# Rule-of-thumb max_features for RandomForest: n_features / 3
for frame in all_data:
    feature_count = frame.shape[1] - 1  # all columns but one (the target 'y')
    print(feature_count / 3)

35.666666666666664
4.666666666666667
18.0


In [7]:
%%time

# Candidate models. Each entry carries a display name, the estimator class
# (instantiated later with default arguments) and the grid searched for it.
classifiers = [
    dict(
        name='SVC',
        method=SVC,
        p_grid=dict(
            C=[1, 10, 100, 1000],
            gamma=[0.001, 0.01, 0.1, 1.0],
        ),
    ),
    dict(
        name='RandomForestClassifier',
        method=RandomForestClassifier,
        p_grid=dict(
            n_estimators=[100],
            max_features=['sqrt', 2, 5, 8, 16, 18, 20, 36],
            max_depth=[10, 100, None],
        ),
    ),
    dict(
        name='GaussianNB',
        method=GaussianNB,
        p_grid=dict(
            var_smoothing=[1.0e-5, 1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9],
        ),
    ),
]


def _clip_max_features(p_grid, n_features):
    """Cap integer ``max_features`` candidates in ``p_grid`` at ``n_features``.

    Grids without a ``max_features`` entry are returned unchanged.
    Non-integer candidates (e.g. ``'sqrt'``) are kept as they are. Duplicates
    produced by the capping are dropped, keeping the original candidate order.
    """
    if 'max_features' not in p_grid:
        return p_grid

    capped = []
    for candidate in p_grid['max_features']:
        if isinstance(candidate, (int, np.integer)) and candidate > n_features:
            candidate = n_features
        if candidate not in capped:
            capped.append(candidate)

    return dict(p_grid, max_features=capped)


# Nested cross-validation: for every classifier and data set, NUM_TRIALS times
#   1. draw a fresh shuffled training/testing split,
#   2. run an outer KFold over the training part, tuning with GridSearchCV
#      (inner KFold) on each outer-train fold and scoring the refit winner on
#      the outer-test fold,
#   3. refit the configuration with the best outer-fold accuracy on the whole
#      training part and score it on the held-out testing part.
# NOTE(review): the split in step 1 is drawn without a seed, so the printed
# numbers differ from run to run.
for classifier in classifiers:
    print('========================')
    print('Starting: {}'.format(classifier['name']))
    print('========================\n')

    clf = classifier['method']()
    p_grid = classifier['p_grid']

    for d_i, data_set in enumerate(all_data):
        # The grid is shared by all data sets, but some of them have fewer
        # feature columns than the largest integer max_features candidate.
        # Such values can never sample more than every feature (and older
        # scikit-learn releases reject them outright), so cap them here.
        n_features = data_set.shape[1] - 1  # every column except the target 'y'
        data_grid = _clip_max_features(p_grid, n_features)

        print('  Starting data set {}...'.format(d_i+1))
        print('  ~~~~~~~~~~~~~~~~~~~~~~~')

        for i in range(NUM_TRIALS):

            training_set, testing_set = get_training_test_sets(data_set, DATA_SIZE, pred_col='y')

            print('  Trial {}...'.format(i+1))
            outer_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)
            best_p = []      # winning configuration of each outer fold
            best_score = []  # outer-test accuracy of each outer fold

            for tr_i, tst_i in outer_cv.split(training_set.X):
                X_train, X_test = training_set.X[tr_i, :], training_set.X[tst_i, :]
                y_train, y_test = training_set.y[tr_i], training_set.y[tst_i]

                inner_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)

                search = GridSearchCV(
                    estimator=clf,
                    param_grid=data_grid,
                    cv=inner_cv,
                    verbose=0,
                    scoring='accuracy',
                    n_jobs=-1,
                    refit=True
                )

                result = search.fit(X_train, y_train)

                # refit=True: best_estimator_ has been retrained on all of X_train.
                model = result.best_estimator_

                y_pred = model.predict(X_test)

                acc = accuracy_score(y_test, y_pred)

                best_p.append(result.best_params_)
                best_score.append(acc)

                print('  >acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))

            print('  Accuracy: %.3f (%.3f)' % (np.mean(best_score), np.std(best_score)))

            # First outer fold with the highest accuracy decides the final configuration.
            run = int(np.argmax(best_score))
            best_best_p = best_p[run]

            oo_clf = classifier['method']()
            print('  Training Master: %s' % best_best_p)
            oo_clf.set_params(**best_best_p)
            oo_clf.fit(training_set.X,training_set.y)

            print('  Check Acc on Entire set')
            y_pred = oo_clf.predict(testing_set.X)
            acc = accuracy_score(testing_set.y, y_pred)
            print('  Final Acc = %.3f \n' % acc)
    

Starting: SVC

  Starting data set 1...
  ~~~~~~~~~~~~~~~~~~~~~~~
  Trial 1...
  >acc=0.853, est=0.855, cfg={'C': 10, 'gamma': 0.01}
  >acc=0.845, est=0.861, cfg={'C': 1000, 'gamma': 0.001}
  >acc=0.873, est=0.854, cfg={'C': 100, 'gamma': 0.001}
  >acc=0.840, est=0.861, cfg={'C': 1, 'gamma': 0.1}
  >acc=0.861, est=0.858, cfg={'C': 10, 'gamma': 0.01}
  Accuracy: 0.854 (0.012)
  Training Master: {'C': 100, 'gamma': 0.001}
  Check Acc on Entire set
  Final Acc = 0.851 

  Trial 2...
  >acc=0.852, est=0.854, cfg={'C': 10, 'gamma': 0.01}
  >acc=0.860, est=0.848, cfg={'C': 1000, 'gamma': 0.001}
  >acc=0.851, est=0.853, cfg={'C': 1000, 'gamma': 0.001}
  >acc=0.849, est=0.853, cfg={'C': 100, 'gamma': 0.001}
  >acc=0.850, est=0.856, cfg={'C': 10, 'gamma': 0.01}
  Accuracy: 0.852 (0.004)
  Training Master: {'C': 1000, 'gamma': 0.001}
  Check Acc on Entire set
  Final Acc = 0.851 

  Trial 3...
  >acc=0.849, est=0.855, cfg={'C': 1000, 'gamma': 0.001}
  >acc=0.874, est=0.848, cfg={'C': 1, 'gamma':

  Check Acc on Entire set
  Final Acc = 0.822 

  Trial 2...
  >acc=0.810, est=0.816, cfg={'max_depth': None, 'max_features': 18, 'n_estimators': 100}
  >acc=0.830, est=0.803, cfg={'max_depth': None, 'max_features': 2, 'n_estimators': 100}
  >acc=0.808, est=0.812, cfg={'max_depth': None, 'max_features': 16, 'n_estimators': 100}
  >acc=0.815, est=0.812, cfg={'max_depth': 100, 'max_features': 36, 'n_estimators': 100}
  >acc=0.813, est=0.812, cfg={'max_depth': 100, 'max_features': 8, 'n_estimators': 100}
  Accuracy: 0.815 (0.008)
  Training Master: {'max_depth': None, 'max_features': 2, 'n_estimators': 100}
  Check Acc on Entire set
  Final Acc = 0.813 

  Trial 3...
  >acc=0.832, est=0.810, cfg={'max_depth': None, 'max_features': 16, 'n_estimators': 100}
  >acc=0.797, est=0.821, cfg={'max_depth': 100, 'max_features': 20, 'n_estimators': 100}
  >acc=0.819, est=0.814, cfg={'max_depth': 100, 'max_features': 16, 'n_estimators': 100}
  >acc=0.827, est=0.816, cfg={'max_depth': None, 'max_featu