In [1]:
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools


import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score

from sklearn.pipeline import make_pipeline

In [2]:
# Training variables (kept at the top for quick access).
# NUM_TRIALS: how many times the experiment loop below is repeated; each trial
#             draws a fresh shuffled train/test split.
NUM_TRIALS = 3
# FOLDS: n_splits used for both the outer and the inner KFold splitter.
FOLDS = 5
# DATA_SIZE: passed as `training_size` to get_training_test_sets, i.e. the number
#            of shuffled rows kept for training; the rest become the test set.
#            (5000 is left here as an alternative setting.)
DATA_SIZE = 500 #5000

In [3]:
#  Helpers
class Set:
    """Plain container pairing a feature matrix ``X`` with its targets ``y``."""

    def __init__(self, X, y):
        # Both objects are stored exactly as given: no copy, no validation.
        self.X, self.y = X, y
        
def get_training_test_sets(data, training_size=1000, pred_col='y', random_state=None):
    """Shuffle ``data`` and split it into a training ``Set`` and a testing ``Set``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns together with the target column.
    training_size : int, default 1000
        Number of shuffled rows placed in the training set; every remaining
        row goes to the testing set.
    pred_col : str, default 'y'
        Name of the target column; it is removed from the feature matrix.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample``. With ``None`` the shuffle is driven
        by NumPy's global random state, exactly as before this parameter
        existed; pass an int for a repeatable split.

    Returns
    -------
    (Set, Set)
        ``(training, testing)``. Each ``Set`` has ``X`` as a 2-D NumPy array of
        features and ``y`` as a 1-D NumPy array of targets, row-aligned.
    """
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Name the axis explicitly: the positional form ``drop(pred_col, 1)`` was
    # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
    features = shuffled.drop(columns=pred_col)
    targets = shuffled[pred_col]

    training = Set(
        features.iloc[:training_size, :].to_numpy(),
        targets.iloc[:training_size].to_numpy().ravel(),
    )
    testing = Set(
        features.iloc[training_size:, :].to_numpy(),
        targets.iloc[training_size:].to_numpy().ravel(),
    )

    return training, testing

In [4]:
def clean_data(raw_data,
               column_names=None,
               binary_cols=None,
               one_hot_cols=None,
               continuous_cols=None,
              ):
    """Rename, encode and scale a raw table without modifying the input.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        The table to clean. It is never mutated; all work happens on a copy.
    column_names : list of str, optional
        New column labels. When ``None`` nothing is processed: a message is
        printed and ``raw_data`` itself is returned.
    binary_cols : list of str, optional
        Columns recoded to 0/1. The first value encountered in a column maps
        to 0 and every other value maps to 1, so the coding depends on row
        order.
    one_hot_cols : list of str, optional
        Columns expanded with ``pd.get_dummies``.
    continuous_cols : list of str, optional
        Columns standardised with ``StandardScaler``. When given, these
        columns are moved to the front of the returned frame and all other
        columns are passed through unscaled.

    Returns
    -------
    pandas.DataFrame
        The cleaned table (or ``raw_data`` unchanged when ``column_names`` is
        ``None``). Prints a warning if missing values are present after
        encoding.
    """
    if column_names is None:
        print('No columns names, returning raw data.')
        return raw_data

    # Copy first: assigning columns / recoding on the original object would
    # silently alter the caller's DataFrame.
    final_data = raw_data.copy()
    final_data.columns = column_names

    if binary_cols is not None:
        for col in binary_cols:
            # The first value seen becomes class 0; anything else becomes 1.
            match = final_data[col].unique()[0]
            final_data[col] = (final_data[col] != match).astype(int)

    if one_hot_cols is not None:
        final_data = pd.get_dummies(final_data, columns=one_hot_cols)

    if final_data.isna().values.any():
        print('Warning!: missing data')

    if continuous_cols is not None:
        continuous = list(continuous_cols)
        not_continuous = [c for c in final_data.columns if c not in continuous]

        # Put the continuous columns first so the frame's column order matches
        # the ColumnTransformer output (transformed columns, then remainder).
        reordered_cols = continuous + not_continuous
        final_data = final_data[reordered_cols]

        # Normalize
        ct = ColumnTransformer(
            [('continuous', StandardScaler(), continuous)],
            remainder='passthrough',
        )

        scaled = ct.fit_transform(final_data)
        # Keep the row labels of the input instead of resetting them.
        final_data = pd.DataFrame(scaled, columns=reordered_cols, index=final_data.index)

    return final_data

In [5]:
# Load the adult data file; header=None because column labels are supplied
# through clean_data below rather than read from the file.
adults_raw = pd.read_csv('data/adults/adult.data', header=None)

# Keyword arguments for clean_data describing how each column is handled.
adult_process_params = dict(
    column_names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'y',
    ],
    binary_cols=['sex', 'y'],
    one_hot_cols=[
        'workclass', 'education', 'marital_status', 'occupation',
        'relationship', 'race', 'native_country',
    ],
    continuous_cols=[
        'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
        'hours_per_week',
    ],
)

# Cleaned, encoded and scaled table used by the experiments that follow.
adults = clean_data(adults_raw, **adult_process_params)

In [6]:
%%time

# Hyper-parameter grid searched by the inner cross-validation loop.
p_grid = {'C': [1,10,100,1000], 'gamma': [0.001,0.01,0.1,1.0]}

for i in range(NUM_TRIALS):

    # The shuffle inside get_training_test_sets draws from NumPy's global
    # random state. Seed it with the trial index, the same seed the KFold
    # splitters below use, so every trial is repeatable on a fresh kernel.
    np.random.seed(i)
    training_set, testing_set = get_training_test_sets(adults, DATA_SIZE, pred_col='y')

    print('Trial {}...'.format(i))
    trial_results = []
    outer_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)
    best_p = []
    best_score = []

    # Outer loop: each fold is held out once to score the configuration
    # selected by the inner grid search on the remaining folds.
    for tr_i, tst_i in outer_cv.split(training_set.X):
        X_train, X_test = training_set.X[tr_i, :], training_set.X[tst_i, :]
        y_train, y_test = training_set.y[tr_i], training_set.y[tst_i]

        inner_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)

        svm = SVC(kernel='rbf')

        # Inner loop: exhaustive search over p_grid; refit=True retrains the
        # winning configuration on all of X_train.
        search = GridSearchCV(
            estimator=svm,
            param_grid=p_grid,
            cv=inner_cv,
            verbose=0,
            scoring='accuracy',
            n_jobs=-1,
            refit=True
        )

        result = search.fit(X_train, y_train)

        model = result.best_estimator_

        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)

        # best_score and trial_results both collect the outer-fold accuracy:
        # the former picks the final configuration, the latter is summarised.
        best_p.append(result.best_params_)
        best_score.append(acc)
        trial_results.append(acc)

        print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))

    print('Accuracy: %.3f (%.3f)' % (np.mean(trial_results), np.std(trial_results)))

    # Take the parameters from the outer fold with the highest held-out
    # accuracy (first one wins on ties).
    run = best_score.index(max(best_score))
    best_best_p = best_p[run]

    # Retrain on the full training set with those parameters; SVC() defaults
    # to the same 'rbf' kernel used during the search.
    oo_svm = SVC()
    print('Training Master: %s' % best_best_p)
    oo_svm.set_params(**best_best_p)
    oo_svm.fit(training_set.X,training_set.y)

    # Final score on the rows that were never part of training_set.
    print('Check Acc on Entire set')
    y_pred = oo_svm.predict(testing_set.X)
    acc = accuracy_score(testing_set.y, y_pred)
    print('Final Acc = %.3f \n' % acc)
    

Trial 0...
>acc=0.810, est=0.835, cfg={'C': 10, 'gamma': 0.01}
>acc=0.820, est=0.853, cfg={'C': 1000, 'gamma': 0.001}
>acc=0.860, est=0.845, cfg={'C': 100, 'gamma': 0.001}
>acc=0.790, est=0.843, cfg={'C': 10, 'gamma': 0.01}
>acc=0.850, est=0.840, cfg={'C': 10, 'gamma': 0.01}
Accuracy: 0.826 (0.026)
0.86
Training Master: {'C': 100, 'gamma': 0.001}
Check Acc on Entire set
Final Acc = 0.838 

Trial 1...
>acc=0.800, est=0.840, cfg={'C': 10, 'gamma': 0.01}
>acc=0.830, est=0.807, cfg={'C': 10, 'gamma': 0.01}
>acc=0.870, est=0.828, cfg={'C': 10, 'gamma': 0.01}
>acc=0.770, est=0.840, cfg={'C': 10, 'gamma': 0.01}
>acc=0.870, est=0.828, cfg={'C': 10, 'gamma': 0.01}
Accuracy: 0.828 (0.039)
0.87
Training Master: {'C': 10, 'gamma': 0.01}
Check Acc on Entire set
Final Acc = 0.833 

Trial 2...
>acc=0.810, est=0.823, cfg={'C': 1000, 'gamma': 0.001}
>acc=0.760, est=0.823, cfg={'C': 1000, 'gamma': 0.001}
>acc=0.770, est=0.832, cfg={'C': 10, 'gamma': 0.01}
>acc=0.800, est=0.802, cfg={'C': 100, 'gamma': 0