In [1]:
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools


import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, accuracy_score

from sklearn.pipeline import make_pipeline

# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression

In [2]:
NUM_TRIALS = 3
FOLDS = 5
DATA_SIZE = 500 #5000

In [3]:
#  Helpers
class Set:
    def __init__(self, X, y):
        self.X = X
        self.y = y

In [4]:
# Import data set
adults = pd.read_csv('data/adults/adult.data', header=None)

# rename columns
adults.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'male', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', '50k_plus']

In [5]:
# Set binary columns with 1s and 0s
adults['male'] = (adults['male'] == 'Male').astype(int)
# adults.rename(columns={'sex':'male'}, inplace=True)

adults['50k_plus'] = (adults['50k_plus'] == ' >50K').astype(int)

# Rename target column
adults.rename(columns={'50k_plus':'y'}, inplace=True)

# Pandas One Hot Encoding
adults = pd.get_dummies(adults, columns=['workclass', 'education', 'marital_status', 'occupation','relationship', 'race', 'native_country'])

In [6]:
# check for nan or null
adults.isna().values.any() # OK!!

False

In [7]:
# organise continuous value columns ahead of binary column
cont_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# for col in cont_cols:
#     plt.figure()
#     plt.title(col)
#     plt.hist(adults[col])

col_names = adults.columns

mask = np.isin(col_names, cont_cols, invert=True)
non_cont_cols = col_names[mask]

reordered_cols = np.concatenate((cont_cols, non_cont_cols))
adults = adults[reordered_cols]


# Normalize continuous values
col_names = adults.columns

features = adults[col_names]

ct = ColumnTransformer([
    ('adult_continuous', preprocessing.StandardScaler(), cont_cols)
], remainder='passthrough')

X_scaled = ct.fit_transform(features)
adults_processed = pd.DataFrame(X_scaled, columns=col_names)

In [8]:
def get_training_test_sets(data, training_size=1000, pred_col='y'):
    data_shuffled = data.sample(frac=1).reset_index(drop=True)
    data_shuffled_y = pd.DataFrame(data_shuffled[pred_col])
    data_shuffled_X = data_shuffled.drop(pred_col, 1)
    tr_X = data_shuffled_X.iloc[:training_size, :].to_numpy()
    tr_y = data_shuffled_y.iloc[:training_size, :].values.ravel()
    tst_X = data_shuffled_X.iloc[training_size:, :].to_numpy()
    tst_y = data_shuffled_y.iloc[training_size:, :].values.ravel()

    training = Set(tr_X, tr_y)
    testing = Set(tst_X, tst_y)
    
    return training, testing

In [9]:
%%time

p_grid = {'C': [1,10,100,1000], 'gamma': [0.001,0.01,0.1,1.0]}

for i in range(NUM_TRIALS):
    
    training_set, testing_set = get_training_test_sets(adults_processed, DATA_SIZE, pred_col='y')
    
    print('Trial {}...'.format(i))
    trial_results = []
    outer_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)
    best_p = []
    best_score = []
    
    for tr_i, tst_i in outer_cv.split(training_set.X):
        X_train, X_test = training_set.X[tr_i, :], training_set.X[tst_i, :]
        y_train, y_test = training_set.y[tr_i], training_set.y[tst_i]

        inner_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)
    
        svm = SVC(kernel='rbf')

        search = GridSearchCV(
            estimator=svm,
            param_grid=p_grid,
            cv=inner_cv,
            verbose=0,
            scoring='accuracy',
            n_jobs=-1,
            refit=True
        )
        
        result = search.fit(X_train, y_train)
        
        model = result.best_estimator_
        
        y_pred = model.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        
        best_p.append(result.best_params_)
        best_score.append(acc)
        trial_results.append(acc)
        
        print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
        

    print('Accuracy: %.3f (%.3f)' % (np.mean(trial_results), np.std(trial_results)))
    
    print(max(best_score))
    run = best_score.index(max(best_score))
    best_best_p = best_p[run]
    
    oo_svm = SVC()
    print('Training Master: %s' % best_best_p)
    oo_svm.set_params(**best_best_p)
    oo_svm.fit(training_set.X,training_set.y)
    
    print('Check Acc on Entire set')
    y_pred = oo_svm.predict(testing_set.X)
    acc = accuracy_score(testing_set.y, y_pred)
    print('Final Acc = %.3f \n' % acc)
    

Trial 0...
>acc=0.820, est=0.835, cfg={'C': 1, 'gamma': 0.1}
>acc=0.820, est=0.843, cfg={'C': 100, 'gamma': 0.01}
>acc=0.810, est=0.838, cfg={'C': 10, 'gamma': 0.01}
>acc=0.900, est=0.823, cfg={'C': 10, 'gamma': 0.01}
>acc=0.840, est=0.830, cfg={'C': 10, 'gamma': 0.01}
Accuracy: 0.838 (0.032)
0.9
Training Master: {'C': 10, 'gamma': 0.01}
Check Acc on Entire set
Final Acc = 0.845 

Trial 1...
>acc=0.870, est=0.835, cfg={'C': 10, 'gamma': 0.01}
>acc=0.830, est=0.840, cfg={'C': 100, 'gamma': 0.001}
>acc=0.850, est=0.830, cfg={'C': 1000, 'gamma': 0.001}
>acc=0.820, est=0.858, cfg={'C': 10, 'gamma': 0.01}
>acc=0.870, est=0.845, cfg={'C': 10, 'gamma': 0.01}
Accuracy: 0.848 (0.020)
0.87
Training Master: {'C': 10, 'gamma': 0.01}
Check Acc on Entire set
Final Acc = 0.839 

Trial 2...
>acc=0.840, est=0.830, cfg={'C': 100, 'gamma': 0.001}
>acc=0.810, est=0.857, cfg={'C': 10, 'gamma': 0.01}
>acc=0.830, est=0.835, cfg={'C': 10, 'gamma': 0.01}
>acc=0.860, est=0.847, cfg={'C': 100, 'gamma': 0.001}
>a