In [1]:
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools


import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression

In [2]:
NUM_TRIALS = 3
FOLDS = 5
DATA_SIZE = 5000 #5000

In [3]:
adults = pd.read_csv('data/adults/adult.data', header=None)

adults.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', '50k_plus']

# adults

In [4]:
adults['sex'] = (adults['sex'] == 'Male').astype(int)
adults.rename(columns={'sex':'male'}, inplace=True)

adults['50k_plus'] = (adults['50k_plus'] == ' >50K').astype(int)
adults.rename(columns={'50k_plus':'y'}, inplace=True)

adults = pd.get_dummies(adults, columns=['workclass', 'education', 'marital_status', 'occupation','relationship', 'race', 'native_country'])
# adults

In [5]:
# adults.columns.values
adults.isna().values.any() # OK!!

False

In [6]:
# adults

In [7]:
# continuous
# cont_cols = np.asarray(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'])
cont_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# for col in cont_cols:
#     plt.figure()
#     plt.title(col)
#     plt.hist(adults[col])
    
col_names = adults.columns

mask = np.isin(col_names, cont_cols, invert=True)
non_cont = col_names[mask]

all_col = np.concatenate((cont_cols, non_cont))
adults = adults[all_col]

col_names = adults.columns

features = adults[col_names]

ct = ColumnTransformer([
    ('adult_continuous', preprocessing.StandardScaler(), cont_cols)
], remainder='passthrough')


# scaler.mean_
X_scaled = ct.fit_transform(features)
adults_processed = pd.DataFrame(X_scaled, columns=col_names)
# adults_processed


In [8]:
adults_shuffle = adults_processed.sample(frac=1).reset_index(drop=True)
adults_shuffle_y = pd.DataFrame(adults_shuffle['y'])
adults_shuffle = adults_shuffle.drop('y', 1)

adults_training = adults_shuffle.iloc[:DATA_SIZE,:]
adults_training_y = adults_shuffle_y.iloc[:DATA_SIZE,:]
adults_testing = adults_shuffle.iloc[DATA_SIZE:,:]
adults_testing_y = adults_shuffle_y.iloc[DATA_SIZE:,:]

print(adults_training.shape)
print(adults_training_y.shape)
print(adults_testing.shape)
print(adults_testing_y.shape)
adults_training_y['y'].shape

(5000, 107)
(5000, 1)
(27561, 107)
(27561, 1)


(5000,)

In [9]:
%%time

features = adults_training.columns
X = adults_training.loc[:, features].to_numpy()
y = adults_training_y['y'].to_numpy()

p_grid = {'C': [1,10,100,1000], 'gamma': [0.001,0.01,0.1,1.0]}



non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

for i in range(NUM_TRIALS):
    # print('Trial {}...'.format(i))
    trial_results = []
    outer_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)
    best_p = []
    best_score = []
    
    for tr_i, tst_i in outer_cv.split(X):
        X_train, X_test = X[tr_i, :], X[tst_i, :]
        y_train, y_test = y[tr_i], y[tst_i]

        inner_cv = KFold(n_splits=FOLDS, shuffle=True, random_state=i)
    
        svm = SVC(kernel='rbf')

        search = GridSearchCV(
            estimator=svm,
            param_grid=p_grid,
            cv=inner_cv,
            verbose=0,
            scoring='accuracy',
            n_jobs=-1,
            refit=True
        )
        
        result = search.fit(X_train, y_train)
        
        model = result.best_estimator_
        
        y_pred = model.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        
        best_p.append(result.best_params_)
        best_score.append(acc)
        trial_results.append(acc)
        
        print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
        

    print('Accuracy: %.3f (%.3f)' % (np.mean(trial_results), np.std(trial_results)))
    
    print(max(best_score))
    run = best_score.index(max(best_score))
    best_best_p = best_p[run]
    
    oo_svm = SVC()
    print('Training Master: %s' % best_best_p)
    oo_svm.set_params(**best_best_p)
    oo_svm.fit(X,y)
    
    print('Check Acc on Entire set')
    y_pred = oo_svm.predict(adults_testing.to_numpy())
    acc = accuracy_score(adults_testing_y.to_numpy(), y_pred)
    print('Final Acc = %.3f \n' % acc)
    

>acc=0.864, est=0.856, cfg={'C': 100, 'gamma': 0.001}
>acc=0.868, est=0.853, cfg={'C': 1, 'gamma': 0.1}
>acc=0.849, est=0.860, cfg={'C': 100, 'gamma': 0.001}
>acc=0.858, est=0.854, cfg={'C': 100, 'gamma': 0.001}
>acc=0.853, est=0.856, cfg={'C': 1, 'gamma': 0.1}
Accuracy: 0.858 (0.007)
0.868
Training Master: {'C': 1, 'gamma': 0.1}
Check Acc on Entire set
Final Acc = 0.851 

>acc=0.844, est=0.857, cfg={'C': 1, 'gamma': 0.1}
>acc=0.856, est=0.861, cfg={'C': 1, 'gamma': 0.1}
>acc=0.866, est=0.853, cfg={'C': 100, 'gamma': 0.001}
>acc=0.850, est=0.855, cfg={'C': 1, 'gamma': 0.01}
>acc=0.853, est=0.855, cfg={'C': 10, 'gamma': 0.01}
Accuracy: 0.854 (0.007)
0.866
Training Master: {'C': 100, 'gamma': 0.001}
Check Acc on Entire set
Final Acc = 0.850 

>acc=0.859, est=0.854, cfg={'C': 10, 'gamma': 0.01}
>acc=0.845, est=0.858, cfg={'C': 100, 'gamma': 0.001}
>acc=0.859, est=0.856, cfg={'C': 10, 'gamma': 0.01}
>acc=0.849, est=0.858, cfg={'C': 10, 'gamma': 0.01}
>acc=0.863, est=0.857, cfg={'C': 1, 'ga

In [10]:
#     non_nested_scores[i] = clf.best_score_
#     print(clf.best_params_)
#     print(clf.best_estimator_)
#     best_model = clf.best_estimator_

#     cv_score = cross_val_score(
#         clf,
#         X=X,
#         y=y,
#         cv=outer_cv,
#         scoring='accuracy',
#         n_jobs=-1
#     )
# #     nested_scores[i] = nested_score.mean()
#     print(clf.best_params_)
#     print('Accuracy: %.3f (%.3f)' % (np.mean(cv_score), np.std(cv_score)))
    
# score_difference = non_nested_scores - nested_scores

# print("Average difference of {:6f} with std. dev. of {:6f}."
#       .format(score_difference.mean(), score_difference.std()))

# plt.figure()
# plt.subplot(211)
# non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
# nested_line, = plt.plot(nested_scores, color='b')
# plt.ylabel("score", fontsize="14")
# plt.legend([non_nested_scores_line, nested_line],
#            ["Non-Nested CV", "Nested CV"],
#            bbox_to_anchor=(1, 0, 0, .8)) #(0, .4, .5, 0))
# plt.title("Non-Nested and Nested Cross Validation on Iris Dataset",
#           x=.5, y=1.1, fontsize="15")

# # Plot bar chart of the difference.
# plt.subplot(212)
# difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
# plt.xlabel("Individual Trial #")
# plt.legend([difference_plot],
#            ["Non-Nested CV - Nested CV Score"],
#            bbox_to_anchor=(1, 0, 0, .8))
# plt.ylabel("score difference", fontsize="14")

# plt.show()

# for col in adults.columns:
#     print(col)
# adults.describe()

In [11]:
# nested_scores

In [12]:
# non_nested_scores

In [13]:
# KFold(n_splits=FOLDS, shuffle=True, random_state=i)

In [14]:
# class Set:
#     def __init__(self, X, y):
#         self.X = X
#         self.y = y

# def get_training_test_sets(data, training_size=1000, pred_col='y'):
#     data_shuffled = data.sample(frac=1).reset_index(drop=True)
#     data_shuffled_y = pd.DataFrame(data_shuffled[pred_col])
#     data_shuffled_X = data_shuffled.drop(pred_col, 1)
#     tr_X = data_shuffled_X.iloc[:training_size]
#     tr_y = data_shuffled_y.iloc[:training_size]
#     tst_X = data_shuffled_X.iloc[training_size:]
#     tst_y = data_shuffled_y.iloc[training_size:]

#     training = Set(tr_X, tr_y)
#     testing = Set(tst_X, tst_y)
    
#     return training, testing


# def get_folds(X, y, num_folds=5):
#     n_per_fold = int(len(X)/num_folds)
#     X_folds = []
#     y_folds = []
#     for i in range(num_folds):
#         start = i*n_per_fold
#         X_folds.append(X[start:start+n_per_fold])
#         y_folds.append(y[start:start+n_per_fold])
        
#     return X_folds, y_folds

# training_set, testing_set = get_training_test_sets(adults_processed, DATA_SIZE, pred_col='y')
# print(adults_processed.shape)
# print(training_set.X.shape, training_set.y.shape)
# print(testing_set.X.shape, testing_set.y.shape)

# x, y = get_folds(adults_training, adults_training_y, FOLDS)
# print(len(x), len(y))
# print(len(x[0]), len(y[0]))

In [15]:
# def train_svm_model(X, y, C=1.0, gamma=0.0001):
#     clf = SVC(C=C, kernel='rbf', gamma=gamma)
#     clf.fit(X, y)
#     return clf

# def test_model(model, test_set):
#     y_pred = model.predict(test_set.X)
#     return accuracy_score(test_set.y.values.ravel(), y_pred)

# params = {'C': [1,10,100,1000], 'gamma': [0.001,0.01,0.1,1.0]}

# def create_grid_params(params_dict):
#     all_params = [elements for i, elements in params_dict.items()]
#     params_grid = list(itertools.product(*all_params))
#     return params_grid
    
# p_grid = create_grid_params(params)
# print(p_grid)
# for p in p_grid:
#     print('using params:', *p)
#     model = train_svm_model(training_set.X, training_set.y.values.ravel(), *p)
#     acc = test_model(model, testing_set)
#     print(acc)
