In [1]:
import pandas as pd
import numpy as np
import sys, os

import matplotlib.pyplot as plt
from sklearn import datasets
import scipy
from matplotlib.colors import ListedColormap
from functools import partial

In [2]:
# Seed NumPy's global random generator; get_random_train_test shuffles with
# np.random.shuffle, which draws from this global state.
np.random.seed(5)               # Fixed seed value for the whole notebook.

In [3]:
# Maps a dataset's position in the X / Y lists to its short name.
# NOTE: this rebinds the name `datasets`, hiding the `sklearn.datasets`
# module imported in the first cell.
datasets = {
    0: 'tic_tac',
    1: 'adult',
    2: 'cloud',
    3: 'my_dota',
    4: 'uci_data',
}
# Seeds chosen via random die roll.
random_seeds = [
    3, 6, 1, 5, 2,
    2, 1, 5, 2, 6,
]

In [4]:
# Load the cloud dataset sample from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
cloud_data = pd.read_pickle("./data/cloud_data_sample.pkl")  

In [5]:
# Keep a random sample of 500 rows; random_state pins the draw so it is repeatable.
cloud_data = cloud_data.sample(n=500, random_state=22)

In [6]:
# Load the UCI Dota pickle and keep a fixed-seed random sample of 500 rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
uci_dota = (
    pd.read_pickle("./data/uci_dota_sample.pkl")
    .sample(n=500, random_state=22)
)

In [7]:
# Load the custom Dota pickle and keep a fixed-seed random sample of 500 rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
my_dota = (
    pd.read_pickle("./data/my_dota.pkl")
    .sample(n=500, random_state=22)
)

In [8]:
# Load the adult dataset and recode its 'class' column into two numeric labels.
adult_data = pd.read_csv("./data/adult_data.csv")    
# Step 1: keep entries equal to the value stored at index label 0; replace every
# other entry with 1.
adult_data['class'] = adult_data['class'].where(adult_data['class'] == adult_data['class'][0], 1)
# Step 2: keep the 1s produced by step 1; everything else (the entries that
# matched the value at label 0) becomes -1.
# NOTE(review): assumes the original value at label 0 is not itself 1 — TODO confirm.
adult_data['class'] = adult_data['class'].where(adult_data['class'] == 1, -1)
# Keep a fixed-seed random sample of 500 rows.
adult_data= adult_data.sample(n=500, random_state=22)

In [9]:
# Load the tic-tac-toe table from a local pickle; no subsampling is applied in this cell.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
tic_tac_data = pd.read_pickle("./data/tic_tac_data.pkl")  
# Display the first rows as the cell output.
tic_tac_data.head()

Unnamed: 0,x_won,t_l_b,t_l_o,t_l_x,t_m_b,t_m_o,t_m_x,t_r_b,t_r_o,t_r_x,...,m_r_x,b_l_b,b_l_o,b_l_x,b_m_b,b_m_o,b_m_x,b_r_b,b_r_o,b_r_x
0,1,0,0,1,0,0,1,0,0,1,...,0,0,0,1,0,1,0,0,1,0
1,1,0,0,1,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
2,1,0,0,1,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
3,1,0,0,1,0,0,1,0,0,1,...,0,0,1,0,1,0,0,1,0,0
4,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0


In [10]:
# Feature tables (label column dropped), one per dataset, ordered to match the
# integer keys of the `datasets` lookup.
X = [
    tic_tac_data.drop(columns='x_won'),
    adult_data.drop(columns='class'),
    cloud_data.drop(columns='binary_class'),
    my_dota.drop(columns='team_won'),
    uci_dota.drop(columns='team_won'),
]


In [11]:
# Label columns, one (n_rows, 1) float array per dataset, in the same order as X.
# The builtin `float` is used because the `np.float` alias (which simply pointed
# at builtin float) was deprecated in NumPy 1.20 and removed in NumPy 1.24.
Y = [
    frame[label_column].values.reshape(-1, 1).astype(float)
    for frame, label_column in (
        (tic_tac_data, 'x_won'),
        (adult_data, 'class'),
        (cloud_data, 'binary_class'),
        (my_dota, 'team_won'),
        (uci_dota, 'team_won'),
    )
]

In [12]:
def get_random_train_test(prop_test):
    """Shuffle every dataset and split it into a train and a test portion.

    Reads the notebook-level ``datasets`` lookup and the ``X`` / ``Y`` lists,
    and shuffles with NumPy's global random state (``np.random.shuffle``).

    Args:
        prop_test: Fraction of each dataset's rows placed in the test portion.

    Returns:
        A tuple ``(X_train, Y_train, X_test, Y_test)`` of lists with one entry
        per key of ``datasets``. Feature entries are 2-D arrays; label entries
        are 1-D arrays taken from the last stacked column.
    """
    X_train, Y_train, X_test, Y_test = [], [], [], []

    for i in datasets.keys():
        # Stack features and labels side by side so that a single row shuffle
        # keeps every label attached to its feature row.
        X_and_Y = np.hstack((X[i], Y[i]))
        np.random.shuffle(X_and_Y)

        X_shuffled = X_and_Y[:, :-1]
        Y_shuffled = X_and_Y[:, -1]

        # Round away floating-point noise before truncating: 500 * (1 - 0.8)
        # evaluates to 99.99999999999997, which plain int() would cut to 99.
        # Genuinely fractional row counts are still floored, as before.
        pivot = int(round(X[i].shape[0] * (1 - prop_test), 8))

        X_train.append(X_shuffled[:pivot])
        Y_train.append(Y_shuffled[:pivot])
        X_test.append(X_shuffled[pivot:])
        Y_test.append(Y_shuffled[pivot:])

    return X_train, Y_train, X_test, Y_test

    

In [59]:
import math
import seaborn as sns       
import numbers
import copy

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



# Number of cross-validation folds; get_knn_parm uses it to size the
# n_neighbors grid relative to the rows available in each training fold.
cv_num = 5

In [14]:
def plot_grid_search_validation_curve(grid, param_to_vary,
                                      title='Validation Curve', ylim=None,
                                      xlim=None, log=None):
    """Plot train and cross-validation scores of a fitted grid search along one parameter.

    All other parameters are held at the values in ``grid.best_params_`` while
    ``param_to_vary`` sweeps over its full range from ``grid.param_grid``.

    Args:
        grid: Fitted search object exposing ``cv_results_``, ``param_grid`` and
            ``best_params_``. ``cv_results_`` must contain the
            ``mean_train_score`` / ``std_train_score`` columns.
        param_to_vary: Name of the parameter (a key of ``grid.param_grid``)
            shown on the x axis.
        title: Figure title.
        ylim: Optional ``(low, high)`` y-axis limits; defaults to ``(0.0, 1.1)``.
        xlim: Optional ``(low, high)`` x-axis limits.
        log: If truthy, draw with ``plt.semilogx`` instead of ``plt.plot``.

    Returns:
        None. The figure is drawn on the current pyplot figure and shown.
    """

    df_cv_results = pd.DataFrame(grid.cv_results_)
    train_scores_mean = df_cv_results['mean_train_score']
    valid_scores_mean = df_cv_results['mean_test_score']
    train_scores_std = df_cv_results['std_train_score']
    valid_scores_std = df_cv_results['std_test_score']

    # Recover the searched parameters from the 'param_<name>' result columns and
    # look up each one's candidate values in the original grid.
    param_cols = [c for c in df_cv_results.columns if c[:6] == 'param_']
    param_ranges = [grid.param_grid[p[6:]] for p in param_cols]
    param_ranges_lengths = [len(pr) for pr in param_ranges]

    # Reshape the flat per-candidate scores into one axis per parameter.
    # NOTE(review): assumes the result rows are ordered as the full Cartesian
    # product of the parameters in param_cols order — TODO confirm.
    train_scores_mean = np.array(train_scores_mean).reshape(*param_ranges_lengths)
    valid_scores_mean = np.array(valid_scores_mean).reshape(*param_ranges_lengths)
    train_scores_std = np.array(train_scores_std).reshape(*param_ranges_lengths)
    valid_scores_std = np.array(valid_scores_std).reshape(*param_ranges_lengths)

    param_to_vary_idx = param_cols.index('param_{}'.format(param_to_vary))

    # Build an index that keeps the whole axis of the varied parameter and pins
    # every other axis at the position of its best value.
    # NOTE(review): relies on grid.best_params_ iterating in the same order as
    # param_cols — TODO confirm.
    slices = []
    for idx, param in enumerate(grid.best_params_):
        if (idx == param_to_vary_idx):
            slices.append(slice(None))
            continue
        best_param_val = grid.best_params_[param]
        idx_of_best_param = 0
        # ndarray has no .index(); convert to a list first.
        if isinstance(param_ranges[idx], np.ndarray):
            idx_of_best_param = param_ranges[idx].tolist().index(best_param_val)
        else:
            idx_of_best_param = param_ranges[idx].index(best_param_val)
        slices.append(idx_of_best_param)

    train_scores_mean = train_scores_mean[tuple(slices)]
    valid_scores_mean = valid_scores_mean[tuple(slices)]
    train_scores_std = train_scores_std[tuple(slices)]
    valid_scores_std = valid_scores_std[tuple(slices)]

    # Start from a clean current figure.
    plt.clf()

    plt.title(title)
    plt.xlabel(param_to_vary)
    plt.ylabel('Score')

    if (ylim is None):
        plt.ylim(0.0, 1.1)
    else:
        plt.ylim(*ylim)

    if (not (xlim is None)):
        plt.xlim(*xlim)

    lw = 2

    plot_fn = plt.plot
    if log:
        plot_fn = plt.semilogx

    param_range = param_ranges[param_to_vary_idx]
    # Non-numeric candidate values are converted to strings before plotting.
    if (not isinstance(param_range[0], numbers.Number)):
        param_range = [str(x) for x in param_range]
    # Mean curves with a shaded +/- one standard deviation band.
    plot_fn(param_range, train_scores_mean, label='Training score', color='r',
            lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color='r', lw=lw)
    plot_fn(param_range, valid_scores_mean, label='Cross-validation score',
            color='b', lw=lw)
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1,
                     color='b', lw=lw)

    plt.legend(loc='lower right')

    plt.show()

In [15]:


# Calculate the misclassification rate given predicted labels and true labels.
def calc_error(Y_pred, Y):
    """Return the fraction of predictions that differ from the true labels.

    The previous body ignored both parameters and read the notebook globals
    ``y_predicted`` and ``Y_test[0]`` instead, so it either raised NameError or
    scored unrelated data.

    Args:
        Y_pred: Predicted labels (array-like).
        Y: True labels (array-like) with the same number of elements.

    Returns:
        The mean of the element-wise mismatch indicator, as a float32 scalar.
    """
    # Flatten both inputs so an (n, 1) column compared with an (n,) vector does
    # not broadcast into an (n, n) matrix.
    predicted = np.ravel(np.asarray(Y_pred))
    actual = np.ravel(np.asarray(Y))
    return np.array(predicted != actual).astype(np.float32).mean()

[1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

In [83]:
# KNN model: standardise the features, then classify with k-nearest neighbours.
pipe_knn = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
# Logarithm base used by get_knn_parm; np.arange(1, k_max) there yields
# k_max - 1 candidate values.
k_max = 10

def get_knn_parm(n):
    """Build the KNN parameter grid for a dataset with ``n`` rows.

    Reads the notebook-level ``cv_num`` and ``k_max``. The ``n_neighbors``
    candidates are ``i ** exponent`` truncated to int for ``i`` in
    ``1 .. k_max - 1``, where ``exponent`` is the base-``k_max`` logarithm of
    ``(n / cv_num) * (cv_num - 1)`` (presumably the number of rows in the
    training part of each cross-validation fold).

    Args:
        n: Number of rows in the data the grid will be fitted on.

    Returns:
        Dict with keys ``'knn__n_neighbors'`` (integer ndarray) and
        ``'knn__p'`` (the list ``[1, 2, 3]``).
    """
    fold_train_rows = (n / cv_num) * (cv_num-1)
    exponent = math.log(fold_train_rows, k_max)
    bases = np.arange(1, k_max)
    neighbor_counts = np.power(bases, exponent).astype(int)
    return {'knn__n_neighbors': neighbor_counts, 'knn__p': [1, 2, 3]}


In [84]:
# SVM model: standardise the features, then fit a support vector classifier.
pipe_svm = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('svm', SVC()),
])
# Assigned again here with the same value as in the KNN cell.
k_max = 10

def get_svm_parm(n):
    """Build the SVM parameter grid.

    Args:
        n: Unused; accepted so the signature matches get_knn_parm.

    Returns:
        Dict with ``'svm__C'`` (the ten powers of ten from 10**-7 to 10**2)
        and ``'svm__kernel'`` (``['linear', 'rbf']``).
    """
    exponents = np.arange(-7, 3, dtype=float)
    c_values = []
    for exponent in exponents:
        c_values.append(10**exponent)
    return {'svm__C': c_values, 'svm__kernel': ['linear', 'rbf']}

In [85]:
# Template registry: for each classifier, its pipeline, a function that builds
# its parameter grid from a row count, and empty lists that collect the fitted
# grids and the per-trial scores. The '*_auc' lists receive the values produced
# by test() and by the grid search's 'accuracy' scoring.
classifiers = {
    label: {
        'pipe': pipeline,
        'params': grid_builder,
        'grid_list': [],
        'test_auc': [],
        'train_auc': [],
        'val_auc': [],
    }
    for label, pipeline, grid_builder in (
        ('KNN', pipe_knn, get_knn_parm),
        ('SVM', pipe_svm, get_svm_parm),
    )
}
            


In [86]:
def train(x,y, model):
    """Grid-search a classifier's pipeline on the given training data.

    Args:
        x: Training features; ``x.shape[0]`` is handed to the model's grid
            builder.
        y: Training labels.
        model: Dict with ``'pipe'`` (the estimator to tune) and ``'params'``
            (a callable mapping a row count to a parameter grid).

    Returns:
        Whatever ``GridSearchCV.fit`` returns (the fitted search object).
    """
    clf = GridSearchCV(estimator=model['pipe'],
                  param_grid=model['params'](x.shape[0]),
                  # Use the shared fold count: get_knn_parm sizes its
                  # n_neighbors grid from cv_num, so a separately hard-coded
                  # value here could silently drift out of sync with it.
                  cv=cv_num,
                  return_train_score=True,n_jobs=5, pre_dispatch = '2*n_jobs',  refit=True,scoring = 'accuracy')

    return clf.fit(x, y)

In [87]:
def test(x_test,y_test, model):
    """Score a fitted model on held-out data.

    Args:
        x_test: Held-out features passed to ``model.predict``.
        y_test: True labels for ``x_test``.
        model: Fitted object exposing ``predict``.

    Returns:
        The accuracy of the model's predictions on ``x_test``.
    """
    y_pred  = model.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred); the arguments used to
    # be passed the other way round. Accuracy is symmetric, so the score does
    # not change, but the call now follows the documented order.
    return accuracy_score(y_test, y_pred)
    

In [88]:
# NOTE(review): this rebinds the builtin name `dict` at notebook scope to the
# result of copy.deepcopy(dict). Nothing in the visible cells uses the result;
# it looks like a leftover and is a candidate for deletion — TODO confirm.
dict = copy.deepcopy(dict)

In [89]:
# Test-set fractions to evaluate; each value is stored under 'partion' in the
# results dict and later passed to get_random_train_test as prop_test.
partions = [0.2,0.5,0.8]

In [90]:
def new_results_dic(partions, datasets, classifiers):
    """Create an empty results structure for every partition and dataset.

    Args:
        partions: Sequence of partition values.
        datasets: Mapping of dataset index to dataset name.
        classifiers: Template dict; each dataset gets its own deep copy so
            results never share list objects.

    Returns:
        Dict keyed by the partition's position (0-based). Each value has
        ``'partion'`` (the partition value) and ``'datasets'``, which maps a
        dataset name to ``{'index': ..., 'classifiers': ...}``.
    """
    return {
        position: {
            'partion': part,
            'datasets': {
                name: {'index': index, 'classifiers': copy.deepcopy(classifiers)}
                for index, name in datasets.items()
            },
        }
        for position, part in enumerate(partions)
    }
        
    

In [91]:
def update_avrg_auc_over_n_trials(n, results_dic):
    """Run ``n`` shuffle/split trials for one partition entry and record scores.

    For every trial the data is re-split with get_random_train_test, then each
    classifier of each dataset is tuned with train() and scored with test().
    Results are appended in place to the classifier's ``'test_auc'``,
    ``'train_auc'``, ``'val_auc'`` and ``'grid_list'`` lists.

    Args:
        n: Number of trials.
        results_dic: One partition entry, i.e. a dict with ``'partion'`` and
            ``'datasets'`` as built by new_results_dic.

    Returns:
        None; ``results_dic`` is mutated.
    """
    test_fraction = results_dic['partion']
    for trial in range(n):
        print('\ttrial', trial)
        X_train, Y_train, X_test, Y_test = get_random_train_test(test_fraction)
        for dataset_name, dataset_entry in results_dic['datasets'].items():
            print('\t\tdataset', dataset_name)
            position = dataset_entry['index']
            for classifier_name, record in dataset_entry['classifiers'].items():
                print('\t\t\ttunning ', classifier_name)
                grid = train(X_train[position], Y_train[position], record)
                held_out_score = test(X_test[position], Y_test[position], grid)
                record['test_auc'].append(held_out_score)
                # Mean CV scores of the best parameter combination.
                best = grid.best_index_
                record['train_auc'].append(grid.cv_results_['mean_train_score'][best])
                record['val_auc'].append(grid.cv_results_['mean_test_score'][best])

                record['grid_list'].append(grid)
    
        

In [94]:
def run_core_loop(n_trials=3):
    """Run the whole experiment: every partition, dataset and classifier.

    Builds a fresh results structure from the notebook-level ``partions``,
    ``datasets`` and ``classifiers``, then fills it partition by partition.

    Args:
        n_trials: Number of shuffle/split trials per partition. Defaults to 3,
            the value that used to be hard-coded.

    Returns:
        The populated results dict produced by new_results_dic.
    """
    results_dic = new_results_dic(partions, datasets, classifiers)
    for part in results_dic.values():
        print('partion', part['partion'])
        update_avrg_auc_over_n_trials(n_trials, part)
    return results_dic
        

In [None]:
# Run the full experiment. This fits one grid search per
# partition x trial x dataset x classifier, so the cell can take a long time.
results_dic = run_core_loop()

partion 0.2
	trial 0
		dataset tic_tac
			tunning  KNN


In [33]:
from statistics import mean 


In [58]:
# Print the mean test / train / validation score of every classifier on every
# dataset, grouped by train/test partition.
for part in results_dic.values():
    # Round instead of truncating: 100 * (1 - 0.8) evaluates to
    # 19.999999999999996, which int() would print as 19 rather than 20.
    train_pct = round(100 * (1 - part['partion']))
    test_pct = round(100 * part['partion'])
    print('partion', train_pct, '/', test_pct)
    for dataset_name, dataset_entry in part['datasets'].items():
        print('\tdataset', dataset_name)
        for classifier_name, record in dataset_entry['classifiers'].items():
            print('\t\t', classifier_name)
            print('\t\t\ttest_auc', mean(record['test_auc']))
            print('\t\t\ttrain_auc', mean(record['train_auc']))
            print('\t\t\tval_auc', mean(record['val_auc']))


partion 80 / 20
	dataset tic_tac
		 KNN
			test_auc 0.75
			train_auc 0.9307013615669215
			val_auc 0.7885097473332767
	dataset adult
		 KNN
			test_auc 0.7466666666666667
			train_auc 0.7575000000000001
			val_auc 0.7541666666666667
	dataset cloud
		 KNN
			test_auc 0.7466666666666667
			train_auc 0.775625
			val_auc 0.7683333333333334
	dataset my_dota
		 KNN
			test_auc 0.54
			train_auc 1.0
			val_auc 0.5599999999999999
	dataset uci_data
		 KNN
			test_auc 0.52
			train_auc 0.5758333333333333
			val_auc 0.555
partion 50 / 50
	dataset tic_tac
		 KNN
			test_auc 0.7849686847599165
			train_auc 0.8905669785320568
			val_auc 0.8183406432748538
	dataset adult
		 KNN
			test_auc 0.7506666666666667
			train_auc 0.757
			val_auc 0.7493333333333333
	dataset cloud
		 KNN
			test_auc 0.7506666666666667
			train_auc 0.785
			val_auc 0.7693333333333334
	dataset my_dota
		 KNN
			test_auc 0.49866666666666665
			train_auc 0.646
			val_auc 0.552
	dataset uci_data
		 KNN
			test_auc 0.46666666666666

In [None]:
#plot_grid_search_validation_curve(grid, 'knn__p', log=False, ylim=(.4, 1.02))
#plot_grid_search_validation_curve(grid, 'knn__n_neighbors', log=True, ylim=(.4, 1.02))
#parm = classifiers['KNN']['params'](X_train[0].shape[0])
#scores = grid.cv_results_['mean_test_score'].reshape(len(parm['knn__n_neighbors']),len(parm['knn__p']))
#ax = sns.heatmap(scores