In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
from sklearn.calibration import calibration_curve
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
import operator

In [4]:
def fillMissingData(df, time_fill=350):
    """Fill missing values in every column of ``df`` that contains any.

    Columns whose name contains the substring ``'time'`` are filled with the
    constant ``time_fill``; every other incomplete column is filled with that
    column's own mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    time_fill : number, optional
        Sentinel written into missing ``*time*`` cells (default 350, the value
        the notebook originally hard-coded).
        NOTE(review): presumably seconds of game time - confirm.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue  # column is complete, nothing to do
        fill_value = time_fill if 'time' in col else column.mean()
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: that chained in-place form
        # operates on a temporary selection and is not guaranteed to update
        # ``df`` (it is a no-op under pandas Copy-on-Write).
        df[col] = column.fillna(fill_value)
    return df

In [5]:
def prepareData(path='./features.csv'):
    """Load the training table and split it into features and labels.

    Parameters
    ----------
    path : str, optional
        CSV file to read; it must contain a ``match_id`` column, which is used
        as the index. Defaults to ``'./features.csv'``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The imputed feature frame with the label and the five other dropped
        columns removed, and the ``radiant_win`` labels in row order.
    """
    df = pd.read_csv(path, index_col='match_id')
    # ``.values[:]`` only creates a view of a NumPy array; take a real copy so
    # the labels cannot be altered by the imputation below.
    y = df['radiant_win'].values.copy()
    df = fillMissingData(df)
    # The label itself plus the other columns removed alongside it.
    # NOTE(review): presumably these describe the finished match and would
    # leak the outcome - confirm.
    outcome_columns = ['duration', 'radiant_win',
                       'tower_status_radiant', 'tower_status_dire',
                       'barracks_status_radiant', 'barracks_status_dire']
    return (df.drop(outcome_columns, axis=1), y)

In [6]:
def dataPick(data, n_heroes=None):
    """Replace the ten hero-id columns with a signed "bag of heroes" block.

    For each row a vector of length ``n_heroes`` is built in which position
    ``hero_id - 1`` is ``1`` if a radiant player (``r1_hero``..``r5_hero``)
    picked that hero and ``-1`` if a dire player (``d1_hero``..``d5_hero``)
    did. ``lobby_type`` and the ten hero columns are dropped and the pick block
    is appended to the right of the remaining columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``lobby_type`` and the ten ``*_hero`` columns with
        1-based integer hero ids.
    n_heroes : int, optional
        Width of the pick block. Defaults to the largest hero id found in
        ``data`` (the original behaviour). Pass the training value explicitly
        when encoding another data set so both matrices have the same width.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_remaining_columns + n_heroes)``.

    Raises
    ------
    ValueError
        If ``n_heroes`` is smaller than the largest hero id in ``data``.
    """
    radiant_cols = ['r%d_hero' % p for p in range(1, 6)]
    dire_cols = ['d%d_hero' % p for p in range(1, 6)]

    hero_ids = data[radiant_cols + dire_cols].values.astype(int)
    largest_id = int(hero_ids.max()) if hero_ids.size else 0
    if n_heroes is None:
        n_heroes = largest_id
    elif n_heroes < largest_id:
        raise ValueError('n_heroes=%d is smaller than the largest hero id %d'
                         % (n_heroes, largest_id))

    n_rows = data.shape[0]
    X_pick = np.zeros((n_rows, int(n_heroes)))
    rows = np.arange(n_rows)
    # One vectorised assignment per player slot. The radiant/dire interleaving
    # is kept so the "last write wins" order matches the per-row loop this
    # replaces (which was also accidentally executed twice).
    for p in range(5):
        X_pick[rows, hero_ids[:, p] - 1] = 1
        X_pick[rows, hero_ids[:, 5 + p] - 1] = -1

    res = data.drop(['lobby_type'] + radiant_cols + dire_cols, axis=1)
    return np.hstack((res.values, X_pick))

In [7]:
(train_data, train_labels) = prepareData()

In [8]:
X = dataPick(train_data)

In [9]:
y = train_labels

In [28]:
# Create classifiers.
# The estimators that draw random numbers are seeded so that re-running the
# notebook reproduces the same cross-validation scores.
lr = LogisticRegression(random_state=42)
gnb = GaussianNB()
svc = LinearSVC(C=1.0, random_state=42)
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Ordinary test

In [29]:
def testClassifiers(X, y, classifiers=None):
    """Cross-validate several classifiers and print their mean ROC AUC.

    Each classifier is fitted on a shuffled 5-fold split (seed 42) of ``X``/``y``
    and scored with ROC AUC on the held-out fold. The per-classifier mean over
    the folds is printed, best first.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexable by integer index arrays.
    y : numpy.ndarray
        Binary labels aligned with the rows of ``X``.
    classifiers : list of (estimator, name) pairs, optional
        Estimators to evaluate. Defaults to the notebook-level ``lr``, ``gnb``,
        ``svc`` and ``rfc`` objects, looked up when the function is called.

    Returns
    -------
    None
        Results are printed. The ``MRA`` label is kept from the original
        output - presumably "mean ROC AUC".
    """
    if classifiers is None:
        classifiers = [(lr, 'Logistic'),
                       (gnb, 'Naive Bayes'),
                       (svc, 'Support Vector Classification'),
                       (rfc, 'Random Forest')]

    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
    score = {}
    for clf, name in classifiers:
        fold_scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            if hasattr(clf, "predict_proba"):
                prob_pos = clf.predict_proba(X_test)[:, 1]
            else:
                # No probabilities available: rescale the decision values
                # into [0, 1]. Skip the rescaling when every value is equal,
                # which would otherwise divide by zero and produce NaNs.
                prob_pos = clf.decision_function(X_test)
                spread = prob_pos.max() - prob_pos.min()
                if spread > 0:
                    prob_pos = (prob_pos - prob_pos.min()) / spread

            fold_scores.append(roc_auc_score(y_test, prob_pos))
        score[name] = np.array(fold_scores).mean()

    ranking = sorted(score.items(), key=operator.itemgetter(1), reverse=True)
    for name, mean_auc in ranking:
        # Single pre-formatted string: prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("Classifier -  %s , MRA:  %s" % (name, mean_auc))

In [30]:
testClassifiers(X, y)

Classifier -  Naive Bayes , MRA:  0.707023094556
Classifier -  Random Forest , MRA:  0.696199849804
Classifier -  Logistic , MRA:  0.513447955418
Classifier -  Support Vector Classification , MRA:  0.497812213812


# Test with scale

In [10]:
# Standardise every column of X; the fitted scaler is kept so the same
# transform can be applied to the test matrix later.
# NOTE(review): the scaler is fitted on all rows of X before testClassifiers
# makes its own K-fold split, so held-out rows contribute to the scaling
# statistics of every fold - consider fitting the scaler inside each fold.
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

In [31]:
testClassifiers(X_scale, y)

Classifier -  Logistic , MRA:  0.752091473499
Classifier -  Support Vector Classification , MRA:  0.747740590681
Classifier -  Random Forest , MRA:  0.697400783161
Classifier -  Naive Bayes , MRA:  0.674894517214


### without fillna
1. Classifier -  Logistic , MRA:  0.751925110015
2. Classifier -  Support Vector Classification , MRA:  0.747484959226
3. Classifier -  Random Forest , MRA:  0.696883098439
4. Classifier -  Naive Bayes , MRA:  0.676329602573
### fillna 600
1. Classifier -  Logistic , MRA:  0.752091473499
2. Classifier -  Support Vector Classification , MRA:  0.746998109759
3. Classifier -  Random Forest , MRA:  0.696726010178
4. Classifier -  Naive Bayes , MRA:  0.674894517214

# PCA

In [34]:
pca = PCA(n_components=150)

In [35]:
X_PCA = pca.fit_transform(X)

In [37]:
testClassifiers(X_PCA, y)

Classifier -  Random Forest , MRA:  0.712578675283
Classifier -  Logistic , MRA:  0.712532039844
Classifier -  Naive Bayes , MRA:  0.709058603415
Classifier -  Support Vector Classification , MRA:  0.5801028208


# Parameters

In [32]:
# Eleven values of C, 1e-5 .. 1e5, for both penalties: 22 candidates. This is
# the grid the recorded search output was produced with, and it spans the
# same C range as gridSVM.
gridLogistic = {'C': np.power(10.0, np.arange(-5, 6)), 'penalty': ('l1', 'l2')}

In [33]:
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
gs = GridSearchCV(lr, gridLogistic, scoring='roc_auc', cv=kf, verbose=8, n_jobs=3)

In [34]:
gs.fit(X_scale, y)

Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    4.8s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   43.1s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  3.6min
[Parallel(n_jobs=3)]: Done 110 out of 110 | elapsed:  7.0min finished


[CV] penalty=l1, C=1e-05 .............................................
[CV] penalty=l1, C=1e-05 .............................................
[CV] penalty=l1, C=1e-05 .............................................
[CV] .................... penalty=l1, C=1e-05, score=0.500000 -   0.7s[CV] .................... penalty=l1, C=1e-05, score=0.500000 -   1.1s[CV] .................... penalty=l1, C=1e-05, score=0.500000 -   1.1s


[CV] penalty=l1, C=1e-05 .............................................
[CV] penalty=l2, C=1e-05 .............................................
[CV] penalty=l1, C=1e-05 .............................................
[CV] .................... penalty=l1, C=1e-05, score=0.500000 -   0.8s[CV] .................... penalty=l2, C=1e-05, score=0.710976 -   1.6s[CV] .................... penalty=l1, C=1e-05, score=0.500000 -   0.8s


[CV] penalty=l2, C=1e-05 .............................................
[CV] penalty=l2, C=1e-05 .............................................
[CV] p

GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'penalty': ('l1', 'l2'), 'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=8)

[CV] ................... penalty=l2, C=1000.0, score=0.751571 -   7.1s[CV] .................... penalty=l2, C=100.0, score=0.749778 -   6.3s


[CV] penalty=l1, C=10000.0 ...........................................
[CV] penalty=l1, C=10000.0 ...........................................
[CV] penalty=l1, C=1000.0 ............................................
[CV] .................. penalty=l1, C=10000.0, score=0.749347 -  18.7s[CV] .................. penalty=l1, C=10000.0, score=0.749779 -  20.5s[CV] ................... penalty=l1, C=1000.0, score=0.749347 -  18.9s


[CV] penalty=l1, C=10000.0 ...........................................
[CV] penalty=l2, C=10000.0 ...........................................
[CV] penalty=l1, C=1000.0 ............................................
[CV] .................. penalty=l1, C=10000.0, score=0.756164 -  22.0s[CV] .................. penalty=l2, C=10000.0, score=0.749348 -   8.8s[CV] ................... penalty=l1, C=1000.0, score=0.756164 -  23.6s


[CV] 

In [42]:
def gridScore2(gs, par1, par2):
    """Print the mean validation score of every grid point, best first.

    Parameters
    ----------
    gs : object
        Fitted grid search exposing ``grid_scores_``: an iterable of entries
        with a ``parameters`` mapping and a ``mean_validation_score`` number.
    par1, par2 : str
        Names of the two parameters used to label each printed line as
        ``<par1 value>:<par2 value>``.

    Returns
    -------
    None
        One line per distinct label is printed. ``MVC`` is the label used by
        the original output.
    """
    res = {}
    for entry in gs.grid_scores_:
        label = str(entry.parameters[par1]) + ":" + str(entry.parameters[par2])
        res[label] = entry.mean_validation_score

    ranking = sorted(res.items(), key=operator.itemgetter(1), reverse=True)
    for label, mean_score in ranking:
        # Single pre-formatted string: prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("parameters -  %s , MVC:  %s" % (label, mean_score))

In [37]:
gridScore2(gs, 'penalty', 'C')

parameters -  l1:0.1 , MVC:  0.752137533835
parameters -  l2:0.01 , MVC:  0.752129368151
parameters -  l1:1.0 , MVC:  0.752096639091
parameters -  l2:0.1 , MVC:  0.752096532303
parameters -  l1:10000.0 , MVC:  0.752092606693
parameters -  l1:10.0 , MVC:  0.752092515609
parameters -  l1:1000.0 , MVC:  0.752092500826
parameters -  l1:100.0 , MVC:  0.752092494492
parameters -  l1:100000.0 , MVC:  0.752092012482
parameters -  l2:1.0 , MVC:  0.752091470321
parameters -  l2:10.0 , MVC:  0.752090721452
parameters -  l2:100.0 , MVC:  0.752090710892
parameters -  l2:1000.0 , MVC:  0.752090700309
parameters -  l2:10000.0 , MVC:  0.752090695012
parameters -  l2:100000.0 , MVC:  0.752090695012
parameters -  l2:0.001 , MVC:  0.751795603984
parameters -  l1:0.01 , MVC:  0.751633088156
parameters -  l2:0.0001 , MVC:  0.743020815783
parameters -  l1:0.001 , MVC:  0.725082680993
parameters -  l2:1e-05 , MVC:  0.713362013007
parameters -  l1:0.0001 , MVC:  0.5
parameters -  l1:1e-05 , MVC:  0.5


In [38]:
# Candidate values of C for LinearSVC: powers of ten from 1e-5 to 1e5.
gridSVM = {'C': 10.0 ** np.arange(-5, 6)}

In [39]:
svc = LinearSVC()

In [40]:
gs = GridSearchCV(svc, gridSVM, scoring='roc_auc', cv=kf, verbose=8, n_jobs=3)

In [41]:
gs.fit(X_scale, y)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   11.9s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed: 11.5min
[Parallel(n_jobs=3)]: Done  55 out of  55 | elapsed: 23.1min finished


[CV] C=1e-05 .........................................................
[CV] C=1e-05 .........................................................
[CV] C=1e-05 .........................................................
[CV] ................................ C=1e-05, score=0.739169 -   1.7s[CV] ................................ C=1e-05, score=0.741416 -   2.0s[CV] ................................ C=1e-05, score=0.740410 -   2.0s


[CV] C=1e-05 .........................................................
[CV] C=1e-05 .........................................................
[CV] C=0.0001 ........................................................
[CV] ................................ C=1e-05, score=0.745182 -   1.8s[CV] ................................ C=1e-05, score=0.743085 -   1.8s[CV] ............................... C=0.0001, score=0.748848 -   1.8s


[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C

GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=8)

In [18]:
def gridScore1(gs, par1):
    """Print the mean validation score of every grid point, best first.

    Parameters
    ----------
    gs : object
        Fitted grid search exposing ``grid_scores_``: an iterable of entries
        with a ``parameters`` mapping and a ``mean_validation_score`` number.
    par1 : str
        Name of the parameter whose value labels each printed line.

    Returns
    -------
    None
        One line per distinct parameter value is printed. ``MVC`` is the
        label used by the original output.
    """
    res = {}
    for entry in gs.grid_scores_:
        res[str(entry.parameters[par1])] = entry.mean_validation_score

    ranking = sorted(res.items(), key=operator.itemgetter(1), reverse=True)
    for label, mean_score in ranking:
        # Single pre-formatted string: prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("parameter -  %s , MVC:  %s" % (label, mean_score))

In [47]:
gridScore1(gs, 'C')

parameter -  0.001 , MVC:  0.752098112662
parameter -  0.01 , MVC:  0.7520622093
parameter -  0.1 , MVC:  0.752055735624
parameter -  0.0001 , MVC:  0.75173579028
parameter -  1.0 , MVC:  0.748283322873
parameter -  1e-05 , MVC:  0.741852549586
parameter -  10.0 , MVC:  0.640394309524
parameter -  100.0 , MVC:  0.625369205022
parameter -  10000.0 , MVC:  0.604841347115
parameter -  100000.0 , MVC:  0.60425280191
parameter -  1000.0 , MVC:  0.593304559932


# Try the best

In [48]:
lr = LogisticRegression(penalty='l1', C=0.1)

In [49]:
lr.fit(X_scale, y)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
svc = LinearSVC(C=0.001)

In [51]:
svc.fit(X_scale, y)

LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

## Test Data

In [64]:
# Load the test features and run them through the imputation, hero-encoding
# and scaling helpers that were used on the training data.
# NOTE(review): fillMissingData imputes with df_test's own column means, not
# the means seen during training.
# NOTE(review): dataPick sizes the hero block from the largest hero id in the
# frame it is given, so X_test may end up with a different number of columns
# than the matrix the scaler was fitted on - confirm both sets share the same
# maximum hero id.
df_test = pd.read_csv('./features_test.csv', index_col='match_id')
df_test = fillMissingData(df_test)
X_test = dataPick(df_test)
X_test_scaled = scaler.transform(X_test)

In [65]:
y_pred = lr.predict_proba(X_test_scaled)

In [66]:
# predict_proba orders its columns like lr.classes_ (0, then 1), so the
# probability of a radiant win (label 1) is column 1 - the same column
# testClassifiers scores. Column 0 is the probability of a radiant loss.
df_test['radiant_win'] = y_pred[:, 1]

In [67]:
df_to_write = df_test['radiant_win']
df_to_write.to_csv('results_logical.csv', sep=',', encoding='utf-8', header=True)

In [68]:
# The SVC is scored through decision_function rather than predict_proba; the
# raw decision values are min-max rescaled into [0, 1] so they can be written
# out in the same form as the logistic-regression probabilities.
prob_pos = svc.decision_function(X_test_scaled)
prob_pos = \
    (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

In [69]:
df_test['radiant_win'] = prob_pos[:]
df_to_write = df_test['radiant_win']
df_to_write.to_csv('results_svc.csv', sep=',', encoding='utf-8', header=True)

array([ 0.61676201,  0.5657714 ,  0.33040907, ...,  0.35864969,
        0.5220851 ,  0.43877303])

### Result: SVM scores better

# KNN

In [98]:
# n_neighbors must be an integer count; np.power(2.0, ...) produced floats
# (2.0, 4.0, ...). Use plain ints: 2, 4, 8, 16, 32.
gridKNN = {'n_neighbors': [2 ** k for k in range(1, 6)]}
clfKNN = KNeighborsClassifier()

In [99]:
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
gs = GridSearchCV(clfKNN, gridKNN, scoring='roc_auc', cv=kf, verbose=8, n_jobs=3)

In [100]:
gs.fit(X_scale, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 71.3min
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed: 153.9min finished
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwar

[CV] n_neighbors=2.0 .................................................
[CV] n_neighbors=2.0 .................................................
[CV] n_neighbors=2.0 .................................................
[CV] ........................ n_neighbors=2.0, score=0.550084 -17.2min[CV] ........................ n_neighbors=2.0, score=0.550685 -16.9min[CV] ........................ n_neighbors=2.0, score=0.557550 -16.8min


[CV] n_neighbors=4.0 .................................................
[CV] n_neighbors=2.0 .................................................
[CV] n_neighbors=2.0 .................................................
[CV] ........................ n_neighbors=4.0, score=0.567316 -16.8min[CV] ........................ n_neighbors=2.0, score=0.549147 -17.3min[CV] ........................ n_neighbors=2.0, score=0.552380 -17.8min


[CV] n_neighbors=4.0 .................................................
[CV] n_neighbors=4.0 .................................................
[CV] n

GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_neighbors': array([  2.,   4.,   8.,  16.,  32.])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=8)

In [101]:
gridScore1(gs, 'n_neighbors')

parameter -  32.0 , MVC:  0.645263637464
parameter -  16.0 , MVC:  0.618230633313
parameter -  8.0 , MVC:  0.591739975357
parameter -  4.0 , MVC:  0.568449020221
parameter -  2.0 , MVC:  0.551969265077


In [102]:
# Second, larger KNN grid. n_neighbors must be an integer count, so use plain
# ints (64, 128, 256, 512) instead of the floats np.power(2.0, ...) produced.
gridKNN = {'n_neighbors': [2 ** k for k in range(6, 10)]}

In [103]:
gs = GridSearchCV(clfKNN, gridKNN, scoring='roc_auc', cv=kf, verbose=8, n_jobs=3)

In [104]:
gs.fit(X_scale, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 73.6min
[Parallel(n_jobs=3)]: Done  20 out of  20 | elapsed: 130.2min finished
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwargs) for func, args, kwargs in self.items]
  return [func(*args, **kwar

[CV] n_neighbors=64.0 ................................................
[CV] n_neighbors=64.0 ................................................
[CV] n_neighbors=64.0 ................................................
[CV] ....................... n_neighbors=64.0, score=0.667956 -18.1min[CV] ....................... n_neighbors=64.0, score=0.666308 -17.5min[CV] ....................... n_neighbors=64.0, score=0.665404 -17.4min


[CV] n_neighbors=128.0 ...............................................
[CV] n_neighbors=64.0 ................................................
[CV] n_neighbors=64.0 ................................................
[CV] ...................... n_neighbors=128.0, score=0.683860 -18.7min[CV] ....................... n_neighbors=64.0, score=0.664029 -18.4min[CV] ....................... n_neighbors=64.0, score=0.672486 -18.7min


[CV] n_neighbors=128.0 ...............................................
[CV] n_neighbors=128.0 ...............................................
[CV] n

GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_neighbors': array([  64.,  128.,  256.,  512.])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=8)

In [105]:
gridScore1(gs, 'n_neighbors')

parameter -  512.0 , MVC:  0.700054467442
parameter -  256.0 , MVC:  0.694924849166
parameter -  128.0 , MVC:  0.683160458353
parameter -  64.0 , MVC:  0.667236587863


In [106]:
pca = PCA(n_components=150)

In [107]:
X_PCA = pca.fit_transform(X_scale, y)

In [110]:
X_PCA[0]

array([-1.9355369 , -1.51024403, -1.96806795,  1.69545284,  1.7645308 ,
       -0.33900647,  0.92760675,  0.70738976,  3.10799496, -1.04628992,
        0.73933871,  0.3271432 , -0.93045407, -1.04836149,  0.02627629,
        1.25625948,  0.69750044,  1.9861236 , -0.45626928,  0.39061501,
       -1.52764942,  0.74732445, -0.18303594,  0.93500983,  1.46733803,
       -0.66314662,  0.3378304 ,  0.8886581 ,  0.80003824,  0.04402689,
        1.04935863, -0.76256212,  0.51378513, -0.16991222, -0.23582013,
        2.84620572, -1.72610019,  2.07427909,  1.16610327,  0.9270081 ,
        0.69998721, -1.48450686,  0.256101  ,  0.94513736, -1.15030737,
       -0.30501772,  1.10412669,  0.37867131,  1.52287504, -0.54015358,
       -1.94289982, -0.94432529, -2.06765252, -0.10854373, -1.11224275,
        0.59268899, -0.13994539,  3.91306067, -0.20530957,  0.46932947,
       -0.21837178, -0.25251149, -0.00970731, -1.87816238,  0.33898517,
        1.51097274,  1.4941962 ,  0.42742166, -0.82852625,  0.62

# AdaBoost

In [12]:
gridAdaBoost = {'n_estimators': [25, 50, 100]}
clfAdaBoost = AdaBoostClassifier()

In [13]:
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
gs = GridSearchCV(clfAdaBoost, gridAdaBoost, scoring='roc_auc', cv=kf, verbose=8, n_jobs=3)

In [16]:
gs.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_estimators=25 .................................................
[CV] n_estimators=25 .................................................
[CV] n_estimators=25 .................................................
[CV] ........................ n_estimators=25, score=0.691324 -  20.6s[CV] ........................ n_estimators=25, score=0.690014 -  22.7s[CV] ........................ n_estimators=25, score=0.685563 -  23.2s


[CV] n_estimators=25 .................................................
[CV] n_estimators=25 .................................................
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=25, score=0.693607 -  21.3s[CV] ........................ n_estimators=25, score=0.693613 -  19.8s[CV] ........................ n_estimators=50, score=0.707160 -  45.0s


[CV] n_estimators=50 .................................................
[CV] n_estimators

[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:  4.4min finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_estimators': [25, 50, 100]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='roc_auc', verbose=8)

In [19]:
gridScore1(gs, 'n_estimators')

parameter -  100 , MVC:  0.726929660432
parameter -  50 , MVC:  0.709996935759
parameter -  25 , MVC:  0.690824098903


In [20]:
gs.fit(X_scale, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_estimators=25 .................................................
[CV] n_estimators=25 .................................................
[CV] n_estimators=25 .................................................
[CV] ........................ n_estimators=25, score=0.691324 -  19.8s[CV] ........................ n_estimators=25, score=0.690014 -  23.8s[CV] ........................ n_estimators=25, score=0.685563 -  19.8s


[CV] n_estimators=25 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=25 .................................................
[CV] ........................ n_estimators=25, score=0.693607 -  24.0s[CV] ........................ n_estimators=50, score=0.707160 -  33.2s[CV] ........................ n_estimators=25, score=0.693613 -  21.1s


[CV] n_estimators=50 .................................................
[CV] n_estimators

[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:  4.2min finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=42),
       error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_estimators': [25, 50, 100]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='roc_auc', verbose=8)

In [21]:
gridScore1(gs, 'n_estimators')

parameter -  100 , MVC:  0.726929660432
parameter -  50 , MVC:  0.709996935759
parameter -  25 , MVC:  0.690824098903
