In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.calibration import calibration_curve
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
import operator

In [4]:
# Load the prepared dataset; the 'match_id' column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer='./data/new_dataframe.csv',
    index_col='match_id',
)

In [5]:
# Target vector: the 'win' column as a NumPy array.
# (A trailing `[:]` on an ndarray only yields another view of the same
# buffer, never a copy, so it is omitted.)
y = df['win'].values

In [6]:
# Sanity check: number of target labels taken from the 'win' column.
y.shape

(97230,)

In [7]:
# Feature matrix: every column except the 'win' target, as a NumPy array.
# (A trailing `[:]` on an ndarray only yields another view of the same
# buffer, never a copy, so it is omitted.)
X = df.drop('win', axis=1).values

In [8]:
# Sanity check: (rows, feature columns) left after dropping 'win'.
X.shape

(97230, 966)

In [9]:
def testClassifiers(X,y):
    lr = LogisticRegression()
    gnb = GaussianNB()
    svc = LinearSVC(C=1.0)
    rfc = RandomForestClassifier(n_estimators=100)
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
    score = {}
    for clf, name in [(lr, 'Logistic'),
                  (gnb, 'Naive Bayes'),
                  (svc, 'Support Vector Classification'),
                  (rfc, 'Random Forest')]:
        score_tmp = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            if hasattr(clf, "predict_proba"):
                prob_pos = clf.predict_proba(X_test)[:, 1]
            else:  # use decision function
                prob_pos = clf.decision_function(X_test)
                prob_pos = \
                    (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
            fraction_of_positives, mean_predicted_value = \
                calibration_curve(y_test, prob_pos, n_bins=10)

            score_tmp.append(roc_auc_score(y_test, prob_pos))
        score[name] = np.array(score_tmp).mean()
    
    score_sorted = sorted(score.items(), key=operator.itemgetter(1), reverse=True)
    for x in score_sorted:
        print "Classifier - ", x[0], ', MRA: ', x[1]

In [10]:
# Fit a StandardScaler on the full feature matrix, then apply it.
# NOTE(review): the scaler sees every row, including rows that later serve
# as cross-validation test folds — confirm this is acceptable.
scaler = StandardScaler().fit(X)
X_scale = scaler.transform(X)

In [11]:
# Cross-validate all classifiers on the standardized features; prints one
# line per classifier with its mean ROC AUC, best first.
testClassifiers(X=X_scale, y=y)

Classifier -  Logistic , MRA:  0.752463791761
Classifier -  Support Vector Classification , MRA:  0.75051583198
Classifier -  Random Forest , MRA:  0.718545404255
Classifier -  Naive Bayes , MRA:  0.692483670969
