In [102]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm, datasets
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.svm import SVC  
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from sklearn.utils import shuffle
from sklearn.cross_validation import KFold

from sklearn.metrics import roc_auc_score

In [111]:
def DATA(filename):
    #filename = 'classifier-train-data.csv'
    data = pd.read_csv(filename)

    #specify featuure and target
    Feature = data.drop(['class_id', 'Unnamed: 0'], axis=1)
    Target  = data['class_id']
    
    # shuffle the dataset
    X, y = shuffle(Feature, Target, random_state=0)

    #standardize data
    X = (X - X.mean()) / X.std()

    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    return X_train, X_test, y_train, y_test, X, y


def run_SVM (X_train, X_test, y_train, y_test, kernel= 'poly'):
    #Train algorithm
    svclassifier = SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma=0.01, kernel= kernel,
                      max_iter=-1, probability=True, random_state=None, shrinking=True,
                      tol=0.001, verbose=False)
    
    svclassifier.fit(X_train, y_train)
    #make prediction
    y_pred = svclassifier.predict(X_test)
    
    # results
    print(confusion_matrix(y_test,y_pred))  
    print(classification_report(y_test,y_pred))
    print("Accuracy:  %f" % (accuracy_score(y_test, y_pred)))
    
    #AUROC results
    prob_y = svclassifier.predict_proba(X_test)
    prob_y  = [p[1] for p in prob_y]
    print("AUROC:  %f" % roc_auc_score(y_test, prob_y) )
    
    return svclassifier
    
    # Validate with k-fold
def run_kfold(clf):
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test =X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

In [112]:
filename = './feature-tables/train-data_1526424532.csv'
X_train, X_test, y_train, y_test, X, y = DATA(filename)
run_SVM (X_train, X_test, y_train, y_test, 'linear')

[[2818  319]
 [   6   45]]
             precision    recall  f1-score   support

       -1.0       1.00      0.90      0.95      3137
        1.0       0.12      0.88      0.22        51

avg / total       0.98      0.90      0.93      3188

Accuracy:  0.898055
AUROC:  0.910411


SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [113]:
run_SVM (X_train, X_test, y_train, y_test, 'rbf')

[[2891  246]
 [  12   39]]
             precision    recall  f1-score   support

       -1.0       1.00      0.92      0.96      3137
        1.0       0.14      0.76      0.23        51

avg / total       0.98      0.92      0.95      3188

Accuracy:  0.919072
AUROC:  0.906355


SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [114]:
run_SVM (X_train, X_test, y_train, y_test, 'sigmoid')

[[2936  201]
 [  19   32]]
             precision    recall  f1-score   support

       -1.0       0.99      0.94      0.96      3137
        1.0       0.14      0.63      0.23        51

avg / total       0.98      0.93      0.95      3188

Accuracy:  0.930991
AUROC:  0.921844


SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [115]:
scv=run_SVM (X_train, X_test, y_train, y_test,'poly')

[[3086   51]
 [  37   14]]
             precision    recall  f1-score   support

       -1.0       0.99      0.98      0.99      3137
        1.0       0.22      0.27      0.24        51

avg / total       0.98      0.97      0.97      3188

Accuracy:  0.972396
AUROC:  0.114391


In [77]:
#run_kfold(svc)

In [None]:
parameters = [{'kernel': ['poly'],
               'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5],
                'C': [1, 10, 100, 1000]},
             {'kernel': ['sigmoid'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(svm.SVC(decision_function_shape='ovr', class_weight='balanced', probability=True), parameters, cv=100, n_jobs=4)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on training set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()



In [85]:
clf.best_estimator_

SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [91]:
print("Best Score:  %f " % (clf.best_score_ ))
print("Best Params:  %r" % (clf.best_params_ ))
print("Best Index:  %f" % (clf.best_index_ ))
print("Best Estimator:  %r" % (clf.best_estimator_ ))


Best Score:  0.971765 
Best Params:  {'C': 1, 'gamma': 0.01, 'kernel': 'poly'}
Best Index:  2.000000
Best Estimator:  SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
