In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import svm, datasets, preprocessing
from sklearn.svm import SVC  
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, matthews_corrcoef, roc_curve, auc
from sklearn.utils import shuffle
from sklearn.cross_validation import KFold

In [None]:
# Load the pre-computed feature table from disk into a DataFrame.
filename = 'C:\\AIA-2018\\examples\\feature-tables\\train-data_1526424532.csv'
data = pd.read_csv(filepath_or_buffer=filename)

# Separate predictors from the label: every column except 'class_id' and
# 'Unnamed: 0' is a feature; 'class_id' alone is the target.
# NOTE(review): 'Unnamed: 0' is presumably a saved row index -- confirm.
Feature = data.drop(labels=['class_id', 'Unnamed: 0'], axis='columns')
Target = data.loc[:, 'class_id']

In [None]:
# Convert features/labels to NumPy arrays and shuffle the rows in unison.
# `.values` is used instead of `.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
X = Feature.values
y = Target.values
# NOTE(review): random_state=None means the row order (and therefore the
# train/test split below, despite its fixed seed) differs on every run.
X, y = shuffle(X, y, random_state=None)

# Robust scaling: centers each feature on its median and scales by its
# interquartile range (not a mean-0 standardization).
# NOTE(review): scaling is computed over all rows before the split, so the
# held-out rows influence the scaling statistics.
X = preprocessing.robust_scale(X)

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Candidate hyper-parameter grid for a cross-validated search.
# It is defined here but not consumed in this cell; the classifier below is
# built with fixed settings.
parameters = [{'kernel': ['rbf'],
               'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5],
                'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Train a linear SVM with balanced class weights; probability=True enables
# predict_proba for the ROC computation later on.
svclassifier = SVC(C=5, class_weight='balanced', kernel='linear', probability=True)
svclassifier.fit(X_train, y_train)

In [None]:
# Score the held-out half with the fitted classifier: hard predictions for
# the metrics below, class probabilities for the ROC cell.
y_pred = svclassifier.predict(X_test)
prob = svclassifier.predict_proba(X_test)

# Row/column order used by the confusion matrix and the report.
label_order = [1, -1]

# (heading, value) pairs, printed in this order.
metric_sections = [
    ('Confusion matrix:',
     confusion_matrix(y_test, y_pred, labels=label_order)),
    ('Classification report:',
     classification_report(y_test, y_pred, labels=label_order)),
    ('Accuracy:',
     accuracy_score(y_test, y_pred)),
    ('Matthew Correlation Coefficient:',
     matthews_corrcoef(y_test, y_pred)),
]
for heading, result in metric_sections:
    print(heading)
    print(result, '\n')

# Last expression of the cell: displays the number of test predictions.
y_pred.shape

In [None]:
# ROC operating points for label 1, scored with column 1 of the predicted
# class probabilities.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test,
    prob[:, 1],
    pos_label=1,
    drop_intermediate=True,
)
roc_auc = auc(false_positive_rate, true_positive_rate)

# Rescale the false-positive rate to the "false positives per image" x-axis.
# NOTE(review): 51 and 205 are hard-coded; presumably the number of positive
# samples and the number of images -- confirm they match the current split.
reg_per_img = (y_pred.size - 51) / 205
fp = reg_per_img * false_positive_rate

# Draw the curve first, then decorate the axes.
plt.plot(fp, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive per Image')
plt.ylabel('True Positive Rate')
plt.xlim([0, 5])
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

In [None]:
# Validate with k-fold
def run_kfold(clf, n_splits=10):
    """Evaluate ``clf`` with k-fold cross-validation on the notebook's X and y.

    For each fold the classifier is re-fit on the training indices and scored
    with accuracy on the held-out indices; per-fold and mean accuracies are
    printed.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fit in place.
    n_splits : int, default 10
        Number of consecutive (unshuffled) folds.

    Returns
    -------
    The same ``clf`` object, left in the state produced by the last fold's
    fit (i.e. trained on all rows except the final fold).
    """
    # Imported locally so the function works with the model_selection API
    # (the notebook already imports from sklearn.model_selection); the
    # older sklearn.cross_validation.KFold(n, n_folds=...) signature was
    # removed in scikit-learn 0.20.
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=n_splits)
    outcomes = []
    # Fold indices are derived from the actual number of rows in X rather
    # than a hard-coded sample count.
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Use the estimator that was passed in, not the notebook global.
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))
    return clf

svclassifier = run_kfold(svclassifier)