In [None]:
# Split the dataset at the beginning into train and test
# Reverse, and sum the probabilities
# FROC curve: add masses not detected to the y axis and scale the x axis
# Keep the regions for the same image in the same set


# Read the region feature table.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
filename = 'C:\\AIA-2018\\examples\\feature-tables\\train-data_1527086478.csv'
data = pd.read_csv(filename)

# Features are every column except the label ('class_id'), the CSV index
# column ('Unnamed: 0') and the image name; the target is the label column.
Feature = data.drop(['class_id', 'Unnamed: 0','img_name'], axis=1)
Target  = data['class_id']

# Convert to NumPy arrays and shuffle the rows.
# `.values` is used instead of `as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; it returns the same ndarray.
# NOTE(review): random_state=None means every run shuffles differently, so
# downstream numbers are not reproducible from run to run.
X = Feature.values
y = Target.values
X, y = shuffle(X, y, random_state=None)

# Robustly scale the features: robust_scale centers each feature on its
# median and scales by its interquartile range (not mean/variance).
# NOTE(review): scaling is fitted on the full dataset before splitting, so
# the held-out half influences the scaling statistics.
X = preprocessing.robust_scale(X)

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Two-fold evaluation: fit on one half and score the other, then swap.
# Each result array holds the predict_proba columns followed by the true label.
svclassifier = SVC(C=0.5, class_weight={1: 7}, gamma=0.003, kernel='rbf', probability=True)
svclassifier.fit(X_train, y_train)
prob1 = svclassifier.predict_proba(X_test)
prob1 = np.column_stack((prob1,y_test))

# After this refit `svclassifier` is trained on the X_test half.
svclassifier.fit(X_test, y_test)
prob2 = svclassifier.predict_proba(X_train)
prob2 = np.column_stack((prob2,y_train))

# Pool the out-of-fold predictions of both halves.
full_prob = np.concatenate((prob1,prob2),axis=0)

In [None]:
# FROC-style curve built from the pooled two-fold predictions in `full_prob`
# (column 1 is used as the score, column 2 as the true label).
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    full_prob[:, 2], full_prob[:, 1], pos_label=1, drop_intermediate=True)
roc_auc = auc(false_positive_rate, true_positive_rate)

# np.unique returns the labels sorted, so counts[0] belongs to the smaller
# label value (treated here as the negative class) and counts[1] to the larger.
unique, counts = np.unique(y, return_counts=True)
regions = y.shape[0]
pos_reg = counts[1]
neg_reg = counts[0]
# NOTE(review): hard-coded dataset constants -- presumably the total number
# of images and the number of images containing a mass; confirm.
num_img = 410
num_pos_img = 115
neg_reg_per_img = neg_reg / num_img
# Rescale the x axis from false positive rate to false positives per image,
# and the y axis by the number of positive regions per positive image.
fppi = false_positive_rate * neg_reg_per_img
true_positive_rate = true_positive_rate * pos_reg / num_pos_img

# Partial area under the curve for FPPI <= 1. The cutoff is derived from the
# curve itself instead of a fixed index (previously [0:124]), which was only
# valid for the number of thresholds produced by one particular run.
# `fppi` is non-decreasing, so the mask selects a leading run of points.
within_one_fppi = fppi <= 1
area_until1 = np.trapz(true_positive_rate[within_one_fppi], fppi[within_one_fppi])

plt.title('Receiver Operating Characteristic')
plt.plot(fppi, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.xlim([-0,5])
plt.ylim([-0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive per Image')
plt.grid(color='k', linestyle='dotted', linewidth=0.5, alpha=0.5)
plt.show()

In [None]:
# Train the SVM on the training half of the split.
# Alternative configurations that were tried (kept for reference):
#svclassifier = SVC(C=0.10, class_weight={1:100}, kernel='linear', probability=True)
#svclassifier = SVC(C=0.1, class_weight={1: 5}, gamma=0.001, kernel='sigmoid', probability=True)
svclassifier = SVC(
    kernel='rbf',
    C=0.5,
    gamma=0.003,
    class_weight={1: 7},   # weight applied to class label 1
    probability=True,      # needed for predict_proba in later cells
)
svclassifier.fit(X_train, y_train)

In [None]:
# Predict on the held-out half and print the evaluation metrics.
y_pred = svclassifier.predict(X_test)

# Each entry pairs a heading with a callable so every metric is computed
# right before it is printed, in the listed order.
for heading, compute_metric in (
        ('Confusion matrix:',
         lambda: confusion_matrix(y_test, y_pred, labels=[1, -1])),
        ('Classification report:',
         lambda: classification_report(y_test, y_pred, labels=[1, -1])),
        ('Accuracy:',
         lambda: accuracy_score(y_test, y_pred)),
        ('Matthew Correlation Coefficient:',
         lambda: matthews_corrcoef(y_test, y_pred)),
):
    print(heading)
    print(compute_metric(), '\n')

# Class probabilities for the held-out half, used by the ROC cell.
prob = svclassifier.predict_proba(X_test)

In [None]:
# ROC curve for the single train/test split, with the x axis rescaled from
# false positive rate to false positives per image.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test,
    prob[:, 1],
    pos_label=1,
    drop_intermediate=True,
)
roc_auc = auc(false_positive_rate, true_positive_rate)

# NOTE(review): 51 and 205 are hard-coded -- presumably the number of positive
# regions in the test half and the number of test images; confirm, since the
# upstream shuffle is unseeded and these may not match the current split.
reg_per_img = (y_pred.size - 51) / 205
fp = false_positive_rate * reg_per_img

plt.plot(fp, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive per Image')
plt.ylabel('True Positive Rate')
plt.xlim([0, 5])
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

In [None]:
# Constants used to rescale the false positive rate into false positives
# per image.
num_img = 410   # NOTE(review): hard-coded; presumably the total image count -- confirm

# np.unique returns the labels sorted, so index 0 is the smaller label value
# (treated as the negative class) and index 1 the larger.
unique, counts = np.unique(y, return_counts=True)
neg_reg, pos_reg = counts[0], counts[1]
regions = len(y)

reg_per_img = regions / num_img
neg_reg_per_img = neg_reg / num_img
scaling = 5 / neg_reg_per_img

In [None]:
# Cross-validated ROC curves with the x axis expressed in false positives
# per image. Relies on `neg_reg_per_img` computed in an earlier cell.
cv = StratifiedKFold(n_splits=6, shuffle=True)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)   # common FPR grid for averaging the folds

i = 0
matthew = 0
for train, test in cv.split(X, y):
    # Refit the shared classifier on this fold; after the loop `svclassifier`
    # holds the model of the last fold.
    probas_ = svclassifier.fit(X[train], y[train]).predict_proba(X[test])
    y_pred = svclassifier.predict(X[test])
    matthew = matthew + matthews_corrcoef(y[test], y_pred)

    # Compute the ROC curve and its area on the unscaled [0, 1] FPR axis,
    # and resample the TPR onto the common grid.
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)

    # Only the plotted x values are rescaled to false positives per image.
    fpr = neg_reg_per_img * fpr

    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, neg_reg_per_img], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
# The mean AUC must be computed on the unscaled FPR grid so it is comparable
# with the per-fold AUCs and their standard deviation; computing it after the
# rescaling below would multiply it by `neg_reg_per_img`.
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Rescale the common grid for plotting only.
mean_fpr = mean_fpr * neg_reg_per_img

plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([0, neg_reg_per_img])
plt.ylim([0, 1])
plt.xlabel('False Positive per Image')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Mean Matthews correlation coefficient over the folds; the divisor follows
# the splitter's configuration instead of a duplicated literal.
print(matthew / cv.get_n_splits())

In [None]:
# Cross-validated ROC curves on the plain false positive rate axis
# (no rescaling to false positives per image).
cv = StratifiedKFold(n_splits=6, shuffle=True)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)   # common FPR grid for averaging the folds

i = 0
matthew = 0
for train, test in cv.split(X, y):
    # Refit the shared classifier on this fold; after the loop `svclassifier`
    # holds the model of the last fold.
    probas_ = svclassifier.fit(X[train], y[train]).predict_proba(X[test])
    y_pred = svclassifier.predict(X[test])
    matthew = matthew + matthews_corrcoef(y[test], y_pred)

    # Compute the ROC curve and its area, and resample the TPR onto the
    # common grid so the folds can be averaged point by point.
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)

    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Mean Matthews correlation coefficient over the folds; the divisor follows
# the splitter's configuration instead of a duplicated literal.
print(matthew / cv.get_n_splits())

In [None]:
# AdaBoost ensemble that uses a linear SVM as its base estimator, trained on
# the training half and evaluated on the held-out half.
SVM_classif = SVC(
    kernel='linear',
    C=10,
    class_weight='balanced',
    probability=True,
)
clf = AdaBoostClassifier(
    SVM_classif,
    n_estimators=50,
    learning_rate=1,
    algorithm='SAMME.R',
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Each entry pairs a heading with a callable so every metric is computed
# right before it is printed, in the listed order.
for heading, compute_metric in (
        ('Confusion matrix:',
         lambda: confusion_matrix(y_test, y_pred, labels=[1, -1])),
        ('Classification report:',
         lambda: classification_report(y_test, y_pred, labels=[1, -1])),
        ('Accuracy:',
         lambda: accuracy_score(y_test, y_pred)),
        ('Matthew Correlation Coefficient:',
         lambda: matthews_corrcoef(y_test, y_pred)),
):
    print(heading)
    print(compute_metric(), '\n')

# Class probabilities of the boosted model for the held-out half.
prob = clf.predict_proba(X_test)