In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from util import plot_confusion_matrix

In [None]:
df = pd.read_csv("data/cell_samples.csv")

# Keep only the rows whose BareNuc value can be parsed as a number.
# The explicit .copy() makes the filtered frame independent of the frame read
# from disk, so the column assignment below cannot raise pandas'
# SettingWithCopyWarning.
bare_nuc_is_numeric = pd.to_numeric(df['BareNuc'], errors='coerce').notna()
df = df[bare_nuc_is_numeric].copy()
df['BareNuc'] = df['BareNuc'].astype('int')

# Select the feature columns used to train the classifier.
feature_columns = [
    'Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize',
    'BareNuc', 'BlandChrom', 'NormNucl', 'Mit',
]
# .values already yields a NumPy array, so no np.asarray() conversion is needed.
Features = df[feature_columns].values
Features[0:5]

In [None]:
# Pull the label column out as a NumPy array and preview the first five labels.
class_column = df['Class']
Target = class_column.values
Target[:5]

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    Features,
    Target,
    test_size=0.2,
    random_state=4,
)

# Report the shape of each partition.
for split_name, X_part, y_part in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(split_name, X_part.shape, y_part.shape)

### Modeling

The SVM algorithm offers a choice of kernel functions for performing its processing. Basically, mapping data into a higher dimensional space is called kernelling. The mathematical function used for the transformation is known as the kernel function, and can be of different types, such as:

1. Linear
2. Polynomial
3. Radial basis function (RBF)
4. Sigmoid

Each of these functions has its own characteristics, pros and cons, and equation, but there is no easy way of knowing which function performs best with any given dataset, so we usually try different functions in turn and compare the results. This notebook uses the linear kernel (`kernel='linear'`); note that scikit-learn's default kernel for `SVC` is RBF (Radial Basis Function).


In [None]:
from sklearn import svm

# Fit a support vector classifier with a linear kernel on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predict the held-out samples and compare the first five predictions with the
# corresponding true labels.
y_test_ = model.predict(X_test)
print(y_test_[:5])
print(y_test[:5])

### Evaluation 
- Confusion Matrix 
- F1 Score
- Jaccard

##### Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools

# Confusion matrix
# scikit-learn puts the actual classes on the rows and the predicted classes on
# the columns, both in the order given by `labels`. With labels=[2, 4] the
# returned 2x2 array is laid out as:
#
#                      | Predicted 2 (benign) | Predicted 4 (malignant)
# _____________________|______________________|________________________
# Actual 2 (benign)    | correct benign       | benign called malignant
# Actual 4 (malignant) | malignant missed     | correct malignant
#
# Treating malignant (4) as the positive class, this reads [[TN, FP], [FN, TP]].
cnf_matrix = confusion_matrix(y_test, y_test_, labels=[2, 4])
cnf_matrix



In [None]:
# Draw the confusion matrix with the project's plotting helper.
# normalize=False is passed through to the helper (presumably so it shows raw
# counts rather than proportions; confirm against util.plot_confusion_matrix).
plot_confusion_matrix(
    cnf_matrix,
    classes=['Benign(2)', 'Malignant(4)'],
    normalize=False,
    title='Confusion matrix',
)

# Text summary of the per-class metrics for the test predictions.
report = classification_report(y_test, y_test_)
print(report)

##### F1 Score
- F1 = 1 : Perfect precision and recall.
- F1 = 0 : Either precision or recall is zero.

In [None]:
from sklearn.metrics import f1_score

# F1 averaged across the classes, each class weighted by its number of true
# samples in the test set.
f1_score(y_true=y_test, y_pred=y_test_, average='weighted')

##### Jaccard Score
- J = 1 : Perfect match between the predicted and true labels.
- J = 0 : No overlap between the predicted and true labels.

In [None]:
# Peek at the first 20 true labels of the test split.
y_test[:20]

In [None]:
from sklearn.metrics import jaccard_score

# Jaccard index with each class label in turn treated as the positive class,
# followed by the support-weighted average over both classes.
jaccard_label_2 = jaccard_score(y_test, y_test_, pos_label=2)
jaccard_label_4 = jaccard_score(y_test, y_test_, pos_label=4)
jaccard_weighted = jaccard_score(y_test, y_test_, average='weighted')

print('jaccard_score for 2: ', jaccard_label_2)
print('jaccard_score for 4: ', jaccard_label_4)
print('jaccard_score for weighted: ', jaccard_weighted)