In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix

%matplotlib inline



In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
colnames = ['Code_num', 'Clump_thickness','Cell_size', 'Cell_shape', 'Adhesion', 'Epithelial_Cell_Size', 'Bare_nuclei', 'Chromatin', 'Normal_nuclei', 'Mitoses', 'Class']
dataset = pd.read_csv(url, header = None, names=colnames)

## A)

In [3]:
dataset = dataset.drop(['Code_num'], axis = 1)
dataset.replace('?', np.nan, inplace=True)
print('Shape before removing the NA :', dataset.shape)
dataset.dropna(how='any', axis=0, inplace=True)
dataset['Bare_nuclei']= dataset['Bare_nuclei'].astype('float')
dataset['Class'] = dataset['Class'].replace({2:0, 4:1})
print('Shape after removing the NA  :', dataset.shape)

Shape before removing the NA : (699, 10)
Shape after removing the NA  : (683, 10)


In [4]:
dataset.dtypes

Clump_thickness           int64
Cell_size                 int64
Cell_shape                int64
Adhesion                  int64
Epithelial_Cell_Size      int64
Bare_nuclei             float64
Chromatin                 int64
Normal_nuclei             int64
Mitoses                   int64
Class                     int64
dtype: object

In [5]:
dataset[dataset['Class'] == 1].shape

(239, 10)

## B)

In [6]:
X = dataset.drop(['Class'], axis = 1)
y = dataset['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)

In [7]:
X.shape

(683, 9)

In [8]:
X_train.shape[0]

512

In [9]:
X_test.shape[0]

171

In [10]:
cv = 5
scoring = 'accuracy'
C_sets = 10. ** np.arange(-3, 5)
param_grid_set = dict(C = C_sets)
svm_model = SVC(kernel = 'linear', random_state = 123)

In [None]:
grid_linear = GridSearchCV(cv=cv, estimator = svm_model, param_grid = param_grid_set, scoring = scoring)
grid_linear.fit(X_train, y_train)
grid_linear.best_estimator_

In [12]:
svm_linear = grid_linear.best_estimator_
svm_linear.fit(X_train, y_train)
y_pred = svm_linear.predict(X_test)
confusion_matrix(y_pred=y_pred, y_true=y_test, labels=[1, 0])

array([[ 63,   0],
       [  3, 105]], dtype=int64)

## C)

In [None]:
svm_model = SVC(kernel = 'poly', degree = 2, random_state = 123)
grid_poly = GridSearchCV(cv=5, estimator=svm_model,param_grid=param_grid_set, scoring=scoring, n_jobs=3)
grid_poly.fit(X_train, y_train)
grid_poly.best_estimator_

SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma=0.10000000000000001,
  kernel='poly', max_iter=-1, probability=False, random_state=123,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
svm_poly = grid_poly.best_estimator_
svm_poly.fit(X_train, y_train)
y_pred = svm_poly.predict(X_test)
confusion_matrix(y_pred= y_pred, y_true=y_test, labels=[1, 0])

array([[ 63,   0],
       [  3, 105]], dtype=int64)

## D)

In [27]:
svm_model = SVC(kernel='poly', degree = 3, random_state = 123)
grid_poly_3 = GridSearchCV(cv = 5, estimator=svm_model, param_grid=param_grid_set, scoring=scoring,n_jobs=3)
grid_poly_3.fit(X_train, y_train)
grid_poly_3.best_estimator_

SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.10000000000000001,
  kernel='poly', max_iter=-1, probability=False, random_state=123,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
svm_poly_3 = grid_poly_3.best_estimator_
svm_poly_3.fit(X_train, y_train)
y_pred = svm_poly_3.predict(X_test)
confusion_matrix(y_pred=y_pred, y_true=y_test, labels=[1, 0])

array([[ 59,   4],
       [  3, 105]], dtype=int64)

## E)

In [22]:
gamma_sets = 10. ** np.arange(-4, 4)
param_grid_set = dict(C = C_sets, gamma = gamma_sets)

In [26]:
svm_model  = SVC(kernel = 'rbf', random_state=123)
grid_rbf = GridSearchCV(cv=5, estimator=svm_model, n_jobs=3, param_grid=param_grid_set, scoring=scoring)
grid_rbf.fit(X_train, y_train)
grid_rbf.best_estimator_

SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
svm_rbf = grid_rbf.best_estimator_
svm_rbf.fit(X_train, y_train)
y_pred = svm_rbf.predict(X_test)
confusion_matrix(y_pred=y_pred, y_true=y_test, labels=[1, 0])

array([[ 63,   0],
       [  3, 105]], dtype=int64)