# **Breast Cancer Wisconsin (Original) Data Set**

## Importing the libraries

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [11]:
# Load the Breast Cancer Wisconsin CSV into a DataFrame. The path is relative,
# so the file must sit in the notebook's working directory.
csv_path = 'breast-cancer-wisconsin.csv'
dataset = pd.read_csv(csv_path)

## Taking care of missing data

In [14]:
# Figure out which columns contain nulls, then report each one with its dtype.
columns_with_nulls = [
    name for name in dataset.columns
    if dataset[name].isnull().values.any()
]
for name in columns_with_nulls:
    print('column {0} has null and had data type {1}'.format(name, dataset[name].dtypes))

column  Bare Nuclei has null and had data type float64


In [18]:
# Count the missing values in each column (displayed as the cell output).
dataset.isna().sum()

Sample code number               0
 Clump Thickness                 0
 Uniformity of Cell Size         0
 Uniformity of Cell Shape        0
 Marginal Adhesion               0
 Single Epithelial Cell Size     0
 Bare Nuclei                    16
 Bland Chromatin                 0
 Normal Nucleoli                 0
 Mitoses                         0
 Class                           0
dtype: int64

In [19]:
# Drop every row that has at least one missing value.
# Rebinding the name instead of passing inplace=True avoids mutating the
# frame object that was loaded earlier and keeps the call chainable; the
# surviving rows and their original index labels are the same either way.
dataset = dataset.dropna()

In [20]:
# Re-check the per-column missing-value counts after dropping incomplete rows;
# every count is expected to be zero now.
dataset.isna().sum()

Sample code number              0
 Clump Thickness                0
 Uniformity of Cell Size        0
 Uniformity of Cell Shape       0
 Marginal Adhesion              0
 Single Epithelial Cell Size    0
 Bare Nuclei                    0
 Bland Chromatin                0
 Normal Nucleoli                0
 Mitoses                        0
 Class                          0
dtype: int64

In [21]:
# Print a concise summary of the frame (index range, per-column non-null
# counts and dtypes, memory usage) to confirm what remains after the rows
# with missing values were dropped.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Sample code number            683 non-null    int64  
 1    Clump Thickness              683 non-null    int64  
 2    Uniformity of Cell Size      683 non-null    int64  
 3    Uniformity of Cell Shape     683 non-null    int64  
 4    Marginal Adhesion            683 non-null    int64  
 5    Single Epithelial Cell Size  683 non-null    int64  
 6    Bare Nuclei                  683 non-null    float64
 7    Bland Chromatin              683 non-null    int64  
 8    Normal Nucleoli              683 non-null    int64  
 9    Mitoses                      683 non-null    int64  
 10   Class                        683 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 64.0 KB


## Splitting the dataset into the Training set and Test set

In [22]:
# Features: every column except the first (the sample code number, an
# identifier) and the last (the class label).
feature_frame = dataset.iloc[:, 1:-1]
X = feature_frame.to_numpy()

# Target: the last column of the frame.
target_series = dataset.iloc[:, -1]
y = target_series.to_numpy()

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so the test set never influences the scaling.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training the Kernel SVM model on the Training set

In [25]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier; every other hyperparameter is left
# at the library default.
svc_settings = {'kernel': 'rbf', 'random_state': 0}
classifier = SVC(**svc_settings)

# Kept as the bare last expression so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## Making the Confusion Matrix and calculating accuracy

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict labels for the held-out (already scaled) test features.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Kept as the bare last expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[101   6]
 [  3  61]]


0.9473684210526315

Source for Dataset:
Creator:
Dr. William H. Wolberg (physician)
University of Wisconsin Hospitals
Madison, Wisconsin, USA

Donor:
Olvi Mangasarian (mangasarian '@' cs.wisc.edu)
Received by David W. Aha (aha '@' cs.jhu.edu)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29