## DIAGNOSIS OF BREAST CANCER USING KNN CLASSIFICATION

DATASET SOURCE: "UCI MACHINE LEARNING REPOSITORY"

### IMPORTING THE LIBRARIES 

In [170]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [171]:
# Load the breast-cancer table; the CSV is read via a relative path, so it must
# sit in the notebook's working directory.
dataset = pd.read_csv("breast-cancer-wisconsin.csv")

In [172]:
# Peek at ten randomly chosen rows (no seed is passed, so the rows differ per run).
dataset.sample(n=10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
133,867387,B,15.71,13.93,102.0,761.7,0.09462,0.09462,0.07135,0.05933,...,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071
342,89827,B,11.06,14.96,71.49,373.9,0.1033,0.09097,0.05397,0.03341,...,11.92,19.9,79.76,440.0,0.1418,0.221,0.2299,0.1075,0.3301,0.0908
247,884626,B,12.89,14.11,84.95,512.2,0.0876,0.1346,0.1374,0.0398,...,14.39,17.7,105.0,639.1,0.1254,0.5849,0.7727,0.1561,0.2639,0.1178
12,846226,M,19.17,24.8,132.4,1123.0,0.0974,0.2458,0.2065,0.1118,...,20.96,29.94,151.7,1332.0,0.1037,0.3903,0.3639,0.1767,0.3176,0.1023
97,862261,B,9.787,19.94,62.11,294.5,0.1024,0.05301,0.006829,0.007937,...,10.92,26.29,68.81,366.1,0.1316,0.09473,0.02049,0.02381,0.1934,0.08988
207,879830,M,17.01,20.26,109.7,904.3,0.08772,0.07304,0.0695,0.0539,...,19.8,25.05,130.0,1210.0,0.1111,0.1486,0.1932,0.1096,0.3275,0.06469
176,872608,B,9.904,18.06,64.6,302.4,0.09699,0.1294,0.1307,0.03716,...,11.26,24.39,73.07,390.2,0.1301,0.295,0.3486,0.0991,0.2614,0.1162
234,882488,B,9.567,15.91,60.21,279.6,0.08464,0.04087,0.01652,0.01667,...,10.51,19.16,65.74,335.9,0.1504,0.09515,0.07161,0.07222,0.2757,0.08178
23,851509,M,21.16,23.04,137.2,1404.0,0.09428,0.1022,0.1097,0.08632,...,29.17,35.59,188.0,2615.0,0.1401,0.26,0.3155,0.2009,0.2822,0.07526
256,88649001,M,19.55,28.77,133.6,1207.0,0.0926,0.2063,0.1784,0.1144,...,25.05,36.27,178.6,1926.0,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005


###### Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image

##### This can be found on UCI Machine Learning Repository:
   https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

Attribute Information:

    1) ID number 
    2) Diagnosis (M = malignant, B = benign) 
    3-32)
    Ten real-valued features are computed for each cell nucleus:
    a) radius (mean of distances from center to points on the perimeter) 
    b) texture (standard deviation of gray-scale values) 
    c) perimeter
    d) area 
    e) smoothness (local variation in radius lengths) 
    f) compactness (perimeter^2 / area - 1.0) 
    g) concavity (severity of concave portions of the contour) 
    h) concave points (number of concave portions of the contour) 
    i) symmetry 
    j) fractal dimension ("coastline approximation" - 1)
##### The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.

    All feature values are recoded with four significant digits.
    Missing attribute values: none
    Class distribution: 357 benign, 212 malignant


In [173]:
# Dimensions of the loaded table as (rows, columns).
dataset.shape

(569, 32)

In [174]:
# List the column labels to see which fields the table provides.
dataset.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [175]:
# Count the missing values in each column.
dataset.isna().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

### 'diagnosis' IS THE DEPENDENT VARIABLE 

In [176]:
# Show the distinct class labels present in the target column.
print(dataset["diagnosis"].unique())

['M' 'B']


###  Column "id" 
IT DOESN'T HELP THE MODEL TO LEARN: IT IS A UNIQUE IDENTIFIER FOR EACH RECORD AND HAS NOTHING TO DO WITH THE DIAGNOSIS (THE "Class") OF THE BREAST MASS

In [177]:
# Drop the record identifier. `errors="ignore"` keeps this cell idempotent:
# re-running it after 'id' is already gone is a no-op instead of a KeyError.
dataset = dataset.drop(columns=["id"], errors="ignore")

### COLUMN "id" HAS BEEN REMOVED

In [178]:
# Spot-check five random rows (unseeded, so the rows differ per run).
dataset.sample(n=5)


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
401,B,11.93,10.91,76.14,442.7,0.08872,0.05242,0.02606,0.01796,0.1601,...,13.8,20.14,87.64,589.5,0.1374,0.1575,0.1514,0.06876,0.246,0.07262
496,B,12.65,18.17,82.69,485.6,0.1076,0.1334,0.08017,0.05074,0.1641,...,14.38,22.15,95.29,633.7,0.1533,0.3842,0.3582,0.1407,0.323,0.1033
549,B,10.82,24.21,68.89,361.6,0.08192,0.06602,0.01548,0.00816,0.1976,...,13.03,31.45,83.9,505.6,0.1204,0.1633,0.06194,0.03264,0.3059,0.07626
225,B,14.34,13.47,92.51,641.2,0.09906,0.07624,0.05724,0.04603,0.2075,...,16.77,16.9,110.4,873.2,0.1297,0.1525,0.1632,0.1087,0.3062,0.06072
21,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,...,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773


##  Checking the datatypes of the attributes

In [179]:
# Print every column name alongside its dtype, one column per line.
for column_name, column_dtype in dataset.dtypes.items():
    print(column_name, "----dtype -------", column_dtype)

diagnosis ----dtype ------- object
radius_mean ----dtype ------- float64
texture_mean ----dtype ------- float64
perimeter_mean ----dtype ------- float64
area_mean ----dtype ------- float64
smoothness_mean ----dtype ------- float64
compactness_mean ----dtype ------- float64
concavity_mean ----dtype ------- float64
concave points_mean ----dtype ------- float64
symmetry_mean ----dtype ------- float64
fractal_dimension_mean ----dtype ------- float64
radius_se ----dtype ------- float64
texture_se ----dtype ------- float64
perimeter_se ----dtype ------- float64
area_se ----dtype ------- float64
smoothness_se ----dtype ------- float64
compactness_se ----dtype ------- float64
concavity_se ----dtype ------- float64
concave points_se ----dtype ------- float64
symmetry_se ----dtype ------- float64
fractal_dimension_se ----dtype ------- float64
radius_worst ----dtype ------- float64
texture_worst ----dtype ------- float64
perimeter_worst ----dtype ------- float64
area_worst ----dtype ------- float

### SPLITTING INTO DEPENDENT AND INDEPENDENT VARIABLES

In [180]:
# Select the target and the features by column name rather than by position, so
# the split stays correct even if the column order changes or the 'id' column
# has not been dropped yet. Feature columns keep their original order.
TARGET_COLUMN = "diagnosis"
feature_columns = [
    column for column in dataset.columns if column not in ("id", TARGET_COLUMN)
]

X = dataset[feature_columns].values  # 2-D array of the feature values
y = dataset[TARGET_COLUMN].values    # 1-D array of 'M' / 'B' labels

# Splitting the dataset into the Training set and Test set

In [181]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## MODEL CREATION AND TRAINING WITH TRAINING SET

In [182]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 9 closest training samples, all
# weighted equally ('uniform'); the neighbour-search algorithm is chosen
# automatically by scikit-learn ('auto').
# NOTE(review): the features are passed in unscaled, so large-magnitude columns
# (e.g. area_*) presumably dominate the distance — consider standardising them.
cls = KNeighborsClassifier(n_neighbors=9, weights='uniform', algorithm='auto')
cls.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=9, p=2,
           weights='uniform')

In [183]:
# Predict a class label for every row of the held-out test set using the
# classifier fitted above; y_pred is compared against y_test in later cells.
y_pred = cls.predict(X_test)

## Confusion Matrix

In [184]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true vs. predicted labels: rows are the true classes, columns
# the predicted classes, both in sorted label order.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [185]:
# Display the confusion-matrix counts computed in the previous cell.
print(cm)

[[147   1]
 [  4  76]]


In [186]:
## confusion matrix results (the diagonal holds the correct predictions)
#--[147+76]=[223]--correct diagnoses
#--[1+4]=[5]-----incorrect diagnoses

## ACCURACY

In [187]:
# Mean accuracy on the held-out test set, reported as a percentage.
test_accuracy = cls.score(X_test, y_test)
print("ACCURACY OF MODEL IS : ", test_accuracy * 100, "%")

ACCURACY OF MODEL IS :  97.80701754385966 %


## WE HAVE CREATED A MODEL THAT CAN DIAGNOSE BREAST CANCER WITH AN ACCURACY OF 97.81 % ON THE HELD-OUT TEST SET