# K Nearest Neighbors

In [35]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import cross_validate
import pandas as pd
pd.set_option("display.max_columns",500)
pd.set_option("display.max_rows",999)

In [17]:
from sklearn.model_selection import train_test_split

In [3]:
with open("../datasets/cancer/breast-cancer-wisconsin.names.txt", "r") as f:
    print(f.read())

Citation Request:
   This breast cancer databases was obtained from the University of Wisconsin
   Hospitals, Madison from Dr. William H. Wolberg.  If you publish results
   when using this database, then please include this information in your
   acknowledgements.  Also, please cite one or more of:

   1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

   2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of 
      pattern separation for medical diagnosis applied to breast cytology", 
      Proceedings of the National Academy of Sciences, U.S.A., Volume 87, 
      December 1990, pp 9193-9196.

   3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition 
      via linear programming: Theory and application to medical diagnosis", 
      in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying
      Li, editors, SIAM Publications, Philadelphia 199

In [20]:
df = pd.read_csv(
    "../datasets/cancer/breast-cancer-wisconsin.data.txt", 
    header=None, 
    names=["sample_code_number", "clump_thickness",
            "uniformity_of_cell_size", "uniformity_of_cell_shape",
            "marginal_adhesion","single_epithelial_cell_size",
            "bare_nuclei","bland_chromatin","normal_nucleoli",
            "mitoses","class"]
)
df.head()

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [21]:
df.drop(["sample_code_number"],axis=1,inplace=True)
df.replace("?", -9999, inplace=True)
Y = df.loc[:,"class"].values
X = df.loc[:,df.columns.values["class" != df.columns]]

# KNN classifier

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=.2)

In [23]:
clf = neighbors.KNeighborsClassifier()

In [24]:
clf.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [25]:
acc = clf.score(X_test, Y_test)
acc

0.9642857142857143

# Clasificar nuevos datos

In [39]:
df[df!=-9999].dropna().groupby(by="class").describe()

Unnamed: 0_level_0,clump_thickness,clump_thickness,clump_thickness,clump_thickness,clump_thickness,clump_thickness,clump_thickness,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_size,uniformity_of_cell_shape,uniformity_of_cell_shape,uniformity_of_cell_shape,uniformity_of_cell_shape,uniformity_of_cell_shape,uniformity_of_cell_shape,uniformity_of_cell_shape,uniformity_of_cell_shape,marginal_adhesion,marginal_adhesion,marginal_adhesion,marginal_adhesion,marginal_adhesion,marginal_adhesion,marginal_adhesion,marginal_adhesion,single_epithelial_cell_size,single_epithelial_cell_size,single_epithelial_cell_size,single_epithelial_cell_size,single_epithelial_cell_size,single_epithelial_cell_size,single_epithelial_cell_size,single_epithelial_cell_size,bland_chromatin,bland_chromatin,bland_chromatin,bland_chromatin,bland_chromatin,bland_chromatin,bland_chromatin,bland_chromatin,normal_nucleoli,normal_nucleoli,normal_nucleoli,normal_nucleoli,normal_nucleoli,normal_nucleoli,normal_nucleoli,normal_nucleoli,mitoses,mitoses,mitoses,mitoses,mitoses,mitoses,mitoses,mitoses
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2
2,444.0,2.963964,1.672661,1.0,1.0,3.0,4.0,8.0,444.0,1.306306,0.855657,1.0,1.0,1.0,1.0,9.0,444.0,1.414414,0.957031,1.0,1.0,1.0,1.0,8.0,444.0,1.346847,0.917088,1.0,1.0,1.0,1.0,10.0,444.0,2.108108,0.877112,1.0,2.0,2.0,2.0,10.0,444.0,2.083333,1.062299,1.0,1.0,2.0,3.0,7.0,444.0,1.261261,0.954606,1.0,1.0,1.0,1.0,8.0,444.0,1.065315,0.509738,1.0,1.0,1.0,1.0,8.0
4,239.0,7.188285,2.437907,1.0,5.0,8.0,10.0,10.0,239.0,6.577406,2.724244,1.0,4.0,6.0,10.0,10.0,239.0,6.560669,2.569104,1.0,4.0,6.0,9.0,10.0,239.0,5.585774,3.196631,1.0,3.0,5.0,8.0,10.0,239.0,5.32636,2.443087,1.0,3.0,5.0,6.5,10.0,239.0,5.974895,2.282422,1.0,4.0,7.0,7.0,10.0,239.0,5.857741,3.348876,1.0,3.0,6.0,9.5,10.0,239.0,2.60251,2.564495,1.0,1.0,1.0,3.0,10.0


In [40]:
sample_measure = np.array(
    [
        [4,2,1,1,1,2,3,2,1]
    ]
)

In [41]:
pred_s = clf.predict(sample_measure)

In [42]:
pred_s

array([2])

In [43]:
sample_measure = np.array(
    [
        [4,2,1,1,1,2,3,2,1],
        [7,6,1,5,1,2,3,2,1]
    ]
)
pred_s = clf.predict(sample_measure)
pred_s

array([2, 2])