<h1 style="text-align:center">   
      <font color = pink >
                Breast Cancer Prediction with KNN 
        </font>    
</h1>   
<hr style="width:100%;height:5px;border-width:0;color:gray;background-color:gray">
<center><img style = "height:450px;" src="https://static-01.hindawi.com/styles/hindawi_wide/s3/2019-11/Cancer_Awareness-2019_blog_v1.0_noText.jpg?itok=CR034IE-"></center>

# Introduction

The aim of this project is to determine whether a breast cancer cell is malignant or benign.

**Content:**

1. [Load and Check Data](#1)
1. [Variable Description](#2)
1. [Data Analysis](#3)
1. [Outlier Detection](#4)
1. [Modeling](#5)
    * [Train - Test Split](#6)
    * [Standardization](#7)
    * [K-Nearest Neighbors Classifier](#8)
    * [KNN Best Parameters](#9) 
1. [Principal Component Analysis (PCA)](#10)
1. [Neighborhood Component Analysis (NCA)](#11)
1. [Result](#12)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id = "1" ></a>
# Load and Check Data

In [None]:
data = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
data.head()

In [None]:
# Remove the columns that carry no predictive information: the empty trailing column and the row id.
data = data.drop(columns = ['Unnamed: 32', 'id'])

In [None]:
# Rename the label column so the rest of the notebook can refer to it as "target".
data = data.rename({"diagnosis": "target"}, axis = "columns")

In [None]:
data.columns

<a id = "2"></a>
# Variable Description 
Target (M = malignant, B = benign)

Ten real-valued features are computed for each cell nucleus:
1. radius (mean of distances from center to points on the perimeter)
1. texture (standard deviation of gray-scale values)
1. perimeter
1. area
1. smoothness (local variation in radius lengths)
1. compactness (perimeter^2 / area - 1.0)
1. concavity (severity of concave portions of the contour)
1. concave points (number of concave portions of the contour)
1. symmetry
1. fractal dimension ("coastline approximation" - 1)

In [None]:
print("Data Shape:", data.shape) 

In [None]:
data.info() 

In [None]:
# Class balance of the label (M = malignant, B = benign): bar chart plus the raw counts.
# The column is passed by keyword because seaborn >= 0.12 interprets the first
# positional argument as `data`, not `x`, so a positional Series no longer plots counts per class.
sns.countplot(x = "target", data = data)
print(data.target.value_counts())

In [None]:
# Encode the label as an integer, which the models need for training (benign = 0, malignant = 1).
data["target"] = data["target"].map(lambda label: int(label.strip() == "M"))

In [None]:
# Summary statistics per column, used to judge whether standardization is needed:
# large differences in scale between the features mean it is required.
describe = data.describe()
describe

<a id = "3"></a>
# Data Analysis

In [None]:
corr_matrix = data.corr()

In [None]:
# Annotated heatmap of the full correlation matrix (two decimals per cell).
fig, ax = plt.subplots(figsize = (20, 10))
sns.heatmap(corr_matrix, annot = True, fmt = ".2f", ax = ax)
ax.set_title("Correlation Between Features")
plt.show()

In [None]:
# Keep only the columns whose absolute correlation with the label exceeds the cut-off
# (the label column itself is part of corr_matrix, so it is a candidate too),
# then plot how the selected columns correlate with each other.
threshold = 0.75
filtre = corr_matrix["target"].abs() > threshold
corr_features = list(corr_matrix.columns[filtre])
sns.heatmap(data[corr_features].corr(), annot = True, fmt = ".2f")
plt.title("Correlation Between Features w Corr Theshold 0.75")
plt.show()

First, we set a threshold value; here it is 0.75.
We keep only the features whose absolute correlation with the target is greater than 0.75, and plot the correlations among them.

In [None]:
# Reshape to long format (one row per sample/feature pair) so every feature can share one box plot.
data_melted = data.melt(id_vars = "target", var_name = "features", value_name = "value")

In [None]:
# Box plot of every feature split by class; on the raw data the scales differ widely.
fig, ax = plt.subplots(figsize = (10, 7))
sns.boxplot(data = data_melted, x = "features", y = "value", hue = "target", ax = ax)
# Rotate the feature names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation = 90)
plt.show()

In [None]:
sns.pairplot(data[corr_features], diag_kind = "kde", markers = "+", hue = "target")
plt.show()

In [None]:
# Separate the feature matrix from the label vector.
x = data.drop(columns = ["target"])
y = data["target"]

In [None]:
columns = x.columns.tolist()

<a id = "4" ></a>
# Outlier Detection

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Fit Local Outlier Factor with its default settings on all features.
# The fitted estimator's per-row scores are read in the next cell; `y_pred` is not
# used afterwards and is overwritten by the KNN predictions in the modeling section.
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(x)

In [None]:
# One negative outlier factor per fitted row, stored in a frame indexed by row position.
X_score = clf.negative_outlier_factor_
outlier_score = pd.DataFrame({"score": X_score})
# Rows scoring below the cut-off are treated as outliers; remember their positions for removal.
threshold = -2.5
filtre = outlier_score["score"] < threshold
outlier_index = outlier_score.index[filtre].tolist()

In [None]:
# Three layers drawn on the first two feature columns: the flagged outliers (blue),
# every sample (black), and one red ring per sample sized by its rescaled outlier score.
first_col, second_col = x.iloc[:, 0], x.iloc[:, 1]
fig, ax = plt.subplots(figsize = (10, 7))
ax.scatter(first_col.iloc[outlier_index], second_col.iloc[outlier_index], color = "blue", s = 50, label = "outliers")
ax.scatter(first_col, second_col, color = "k", s = 3, label = "Data Points")
# Min-max rescale: the highest score maps to radius 0 and the lowest score to radius 1.
score_max, score_min = X_score.max(), X_score.min()
radius = (score_max - X_score) / (score_max - score_min)
outlier_score["radius"] = radius
ax.scatter(first_col, second_col, s = 1000 * radius, edgecolors = "r", facecolors = "none", label = "Outlier Scores")
ax.legend()
plt.show()


In [None]:
# Drop the rows flagged as outliers.
# Both objects are rebuilt from `data` instead of being modified in place, so this cell
# can be re-run safely: dropping from the already-filtered `x` raises KeyError, and
# `y` has no .drop() once it has been converted to a NumPy array.
x = data.drop(columns = ["target"]).drop(index = outlier_index)
y = data["target"].drop(index = outlier_index).values

<a id = "5"></a><br>
# Modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA

<a id = "6"></a><br>
## Train - Test Split

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
test_size = 0.3
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = test_size, random_state = 42)
# Report the number of rows in each partition.
for part_name, part in (("X_train", X_train), ("X_test", X_test), ("Y_train", Y_train), ("Y_test", Y_test)):
    print(part_name, len(part))

<a id = "7"></a><br>
## Standardization

In [None]:
# Learn the scaling statistics from the training features only,
# then apply that same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Wrap the scaled training array in a DataFrame with the original feature names
# and display its summary statistics to inspect the result of the scaling.
X_train_df = pd.DataFrame(X_train, columns = columns)
X_train_df_describe = X_train_df.describe()
X_train_df_describe

In [None]:
X_train_df["target"] = Y_train

In [None]:
# Long format of the scaled training data (one row per sample/feature pair) for the box plot.
data_melted = X_train_df.melt(id_vars = "target", var_name = "features", value_name = "value")

In [None]:
# Box plot of every scaled training feature, split by class.
fig, ax = plt.subplots(figsize = (10, 7))
sns.boxplot(data = data_melted, x = "features", y = "value", hue = "target", ax = ax)
# Rotate the feature names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation = 90)
plt.show()

In [None]:
sns.pairplot(X_train_df[corr_features], diag_kind = "kde", markers = "+",hue = "target")
plt.show()

<a id = "8"></a><br>
## Simple K-Nearest Neighbors Classifier 

In [None]:
# Baseline: a 2-neighbour classifier trained on the scaled features and
# evaluated on the held-out test partition.
knn = KNeighborsClassifier(n_neighbors = 2).fit(X_train, Y_train)
y_pred = knn.predict(X_test)
cm, acc = confusion_matrix(Y_test, y_pred), accuracy_score(Y_test, y_pred)
score = knn.score(X_test, Y_test)

print(f"Basic KNN Accuracy: % {acc}")
print("Score : ", score)
print("CM : ", cm)

<a id = "9"></a><br>
## KNN Best Parameters

In [None]:
def KNN_Best_Params(x_train, x_test, y_train, y_test):
    """Grid-search a KNN classifier and report its train/test performance.

    Searches ``n_neighbors`` over 1..30 and ``weights`` over
    {"uniform", "distance"} using 10-fold cross-validation scored by accuracy,
    then evaluates the best model on both partitions and prints the
    accuracies and confusion matrices.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices of the training and test partitions.
    y_train, y_test : array-like
        Label vectors matching ``x_train`` and ``x_test``.

    Returns
    -------
    GridSearchCV
        The fitted search object (callers use ``predict``, ``best_params_``
        and ``best_estimator_`` on it).
    """
    k_range = list(range(1,31))
    weight_options = ["uniform","distance"]
    print()
    param_grid = dict(n_neighbors = k_range, weights = weight_options)

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 10, scoring = "accuracy")
    grid.fit(x_train, y_train)

    print("Best training score : {} with paremeters : {}".format(grid.best_score_, grid.best_params_))
    print()

    # With the default refit=True the search has already refit the winning
    # configuration on all of x_train, so reuse that model instead of fitting
    # an identical one a second time.
    knn = grid.best_estimator_

    y_pred_test = knn.predict(x_test)
    y_pred_train = knn.predict(x_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    print("Test Score: {}, Train Score: {}".format(acc_test, acc_train))
    print()
    print("CM Test: ",cm_test)
    print("CM Train: ",cm_train)

    return grid


In [None]:
grid = KNN_Best_Params(X_train, X_test, Y_train, Y_test) 

<a id = "10"></a><br>
# Principal Component Analysis (PCA)

PCA :
* Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space.
* Unsupervised Machine Learning

In [None]:
# Standardize all rows using statistics learned from the training rows only, so that
# test-set information does not leak into the features (same policy as the
# Standardization step of the modeling section).
# train_test_split picks rows from the row count, test_size and random_state alone, so
# splitting the positions here selects the same training rows as the later splits of
# the projected data, which use the same settings.
train_rows, test_rows = train_test_split(np.arange(len(x)), test_size = test_size, random_state = 42)
scaler = StandardScaler()
scaler.fit(x.iloc[train_rows])
x_scaled = scaler.transform(x)

In [None]:
# We reduce the 30 features to 2 principal components.
# The components are learned from the training rows only and then applied to every row,
# so the held-out samples do not influence the projection they are evaluated in.
# Splitting the row positions with the same test_size and random_state selects the same
# training rows as the train_test_split of the projected data that follows.
train_rows, test_rows = train_test_split(np.arange(len(x_scaled)), test_size = test_size, random_state = 42)
pca = PCA(n_components = 2)
pca.fit(x_scaled[train_rows])
X_reduced_pca = pca.transform(x_scaled)
pca_data = pd.DataFrame(X_reduced_pca, columns = ["p1","p2"])
pca_data["target"] = y

In [None]:
plt.figure(figsize=(10,7))
sns.scatterplot(x = "p1", y = "p2", hue = "target", data = pca_data)
plt.title("PCA : p1 vs p2")

In [None]:
# Split the 2-D PCA projection using the same test_size and random_state as the split of the full feature set.
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(X_reduced_pca, y, test_size = test_size, random_state = 42)

In [None]:
#We send our data by calling the function we wrote.
grid_pca = KNN_Best_Params(X_train_pca, X_test_pca, Y_train_pca, Y_test_pca)

In [None]:
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

In [None]:
# Dense grid covering the PCA plane, padded by 1 unit on every side, for the decision map.
h = .05 # step size in the mesh
X = X_reduced_pca
(x_min, y_min), (x_max, y_max) = X.min(axis = 0) - 1, X.max(axis = 0) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [None]:
Z = grid_pca.predict(np.c_[xx.ravel(), yy.ravel()])

In [None]:
# Colour every grid cell with its predicted class, then overlay all samples
# coloured by their true label.
Z = Z.reshape(xx.shape)
fig, ax = plt.subplots(figsize = (10, 7))
ax.pcolormesh(xx, yy, Z, cmap = cmap_light)
ax.scatter(X[:, 0], X[:, 1], c = y, cmap = cmap_bold, edgecolor = 'k', s = 20)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
# The title reports the number of classes and the hyper-parameters chosen by the grid search.
best_knn_pca = grid_pca.best_estimator_
ax.set_title(f"{len(np.unique(y))}-Class classification (k = {best_knn_pca.n_neighbors}, weights = '{best_knn_pca.weights}')")

<a id = "11"></a><br>
# Neighborhood Component Analysis (NCA)

NCA is a machine learning algorithm for metric learning. It learns a linear transformation in a supervised fashion to improve the classification accuracy of a stochastic nearest neighbors rule in the transformed space.

In [None]:
# NCA is supervised: it learns its projection from the labels. Fitting it on every row
# would let the test labels shape the space the test samples are later classified in,
# which inflates the reported accuracy. It is therefore fit on the training rows only
# and then applied to all rows.
# Splitting the row positions with the same test_size and random_state selects the same
# training rows as the train_test_split of the projected data that follows.
train_rows, test_rows = train_test_split(np.arange(len(x_scaled)), test_size = test_size, random_state = 42)
nca = NeighborhoodComponentsAnalysis(n_components = 2, random_state = 42)
nca.fit(x_scaled[train_rows], np.asarray(y)[train_rows])
X_reduced_nca = nca.transform(x_scaled)
nca_data = pd.DataFrame(X_reduced_nca, columns = ["p1","p2"])
nca_data["target"] = y
plt.figure(figsize=(10,7))
sns.scatterplot(x = "p1",  y = "p2", hue = "target", data = nca_data)
plt.title("NCA: p1 vs p2")

In [None]:
# Split the 2-D NCA projection using the same test_size and random_state as the split of the full feature set.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(X_reduced_nca, y, test_size = test_size, random_state = 42)

In [None]:
grid_nca = KNN_Best_Params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

In [None]:
# Grid covering the NCA plane, padded by 1 unit on every side, for the decision map.
h = .2 # step size in the mesh
X = X_reduced_nca
(x_min, y_min), (x_max, y_max) = X.min(axis = 0) - 1, X.max(axis = 0) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [None]:
Z = grid_nca.predict(np.c_[xx.ravel(), yy.ravel()])

In [None]:
# Colour every grid cell with its predicted class, then overlay all samples
# coloured by their true label.
Z = Z.reshape(xx.shape)
fig, ax = plt.subplots(figsize = (10, 7))
ax.pcolormesh(xx, yy, Z, cmap = cmap_light)
ax.scatter(X[:, 0], X[:, 1], c = y, cmap = cmap_bold, edgecolor = 'k', s = 20)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
# The title reports the number of classes and the hyper-parameters chosen by the grid search.
best_knn_nca = grid_nca.best_estimator_
ax.set_title(f"{len(np.unique(y))}-Class classification (k = {best_knn_nca.n_neighbors}, weights = '{best_knn_nca.weights}')")


<a id = "12"></a><br>
# Result

In [None]:
# Refit the best KNN found for the NCA projection and predict the held-out NCA test
# split; the predictions are used below to locate the misclassified samples.
knn = KNeighborsClassifier(**grid_nca.best_params_)
knn.fit(X_train_nca,Y_train_nca)
y_pred_nca = knn.predict(X_test_nca)
# accuracy_score takes (y_true, y_pred). The stored value is printed directly:
# knn.score would predict the test split again and return this same accuracy.
acc_test_nca = accuracy_score(Y_test_nca, y_pred_nca)
print("Score --> {}".format(acc_test_nca))

In [None]:
# Collect the NCA test coordinates together with the predicted and true labels, one row per test sample.
test_data = pd.DataFrame({
    "X_test_nca_p1": X_test_nca[:, 0],
    "X_test_nca_p2": X_test_nca[:, 1],
    "y_pred_nca": y_pred_nca,
    "Y_test_nca": Y_test_nca,
})

In [None]:
# Highlight the misclassified test samples with large translucent red discs,
# then draw every test sample coloured by its true label on the same axes.
fig, ax = plt.subplots(figsize = (10, 7))
is_wrong = y_pred_nca != Y_test_nca
diff = np.where(is_wrong)[0]
ax.scatter(test_data.iloc[diff, 0], test_data.iloc[diff, 1], label = "Wrong Classified", alpha = 0.2, color = "red", s = 1000)

sns.scatterplot(x = "X_test_nca_p1", y = "X_test_nca_p2", hue = "Y_test_nca", data = test_data, ax = ax)