In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import warnings
# Install a blanket 'ignore' filter: every warning category is silenced for
# the rest of the session, including deprecation notices raised by the
# libraries used in later cells.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis,LocalOutlierFactor
from sklearn.decomposition import PCA


In [None]:
# Read the data set and discard the 'id' and 'Unnamed: 32' columns, which are
# not used anywhere in the analysis.
data = (
    pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
    .drop(columns=['id', 'Unnamed: 32'])
)
data.head()

In [None]:
# From here on the label column is called 'target'.
data = data.rename({'diagnosis': 'target'}, axis='columns')

In [None]:
# Class balance of the labels. The series is passed by keyword because the
# first positional parameter of seaborn's countplot is `data`, not `x`.
sns.countplot(x=data.target)
data.target.value_counts()

In [None]:
# Encode the label as 1 for 'M' and 0 for every other value.
# The guard keeps the cell re-runnable: once the column is numeric, calling
# .strip() on its values (as the string-based encoding needs) would raise.
if not pd.api.types.is_numeric_dtype(data['target']):
    data['target'] = data['target'].str.strip().eq('M').astype('int64')
data['target'].value_counts()


In [None]:
# (rows, columns) of the frame after the column drops in the loading cell.
data.shape

In [None]:
# Print the per-column dtype and non-null count summary.
data.info()

In [None]:
# data.describe()
# Missing values: none

In [None]:
# Pairwise correlations of every column (the encoded target included), drawn
# as a hierarchically clustered heat map.
corr_matrix = data.corr()
sns.clustermap(corr_matrix, annot=True, fmt='.2f', figsize=(15, 15))
# clustermap builds its own multi-axes figure, so plt.title() would label
# whichever sub-axes happens to be current. Title the figure itself instead;
# y > 1 lifts the text clear of the top dendrogram.
plt.suptitle('CORRALATION BETWEEN FEATURES', y=1.02)
plt.show()

In [None]:
# Keep only the columns whose absolute correlation with the target exceeds
# the threshold. 'target' itself always qualifies (self-correlation is 1),
# so it stays in corr_features.
threshold = 0.75
filtre = np.abs(corr_matrix['target']) > threshold
corr_features = corr_matrix.columns[filtre].tolist()
sns.clustermap(data[corr_features].corr(), annot=True, fmt='.2f', figsize=(10, 10))
# Figure-level title: plt.title() would attach to whichever clustermap
# sub-axes is current rather than to the figure as a whole.
plt.suptitle('CORRALATION BETWEEN FEATURES with thershold', y=1.02)
plt.show()

In [None]:
# Reshape to long format (one row per sample/feature pair) so every feature
# can be drawn as a box split by class on a single axis.
data_melted = data.melt(id_vars='target', var_name='features', value_name='value')
plt.figure(figsize=(10, 6))
sns.boxplot(data=data_melted, x='features', y='value', hue='target')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pairwise plots of the columns selected by the correlation threshold.
# corr_features contains 'target', which is what makes hue='target' valid.
sns.pairplot(
    data=data[corr_features],
    hue='target',
    diag_kind='kde',
    markers='+',
)
plt.show()

## Local Outlier Factor

In [None]:
# Separate labels from features, then run Local Outlier Factor with its
# default settings on the raw feature matrix.
y = data['target']
x = data.drop(columns='target')
columns = list(x.columns)
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(x)  # -1 marks the rows flagged as outliers


In [None]:
# Per-row LOF score; rows with lower (more negative) scores are the ones
# treated as outliers by the threshold applied later.
X_score = clf.negative_outlier_factor_
outlier_score = pd.DataFrame({'score': X_score})


In [None]:
# Min-max rescale the scores so the lowest score maps to radius 1 and the
# highest to 0; the radius drives the marker size in the outlier plot.
score_hi = X_score.max()
radius = (score_hi - X_score) / (score_hi - X_score.min())
outlier_score['radius'] = radius
outlier_score.head()

In [None]:
# Rows whose LOF score falls below this cut-off are treated as outliers.
thresholdd = -2.5
filtr = outlier_score['score'] < thresholdd
outlier_index = outlier_score.loc[filtr].index.tolist()

# The first two feature columns are used as plot coordinates.
feat0 = x.iloc[:, 0]
feat1 = x.iloc[:, 1]

# Red rings sized by outlier radius, black dots for every sample, and blue
# markers on the rows that crossed the cut-off.
plt.scatter(feat0, feat1, s=1000*radius, edgecolors='r', facecolors='none', label='outlier scores')
plt.scatter(feat0, feat1, color='k', s=3, label='data points')
plt.scatter(feat0.iloc[outlier_index], feat1.iloc[outlier_index], s=50, color='b', label='outliers')
plt.legend()
plt.show()

In [None]:
# Remove the flagged rows from features and labels alike; the labels become
# a plain NumPy array from this point on.
x = x.drop(index=outlier_index)
y = y.drop(index=outlier_index).to_numpy()

## Train Test Split


In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardisation: learn mean/scale from the training rows only, then apply
# the same transform to the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Labelled frame of the scaled training data, used for plotting.
X_train_df = pd.DataFrame(X_train, columns=columns).assign(target=Y_train)


In [None]:
# Long-format box plot per feature and class, drawn on the standardised
# training set so all features share a comparable scale.
data_melted = X_train_df.melt(id_vars='target', var_name='features', value_name='value')
plt.figure(figsize=(15, 8))
sns.boxplot(data=data_melted, x='features', y='value', hue='target')
plt.xticks(rotation=90)
plt.show()


## K-Nearest Neighbors (KNN)

In [None]:
# Baseline model: 2-nearest-neighbour classifier on the standardised features.
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, Y_train)
Y_pr = knn.predict(X_test)

cm = confusion_matrix(Y_test, Y_pr)
acc = accuracy_score(Y_test, Y_pr)
score = knn.score(X_test, Y_test)  # estimator-side accuracy on the same data

print('accuracy: ', acc)
print('score : ', score)

# Confusion matrix heat map (rows: true label, columns: predicted label).
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='.1f',
    cmap="Greens",
    linewidths=0.01,
    linecolor="gray",
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

### KNN best parameters


In [None]:
def KNN_Best_Params(x_train,x_test,y_train,y_test):
    """Grid-search KNN hyper-parameters and report train/test performance.

    Searches n_neighbors in 1..30 and weights in {'uniform', 'distance'} with
    10-fold cross-validation scored by accuracy, then evaluates the winning
    configuration on both the training and the test data.

    Parameters
    ----------
    x_train, y_train : training features and labels used for the search.
    x_test, y_test : held-out features and labels used only for reporting.

    Returns
    -------
    The fitted GridSearchCV object (its ``best_estimator_`` is already refit
    on all of ``x_train``).

    Side effects: prints the best CV score/parameters, the test and train
    accuracy, and both confusion matrices.
    """
    param_grid = {
        'n_neighbors': list(range(1, 31)),
        'weights': ['uniform', 'distance'],
    }
    print()
    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
    grid.fit(x_train, y_train)
    print('best training score {} with parameters {}'.format(grid.best_score_,grid.best_params_))
    print()

    # With the default refit=True, GridSearchCV has already refit the winning
    # configuration on the whole of x_train; reuse that estimator instead of
    # constructing and fitting an identical one a second time.
    knn = grid.best_estimator_

    y_pred_test = knn.predict(x_test)
    y_pred_train = knn.predict(x_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    print('TEST SCORE : {} TRAİN SCORE : {}'.format(acc_test,acc_train))
    print()
    print('cm test : ',cm_test)
    print('cm train : ',cm_train)
    return grid
grid=KNN_Best_Params(X_train,X_test,Y_train,Y_test)

## Principal Component Analysis (PCA)


In [None]:
# PCA is unsupervised: standardise every retained row and project onto the
# first two principal components.
# NOTE(review): the scaler and PCA are fit on all rows, before the split
# performed in the next cell — confirm this is acceptable for the evaluation.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

pca = PCA(n_components=2).fit(x_scaled)
x_reduced_pca = pca.transform(x_scaled)

pca_data = pd.DataFrame(x_reduced_pca, columns=['p1', 'p2']).assign(target=y)
sns.scatterplot(data=pca_data, x='p1', y='p2', hue='target')
plt.title('PCA: p1 vs p2')
plt.show()

In [None]:
# 70/30 split (seed 42) of the 2-D PCA projection, followed by the KNN grid
# search on that split.
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(
    x_reduced_pca,
    y,
    test_size=0.3,
    random_state=42,
)
grid_pca = KNN_Best_Params(X_train_pca, X_test_pca, Y_train_pca, Y_test_pca)

In [None]:
# Light colours shade the predicted regions; bold colours mark the samples.
cmap_ligth = ListedColormap(['orange', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

# Evaluation grid over the PCA plane, padded by one unit on every side.
h = 0.5  # grid step
X = x_reduced_pca
x_min, y_min = X[:, 0].min() - 1, X[:, 1].min() - 1
x_max, y_max = X[:, 0].max() + 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every grid node with the tuned KNN, then fold back to grid shape.
Z = grid_pca.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

plt.figure(figsize=(18, 10))
plt.pcolormesh(xx, yy, Z, cmap=cmap_ligth)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

best_knn_pca = grid_pca.best_estimator_
plt.title('%i-class classification (k=%i, weights=%s)'%(len(np.unique(y)), best_knn_pca.n_neighbors, best_knn_pca.weights))
plt.show()

## Neighborhood Components Analysis (NCA)

In [None]:
# NCA is supervised, so the projection is learned from the training rows only;
# fitting on every row would let the labels of the later test set shape the
# embedding and inflate the reported test accuracy.
# The index split uses the same sample count, test_size and seed as the
# train_test_split applied to x_reduced_nca in the following cell, so it
# selects exactly the rows that end up in that training set.
nca_train_idx, _ = train_test_split(np.arange(len(x_scaled)), test_size=0.3, random_state=42)

nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(x_scaled[nca_train_idx], y[nca_train_idx])

# Project all rows (train and test) with the transform learned on train.
x_reduced_nca = nca.transform(x_scaled)
nca_data = pd.DataFrame(x_reduced_nca, columns=['p1', 'p2'])
nca_data['target'] = y
sns.scatterplot(x='p1', y='p2', hue='target', data=nca_data)
plt.title('NCA : p1 vs p2')
plt.show()

In [None]:
# 70/30 split (seed 42) of the 2-D NCA projection, followed by the KNN grid
# search on that split.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(
    x_reduced_nca,
    y,
    test_size=0.3,
    random_state=42,
)
grid_nca = KNN_Best_Params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

In [None]:
# Region colours (light) and sample colours (bold) for the two classes.
cmap_ligth = ListedColormap(['orange', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

# Regular grid over the NCA plane with a one-unit margin around the data.
h = 0.5  # grid step
X = x_reduced_nca
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Predict a class for every grid node and restore the grid layout.
grid_nodes = np.c_[xx.ravel(), yy.ravel()]
Z = grid_nca.predict(grid_nodes)
Z = Z.reshape(xx.shape)

plt.figure(figsize=(18, 10))
plt.pcolormesh(xx, yy, Z, cmap=cmap_ligth)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

n_classes = len(np.unique(y))
best_knn_nca = grid_nca.best_estimator_
plt.title('%i-class classification (k=%i, weights=%s)'%(n_classes, best_knn_nca.n_neighbors, best_knn_nca.weights))
plt.show()

## Conclusion

In [None]:

# Final model: KNN with the best parameters found on the NCA embedding.
knn = KNeighborsClassifier(**grid_nca.best_params_)
knn.fit(X_train_nca, Y_train_nca)
y_pred_nca = knn.predict(X_test_nca)

# accuracy_score takes (y_true, y_pred). Print the value so the cell actually
# reports it; a bare expression in the middle of a cell is discarded.
acc_test_nca = accuracy_score(Y_test_nca, y_pred_nca)
print('NCA + KNN test accuracy: ', acc_test_nca)

# Test-set coordinates together with predicted and true labels.
test_data = pd.DataFrame()
test_data['X_test_nca_p1'] = X_test_nca[:, 0]
test_data['X_test_nca_p2'] = X_test_nca[:, 1]
test_data['y_pred_nca'] = y_pred_nca
test_data['Y_test_nca'] = Y_test_nca

plt.figure(figsize=(7, 7))
sns.scatterplot(x='X_test_nca_p1', y='X_test_nca_p2', hue='Y_test_nca', data=test_data)

# Positions of the misclassified test samples, highlighted with large red
# translucent markers on top of the scatter.
diff = np.where(y_pred_nca != Y_test_nca)[0]
plt.scatter(test_data.iloc[diff, 0], test_data.iloc[diff, 1], label='wrong classified', marker='o', color='r', s=400, alpha=0.5)
plt.legend()
plt.title('Conclusion')
plt.show()
