In [80]:
import numpy as np
import pandas as pd

#Veri Görselleştirme
import matplotlib.pyplot as plt 
import seaborn as sns

# Algoritmalar
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

#Train ve Test Setlerinin ayrılması
from sklearn.model_selection import train_test_split

#Değerlendirme Metrikleri
#Sınıflandırma metrikleri
from sklearn.metrics import accuracy_score,classification_report
#Regresyon Metrikleri
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#Cross-validation ve en iyi parametrelerin seçimi
from sklearn.model_selection import GridSearchCV

# Gereksiz uyarıların görüntülenmemesi için kullanıyoruz
import warnings
warnings.filterwarnings('ignore')

# KNN ile Sınıflandırma

In [None]:
df = pd.read_csv('./Data/breast-cancer-wisconsin.csv')
df

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [18]:
df.drop(columns=['Unnamed: 32'], inplace = True)

In [None]:
df.describe().T

In [None]:
df.nunique()

In [21]:
df.drop(columns=['id'],inplace=True)

In [22]:
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=42)

In [24]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_clf = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [i for i in range(1,15)],
    'metric': ['euclidean','minkowski','manhattan','cosine']
}

In [None]:
knn_clf_cv = GridSearchCV(knn_clf,param_grid=knn_params,scoring='accuracy',verbose=3)
knn_clf_cv.fit(X_train,y_train)

In [26]:
knn_clf_cv.best_params_

{'metric': 'manhattan', 'n_neighbors': 1}

In [28]:
y_pred = knn_clf_cv.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           B       0.92      0.97      0.95        71
           M       0.95      0.86      0.90        43

    accuracy                           0.93       114
   macro avg       0.93      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114



In [29]:
#Modelimiz veri setini ezberlemiş durumda
y_pred_train = knn_clf_cv.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           B       1.00      1.00      1.00       286
           M       1.00      1.00      1.00       169

    accuracy                           1.00       455
   macro avg       1.00      1.00      1.00       455
weighted avg       1.00      1.00      1.00       455



In [30]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_clf = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [i for i in range(3,15)],
    'metric': ['euclidean','minkowski','manhattan','cosine']
}

In [31]:
knn_clf_cv = GridSearchCV(knn_clf,param_grid=knn_params,scoring='accuracy',verbose=3)
knn_clf_cv.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.945 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3;, score=0.879 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=3;, score=0.978 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=4;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=4;, score=0.890 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=4;, score=0.967 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=4;, score=0.890 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=4;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=5;, score=0.934 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=5

In [32]:
knn_clf_cv.best_params_

{'metric': 'manhattan', 'n_neighbors': 5}

In [33]:
knn_clf_cv.best_score_

0.9362637362637363

In [34]:
y_pred = knn_clf_cv.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           B       0.93      0.99      0.96        71
           M       0.97      0.88      0.93        43

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [35]:
y_pred_train = knn_clf_cv.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           B       0.94      0.98      0.96       286
           M       0.97      0.89      0.93       169

    accuracy                           0.95       455
   macro avg       0.95      0.94      0.94       455
weighted avg       0.95      0.95      0.95       455



In [36]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_clf = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [i for i in range(3,15)],
    'metric': ['euclidean','minkowski','cosine']
}

In [37]:
knn_clf_cv = GridSearchCV(knn_clf,param_grid=knn_params,scoring='accuracy',verbose=3)
knn_clf_cv.fit(X_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.945 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3;, score=0.879 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=3;, score=0.978 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=4;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=4;, score=0.890 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=4;, score=0.967 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=4;, score=0.890 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=4;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=5;, score=0.934 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=5

In [38]:
knn_clf_cv.best_params_

{'metric': 'euclidean', 'n_neighbors': 9}

In [39]:
knn_clf_cv.best_score_

0.9274725274725274

In [40]:
y_pred = knn_clf_cv.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           B       0.95      0.99      0.97        71
           M       0.97      0.91      0.94        43

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [41]:
y_pred_train = knn_clf_cv.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           B       0.93      0.97      0.95       286
           M       0.94      0.87      0.90       169

    accuracy                           0.93       455
   macro avg       0.93      0.92      0.93       455
weighted avg       0.93      0.93      0.93       455



In [42]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_clf = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [i for i in range(2,15)],
    'metric': ['euclidean','minkowski','manhattan','cosine']
}

In [43]:
knn_clf_cv = GridSearchCV(knn_clf,param_grid=knn_params,scoring='accuracy',verbose=3)
knn_clf_cv.fit(X_train,y_train)

Fitting 5 folds for each of 52 candidates, totalling 260 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.901 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.890 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.945 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3;, score=0.879 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=3;, score=0.978 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=4;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=4

In [44]:
knn_clf_cv.best_params_

{'metric': 'manhattan', 'n_neighbors': 5}

In [45]:
knn_clf_cv.best_score_

0.9362637362637363

In [46]:
y_pred = knn_clf_cv.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           B       0.93      0.99      0.96        71
           M       0.97      0.88      0.93        43

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [47]:
y_pred_train = knn_clf_cv.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           B       0.94      0.98      0.96       286
           M       0.97      0.89      0.93       169

    accuracy                           0.95       455
   macro avg       0.95      0.94      0.94       455
weighted avg       0.95      0.95      0.95       455



In [48]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_clf = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [i for i in range(2,15)],
    'metric': ['euclidean','minkowski','cosine']
}

In [49]:
knn_clf_cv = GridSearchCV(knn_clf,param_grid=knn_params,scoring='accuracy',verbose=3)
knn_clf_cv.fit(X_train,y_train)

Fitting 5 folds for each of 39 candidates, totalling 195 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.901 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.890 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.945 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3;, score=0.879 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=3;, score=0.978 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=3;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=4;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=4

In [50]:
knn_clf_cv.best_params_

{'metric': 'euclidean', 'n_neighbors': 9}

In [51]:
knn_clf_cv.best_score_

0.9274725274725274

In [52]:
y_pred = knn_clf_cv.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           B       0.95      0.99      0.97        71
           M       0.97      0.91      0.94        43

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [53]:
y_pred_train = knn_clf_cv.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           B       0.93      0.97      0.95       286
           M       0.94      0.87      0.90       169

    accuracy                           0.93       455
   macro avg       0.93      0.92      0.93       455
weighted avg       0.93      0.93      0.93       455



In [54]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_clf = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [i for i in range(1,15)],
    'metric': ['euclidean','minkowski','cosine']
}

In [55]:
knn_clf_cv = GridSearchCV(knn_clf,param_grid=knn_params,scoring='accuracy',verbose=3)
knn_clf_cv.fit(X_train,y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=1;, score=0.945 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=1;, score=0.901 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=1;, score=0.934 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=1;, score=0.890 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=1;, score=0.901 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.901 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.901 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.890 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.945 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3

In [56]:
knn_clf_cv.best_params_

{'metric': 'euclidean', 'n_neighbors': 9}

In [57]:
knn_clf_cv.best_score_

0.9274725274725274

In [58]:
y_pred = knn_clf_cv.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           B       0.95      0.99      0.97        71
           M       0.97      0.91      0.94        43

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [59]:
y_pred_train = knn_clf_cv.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           B       0.93      0.97      0.95       286
           M       0.94      0.87      0.90       169

    accuracy                           0.93       455
   macro avg       0.93      0.92      0.93       455
weighted avg       0.93      0.93      0.93       455



SONUÇ:
metrik=> manhattan, k => (1) 1-15 olduğunda veri setimiz ezberliyor
k değer aralığı 2-15'e alındığında , {'metric': 'manhattan', 'n_neighbors': 5}   95 , 95 cv=> 93.6
manhattan metriklerden çıkarıldığında {'metric': 'euclidean', 'n_neighbors': 9}  93 96  cv=>92.7
manhattan çıkarılıp, k 1-15 aralığında {'metric': 'euclidean', 'n_neighbors': 9}

# KNN ile Regresyon

In [None]:
df2 = pd.read_csv('./Data/insurance.csv')
df2

In [None]:
df2.info()

In [None]:
df2.describe().T

In [None]:
df2.isnull().sum()

In [66]:
df2_encoded = pd.get_dummies(df2,columns=['sex','smoker','region'],dtype=np.int64)

In [67]:
df2_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   bmi               1338 non-null   float64
 2   children          1338 non-null   int64  
 3   charges           1338 non-null   float64
 4   sex_female        1338 non-null   int64  
 5   sex_male          1338 non-null   int64  
 6   smoker_no         1338 non-null   int64  
 7   smoker_yes        1338 non-null   int64  
 8   region_northeast  1338 non-null   int64  
 9   region_northwest  1338 non-null   int64  
 10  region_southeast  1338 non-null   int64  
 11  region_southwest  1338 non-null   int64  
dtypes: float64(2), int64(10)
memory usage: 125.6 KB


In [69]:
X2 = df2_encoded.drop(columns=['charges'])
y2 = df2_encoded['charges']

In [71]:
X2_train,X2_test,y2_train,y2_test = train_test_split(X2,y2,train_size=0.8,random_state=42)

In [72]:
# knn için sıklıkla kullanılan parametreler => n_neighbors, metric 
knn_reg = KNeighborsRegressor()
knn_reg_params = {
    'n_neighbors': [i for i in range(1,15)],
    'metric': ['euclidean','minkowski','manhattan','cosine']
}

In [75]:
knn_reg_cv = GridSearchCV(knn_reg,param_grid=knn_reg_params,verbose=3)
knn_reg_cv.fit(X2_train,y2_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV 1/5] END ..metric=euclidean, n_neighbors=1;, score=-0.118 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=1;, score=0.064 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=1;, score=0.216 total time=   0.0s
[CV 4/5] END ..metric=euclidean, n_neighbors=1;, score=-0.037 total time=   0.0s
[CV 5/5] END ..metric=euclidean, n_neighbors=1;, score=-0.045 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.107 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.229 total time=   0.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.340 total time=   0.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.131 total time=   0.0s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.119 total time=   0.0s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.164 total time=   0.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3

In [76]:
knn_reg_cv.best_params_

{'metric': 'cosine', 'n_neighbors': 4}

In [82]:
y2_pred = knn_reg_cv.predict(X2_test)
print("MAE:",mean_absolute_error(y2_test,y2_pred)) 
print("MSE:",mean_squared_error(y2_test,y2_pred))
print("RMSE:",np.sqrt(mean_squared_error(y2_test,y2_pred)))
print("R2 Score:",r2_score(y2_test,y2_pred))

MAE: 5004.711328466417
MSE: 60488966.737731494
RMSE: 7777.465315752394
R2 Score: 0.6103738423077759


In [83]:
y2_pred_train = knn_reg_cv.predict(X2_train)
print("MAE:",mean_absolute_error(y2_train,y2_pred_train)) 
print("MSE:",mean_squared_error(y2_train,y2_pred_train))
print("RMSE:",np.sqrt(mean_squared_error(y2_train,y2_pred_train)))
print("R2 Score:",r2_score(y2_train,y2_pred_train))

MAE: 4037.2948398808408
MSE: 40147832.594121106
RMSE: 6336.231734565988
R2 Score: 0.7218400532294775
