In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Modeller
from sklearn.svm import SVC,SVR
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor

#Hata Metrikleri
from sklearn.metrics import accuracy_score,classification_report ,mean_absolute_error,mean_squared_error,r2_score

#Cross-validation ve en iyi parametrelerin seçimi
from sklearn.model_selection import GridSearchCV,train_test_split

#Kategorik Değişken Dönüşümü
from sklearn.preprocessing import LabelEncoder

# Gereksiz uyarıların görüntülenmemesi için kullanıyoruz
import warnings
warnings.filterwarnings('ignore')



# Topluluk Öğrenimi ile Sınıflandırma

In [2]:
df = pd.read_csv('./Data/breast-cancer-wisconsin.csv')
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [3]:
df.drop(columns=['id','Unnamed: 32'],inplace=True)
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=42)

In [23]:
knn_clf = KNeighborsClassifier()
knn_clf_params = {
    'n_neighbors': [i for i in range(1,15)],
    'metric': ['euclidean','minkowski','manhattan','cosine']
}

In [24]:
svc_clf = SVC(probability=True)
svc_params = {
    'kernel' :['linear','poly','rbf'],
    'C' : [100, 10, 1.0, 0.1, 0.001]
}

In [25]:
dtree_clf = DecisionTreeClassifier()
dtree_clf_params = {
    'max_depth': [3,4,5,7,9],
    'min_samples_split':[2,5,8,10,20],
    'min_samples_leaf': [1,2,3,4],
    'criterion' : ['gini','entropy']
}

In [27]:
knn_clf_grid = GridSearchCV(knn_clf,knn_clf_params,cv=5,verbose=1)
svc_clf_grid = GridSearchCV(svc_clf,svc_params,cv=5,verbose=2,n_jobs=-1)
dtree_clf_grid = GridSearchCV(dtree_clf,dtree_clf_params,cv=5,verbose=3)

In [28]:
ensemble_classifier = VotingClassifier(
   [('knn',knn_clf_grid),
    ('svc',svc_clf_grid),
    ('dtree',dtree_clf_grid)],voting='soft')

In [29]:
ensemble_classifier.fit(X_train,y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.912 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.901 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.934 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.934 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.934 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.923 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.901 total time=   0.0s
[CV 3/5]

In [30]:
y_pred = ensemble_classifier.predict(X_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           B       0.99      0.93      0.96        75
           M       0.88      0.97      0.93        39

    accuracy                           0.95       114
   macro avg       0.93      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114



In [34]:
y_pred_tr = ensemble_classifier.predict(X_train)
print(classification_report(y_pred_tr,y_train))

              precision    recall  f1-score   support

           B       1.00      1.00      1.00       286
           M       1.00      1.00      1.00       169

    accuracy                           1.00       455
   macro avg       1.00      1.00      1.00       455
weighted avg       1.00      1.00      1.00       455



# Topluluk Öğrenimi ile Regresyon

In [35]:
df2 = pd.read_csv('./Data/insurance.csv')
df2

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [36]:
df2_encoded = pd.get_dummies(df2,columns=['sex','smoker','region'],dtype=np.int64)
X2 = df2_encoded.drop(columns=['charges'])
y2 = df2_encoded['charges']
X2_train,X2_test,y2_train,y2_test = train_test_split(X2,y2,train_size=0.8,random_state=42)

In [37]:
knn_reg = KNeighborsRegressor()
knn_reg_params = {
    'n_neighbors': [i for i in range(1,15)],
    'metric': ['euclidean','minkowski','manhattan','cosine']
}

In [38]:
svr_reg = SVR()
svr_params = {
    'kernel' :['linear','poly','rbf'],
    'C' : [100, 10, 1.0, 0.1, 0.001]
}

In [48]:
dtree_reg = DecisionTreeRegressor()
dtree_reg_params = {
    'max_depth': [3,4,5,7,9],
    'min_samples_split':[2,5,8,10,20],
    'min_samples_leaf': [1,2,3,4],
    'criterion' : ["squared_error", "friedman_mse", "absolute_error","poisson"]
}

In [49]:
knn_reg_grid = GridSearchCV(knn_reg,knn_reg_params,cv=5,verbose=1)
svr_reg_grid = GridSearchCV(svr_reg,svr_params,cv=5,verbose=2,n_jobs=-1)
dtree_reg_grid = GridSearchCV(dtree_reg,dtree_reg_params,cv=5,verbose=3)

In [50]:
voting_reg = VotingRegressor(
   [('knn',knn_reg_grid),
    ('svr',svr_reg_grid),
    ('dtree',dtree_reg_grid)],verbose=3)

In [51]:
voting_reg.fit(X2_train,y2_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Voting] ...................... (1 of 3) Processing knn, total=   1.5s
Fitting 5 folds for each of 15 candidates, totalling 75 fits
[Voting] ...................... (2 of 3) Processing svr, total=   1.0s
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
[CV 1/5] END criterion=squared_error, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.821 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.903 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.826 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.795 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.833 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=3, 

In [53]:
y2_pred = voting_reg.predict(X2_test)
print(np.sqrt(mean_squared_error(y2_pred,y2_test)))

5852.681764434521


In [54]:
y2_pred_tr = voting_reg.predict(X2_train)
print(np.sqrt(mean_squared_error(y2_pred_tr,y2_train)))

5482.909811354785


SVR => yaklaşık 7800 hata
KNN => yaklaşık 6300 hata
DTREE => Yaklaşık 4500 hata

Karar Ağacı dışındaki modellerin hataları çok daha yüksek olduğu için model DTree'den daha kötü sonuç verdi. Bu durum bize model seçerken dikkatli olmamız gerektiğini gösteriyor