In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset from the CSV file that sits next to the notebook
# and preview its first five rows.
csv_path = "diabetes.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

In [None]:
# Work on a copy so the frame loaded from the CSV stays untouched.
df_copy = df.copy()

# In these measurement columns a value of 0 is treated as "missing":
# replace it with NaN so the next cell can impute it.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
df_copy[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)


In [None]:
# Impute the NaNs with a per-column statistic: mean for Glucose,
# BloodPressure and BMI; median for SkinThickness and Insulin.
#
# A single DataFrame.fillna with a column -> value mapping replaces the
# former `df_copy[col].fillna(..., inplace=True)` calls. Those are chained
# assignments on a temporary Series: they emit a FutureWarning on pandas 2.x
# and leave df_copy unchanged once Copy-on-Write is enabled.
fill_values = {
    "Glucose": df_copy["Glucose"].mean(),
    "BloodPressure": df_copy["BloodPressure"].mean(),
    "SkinThickness": df_copy["SkinThickness"].median(),
    "Insulin": df_copy["Insulin"].median(),
    "BMI": df_copy["BMI"].mean(),
}
df_copy = df_copy.fillna(value=fill_values)

In [None]:
# Box plots of eight feature columns of df_copy, one per panel of a 3x3 grid
# (panel order is the order of this list).
raw_boxplot_columns = [
    'Glucose', 'BloodPressure', 'Insulin', 'BMI',
    'Age', 'SkinThickness', 'Pregnancies', 'DiabetesPedigreeFunction',
]

plt.figure(figsize=(16, 12))
sns.set_style(style='whitegrid')
for panel, feature in enumerate(raw_boxplot_columns, start=1):
    plt.subplot(3, 3, panel)
    sns.boxplot(x=feature, data=df_copy)

In [None]:
# Separate the feature matrix from the target column "Outcome".
y = df_copy["Outcome"]
X = df_copy.drop(columns=["Outcome"])

In [None]:
# Independent (deep) copy of the features, used as the input of the scaling steps.
X_copy = X.copy(deep=True)
X_copy

In [None]:
# Split the features into two groups, one per scaler:
# X_ss goes through StandardScaler, X_qt through QuantileTransformer.
standard_scaled_cols = ["Glucose", "BMI"]
quantile_scaled_cols = [
    'Pregnancies',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'DiabetesPedigreeFunction',
    'Age',
]

X_ss = X_copy[standard_scaled_cols]
X_qt = X_copy[quantile_scaled_cols]

In [None]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler


In [None]:
# NOTE(review): both transformers are fitted on the complete feature set,
# i.e. before the train/test split done further down. Confirm that using
# test rows to estimate the scaling statistics is acceptable here.

# Standardise the X_ss columns.
ss = StandardScaler()
ss.fit(X_ss)
X_scaler = ss.transform(X_ss)

# Quantile-transform the X_qt columns (seeded for repeatable results).
quantile_transformer = QuantileTransformer(random_state=0)
quantile_transformer.fit(X_qt)
X_trans = quantile_transformer.transform(X_qt)

In [None]:
# Wrap the transformed arrays in DataFrames again, restoring the column names
# (the transformers' outputs are passed in without labels).
ss_column_names = ["Glucose", "BMI"]
qt_column_names = [
    'Pregnancies',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'DiabetesPedigreeFunction',
    'Age',
]

df_ss = pd.DataFrame(data=X_scaler, columns=ss_column_names)
df_qt = pd.DataFrame(data=X_trans, columns=qt_column_names)

In [None]:
# Recombine both scaled groups into one feature frame, matching rows on the index.
X_new = df_qt.join(other=df_ss, how='inner')
X_new.head(n=5)

In [None]:
# Same 3x3 grid of box plots as for df_copy, now drawn from the scaled frame X_new.
scaled_boxplot_columns = [
    'Glucose', 'BloodPressure', 'Insulin', 'BMI',
    'Age', 'SkinThickness', 'Pregnancies', 'DiabetesPedigreeFunction',
]

plt.figure(figsize=(16, 12))
sns.set_style(style='whitegrid')
for panel, feature in enumerate(scaled_boxplot_columns, start=1):
    plt.subplot(3, 3, panel)
    sns.boxplot(x=feature, data=X_new)

In [None]:
# Box plots for a subset of five scaled features; each column is handed to
# seaborn as a Series rather than by name.
subset_boxplot_columns = ['Glucose', 'BMI', 'Pregnancies', 'Age', 'SkinThickness']

plt.figure(figsize=(16, 12))
sns.set_style(style='whitegrid')
for panel, feature in enumerate(subset_boxplot_columns, start=1):
    plt.subplot(3, 3, panel)
    sns.boxplot(x=X_new[feature], data=X_new)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30% of the scaled data for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline k-NN classifier with default settings.
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# Accuracy on the held-out set (displayed as the cell output).
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

# Text summary of the k-NN predictions held in y_pred.
print("Classification Report is:\n",classification_report(y_test,y_pred))
print("\n F1:\n",f1_score(y_test,y_pred))
print("\n Precision score is:\n",precision_score(y_test,y_pred))
print("\n Recall score is:\n",recall_score(y_test,y_pred))
print("\n Confusion Matrix:\n")

# Compute the matrix once and reuse it for both the printout and the heatmap.
knn_conf_matrix = confusion_matrix(y_test, y_pred)
print(knn_conf_matrix)

plt.figure(figsize=(15,8))
# fmt="d" prints the counts as plain integers instead of the default '.2g' format.
sns.heatmap(knn_conf_matrix, annot=True, fmt="d")
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predictions on the columns, so the heatmap's y-axis is the actual class and
# its x-axis the predicted class (the labels used to be swapped).
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()

In [None]:
# Fit one k-NN model for every k in 1..14 and record its score on the
# training and on the test split.
train_scores = []
test_scores = []

for n_neighbors in range(1, 15):
    # fit() returns the estimator itself, so `knn` ends up bound to the fitted model.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# Best training score and every k that reaches it.
max_train_score = max(train_scores)
train_scores_ind = [
    position
    for position, score in enumerate(train_scores)
    if score == max_train_score
]
# List positions are 0-based while k started at 1, hence the +1.
best_train_k = [position + 1 for position in train_scores_ind]
print('Max train score {} % and k = {}'.format(max_train_score*100, best_train_k))

In [None]:
# Best test score and every k that reaches it.
max_test_score = max(test_scores)
test_scores_ind = [
    position
    for position, score in enumerate(test_scores)
    if score == max_test_score
]
# List positions are 0-based while k started at 1, hence the +1.
best_test_k = [position + 1 for position in test_scores_ind]
print('Max test score {} % and k = {}'.format(max_test_score*100, best_test_k))

In [None]:
# Train vs. test score for each k tried in the sweep.
plt.figure(figsize=(12,5))

# One x value per recorded score (k started at 1), derived from the list
# length so the plot stays in sync with the sweep range.
k_values = list(range(1, len(train_scores) + 1))

# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so lineplot(range(...), scores) raises TypeError there.
p = sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Score')
p.set(xlabel='k (n_neighbors)', ylabel='Score');

## Support Vector Machine

In [None]:
# Re-split for the models below: 70/30 with the same seed as the k-NN split.
# NOTE(review): this uses X (unscaled) instead of X_new (scaled) - confirm
# that feeding unscaled features to the SVM is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
from sklearn.svm import SVC

# Baseline SVM with a linear kernel and otherwise default settings.
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

In [None]:
# Test-set accuracy of the baseline linear SVM predictions.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Unfitted SVC with default hyperparameters; it is the base estimator handed
# to the grid search further down.
svm= SVC()

In [None]:
# Hyperparameter grid explored for the SVM.
#
# C controls how the SVM tolerates classification error: the larger C is,
# the harder the SVM works to reduce classification error.
# The larger gamma is, the more weight a single training example carries when
# classifying the other examples, which can make the model overfit the
# training data.
c_grid = np.arange(1, 10)
gamma_grid = (0.001, 0.01, 0.1)
kernel_grid = ["linear", "rbf"]

svm_params = {
    "C": c_grid,
    "gamma": gamma_grid,
    "kernel": kernel_grid,
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over svm_params, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 runs the fits in parallel on all cores.
svm_cv = GridSearchCV(
    estimator=svm,
    param_grid=svm_params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the SVM grid search on the training split and report the winning combination.
# fit() returns the search object itself, so the attribute can be read off the call.
best_params = svm_cv.fit(X_train, y_train).best_params_
print(f"Best params: {best_params}")

In [None]:
# Build the tuned SVM from whatever the grid search actually selected,
# instead of hard-coding one particular result (C=8, gamma=0.001, linear):
# hard-coded values silently go stale when the data or the grid change.
# Reading svm_cv.best_params_ directly also avoids the shared `best_params`
# name, which the decision-tree cell rebinds.
svc_tuned = SVC(**svm_cv.best_params_).fit(X_train, y_train)
y_pred_tuned = svc_tuned.predict(X_test)

# Test-set accuracy of the tuned SVM (displayed as the cell output).
accuracy_score(y_test, y_pred_tuned)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

# Text summary of the tuned SVM predictions held in y_pred_tuned.
print("Classification Report is:\n",classification_report(y_test,y_pred_tuned))
print("\n F1:\n",f1_score(y_test,y_pred_tuned))
print("\n Precision score is:\n",precision_score(y_test,y_pred_tuned))
print("\n Recall score is:\n",recall_score(y_test,y_pred_tuned))
print("\n Confusion Matrix:\n")

# Compute the matrix once and reuse it for both the printout and the heatmap.
svm_conf_matrix = confusion_matrix(y_test, y_pred_tuned)
print(svm_conf_matrix)

plt.figure(figsize=(15,8))
# fmt="d" prints the counts as plain integers instead of the default '.2g' format.
sns.heatmap(svm_conf_matrix, annot=True, fmt="d")
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predictions on the columns, so the heatmap's y-axis is the actual class and
# its x-axis the predicted class (the labels used to be swapped).
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters and a fixed seed.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

# Test-set accuracy of the baseline tree (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Hyperparameter grid explored for the decision tree.
depth_grid = list(range(1, 10))
leaf_size_grid = list(range(1, 10))

params = {
    # criterion: the measure used to split the tree's nodes.
    "criterion": ("entropy", "gini"),
    # splitter: the strategy used to split the tree's nodes.
    "splitter": ("best", "random"),
    # max_depth: the maximum depth of the tree to be built.
    "max_depth": depth_grid,
    # min_samples_split: the minimum number of samples required to split a node.
    "min_samples_split": [2, 3, 4],
    # min_samples_leaf: the minimum number of samples required to form a leaf node.
    "min_samples_leaf": leaf_size_grid,
}

In [None]:
# Grid-search the decision tree over `params`, scored by accuracy with
# 5-fold cross-validation; n_jobs=-1 runs the fits in parallel on all cores.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(
    estimator=tree_clf,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# fit() returns the search object itself, so the attribute can be read off the call.
best_params = tree_cv.fit(X_train, y_train).best_params_

In [None]:
# Show the hyperparameter combination selected by the decision-tree grid search.
best_params

In [None]:
# Rebuild the tree from the grid-search winner.
# - random_state=42 matches the estimator that was tuned (tree_clf); without
#   it the refit is not reproducible, most visibly when splitter="random"
#   is selected.
# - tree_cv.best_params_ is read directly because the `best_params` name is
#   also assigned by the SVM cell, so it can hold SVM parameters when cells
#   are run out of order.
tree_tuned = DecisionTreeClassifier(random_state=42, **tree_cv.best_params_)
tree_tuned.fit(X_train, y_train)
y_pred = tree_tuned.predict(X_test)

# Test-set accuracy of the tuned tree (displayed as the cell output).
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

# Text summary of the tuned decision-tree predictions held in y_pred.
print("Classification Report is:\n",classification_report(y_test,y_pred))
print("\n F1:\n",f1_score(y_test,y_pred))
print("\n Precision score is:\n",precision_score(y_test,y_pred))
print("\n Recall score is:\n",recall_score(y_test,y_pred))
print("\n Confusion Matrix:\n")

# Compute the matrix once and reuse it for both the printout and the heatmap.
tree_conf_matrix = confusion_matrix(y_test, y_pred)
print(tree_conf_matrix)

plt.figure(figsize=(15,8))
# fmt="d" prints the counts as plain integers instead of the default '.2g' format.
sns.heatmap(tree_conf_matrix, annot=True, fmt="d")
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predictions on the columns, so the heatmap's y-axis is the actual class and
# its x-axis the predicted class (the labels used to be swapped).
plt.ylabel('Actual',fontsize=13)
plt.xlabel('Prediction',fontsize=13)
plt.title('Confusion Matrix',fontsize=17)
plt.show()