In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE(review): the name `sns` is rebound by the seaborn import on the next line, so
# this import contributes nothing; it looks like an auto-import artifact and raises
# ModuleNotFoundError unless a package called `sns` is installed - confirm and remove.
import sns as sns
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
import warnings

# Install global filters so DeprecationWarning and FutureWarning messages are not shown.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

PROBLEM: ŞEKER HASTALIĞI TAHMİNİ

In [None]:
# Load the CSV from a path relative to the working directory and preview the first rows.
df = pd.read_csv("data/diabetes.csv")
df.head()

LOGISTIC REGRESSION

In [None]:
# Number of rows per value of the Outcome column (the target used below).
df["Outcome"].value_counts()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
# Predictors are every column except Outcome; Outcome itself is the target.
x = df.drop(columns=["Outcome"])
y = df["Outcome"]

In [None]:
# Preview the first target values.
y.head()

In [None]:
# Preview the first predictor rows.
x.head()

In [None]:
# Logistic regression (liblinear solver) fitted on the complete data set;
# no hold-out split exists at this point. The cell displays the intercept.
log_model = LogisticRegression(solver="liblinear")
log_model.fit(x, y)
log_model.intercept_

In [None]:
# Fitted coefficients of the logistic regression.
log_model.coef_

In [None]:
# Predicted class labels for the same rows the model was fitted on (in-sample).
y_pred = log_model.predict(x)

In [None]:
# Confusion matrix of true labels against the predictions stored in y_pred.
confusion_matrix(y, y_pred)

In [None]:
# Fraction of rows where y_pred equals the true label.
accuracy_score(y, y_pred)

In [None]:
# Per-class text report; printed because classification_report returns a formatted string.
print(classification_report(y, y_pred))

In [None]:
# First ten predicted class labels.
log_model.predict(x)[0:10]

In [None]:
log_model.predict_proba(x)[0:10]  # class probabilities (first ten rows)

In [None]:
# ROC curve and AUC of the logistic regression, evaluated on x / y.
# The AUC is computed from column 1 of predict_proba, i.e. from the same scores
# that feed roc_curve. Scoring the hard class labels returned by predict() would
# reduce the ranking to a single threshold, so the number in the legend would
# not describe the plotted curve.
pos_proba = log_model.predict_proba(x)[:, 1]
logit_roc_auc = roc_auc_score(y, pos_proba)
fpr, tpr, thresholds = roc_curve(y, pos_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("Receiver operating characterisitc")
plt.legend(loc="lower right")
plt.savefig("Log_ROC")  # written relative to the working directory, before show()
plt.show()

MODEL TUNING

In [None]:
# Model validation: hold out 30% of the rows as a test set (fixed random_state)
# and refit the logistic regression on the remaining training rows only.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)
log_model = LogisticRegression(solver="liblinear")
log_model = log_model.fit(x_train, y_train)

In [None]:
# Accuracy of the refitted logistic regression on the held-out test rows.
y_pred = log_model.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# 10-fold cross-validated scores for the logistic-regression configuration.
# cross_val_score refits a copy of the estimator on every fold, so it is run on
# the training split; the test split stays reserved for the hold-out check.
cross_val_score(log_model, x_train, y_train, cv=10)

KNN

In [None]:
# k-nearest-neighbours classifier with library-default settings, fitted on the
# training split; the fitted estimator is displayed as the cell output.
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)
knn_model

In [None]:
# Hold-out accuracy of the default KNN model.
y_pred = knn_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-class text report for the predictions currently stored in y_pred.
print(classification_report(y_test, y_pred))

In [None]:
# Search space for KNN: neighbour counts 1 through 49.
knn_params = dict(n_neighbors=np.arange(1, 50))
# Grid search over knn_params with 10-fold cross-validation on the training split
# (n_jobs=-1: use all processors, verbose=2: print progress messages).
knn_cv_model = GridSearchCV(knn_model, knn_params, cv=10, n_jobs=-1, verbose=2).fit(x_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination.
knn_cv_model.best_score_

In [None]:
# Refit KNN with the neighbour count chosen by the grid search, then report
# its accuracy on the held-out test rows.
knn_tuned = KNeighborsClassifier(n_neighbors=knn_cv_model.best_params_["n_neighbors"])
knn_tuned.fit(x_train, y_train)
y_pred = knn_tuned.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Classifier score() reports mean accuracy, so this should match the value above.
knn_tuned.score(x_test, y_test)

SVM
Amaç: İki sınıfın arasındaki mesafeyi (marjini) maksimum yapmak.
Doğrusal formun etkili olmayacağı durumlarda boyut artırma yapılır (kernel hilesi).

In [None]:
# Support vector classifier with a linear kernel, fitted on the training split.
svm_model = SVC(kernel="linear").fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the linear-kernel SVC.
y_pred = svm_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Unfitted SVC with default settings; passed to GridSearchCV as the base estimator.
svm = SVC()

In [None]:
# Search space for the SVC: C in {1, 2} crossed with the linear and rbf kernels.
svm_params = dict(
    C=np.arange(1, 3),
    kernel=["linear", "rbf"],
)
# Grid search over svm_params with 5-fold cross-validation on the training split.
sv_cv_model = GridSearchCV(svm, svm_params, cv=5, n_jobs=-1, verbose=2).fit(x_train, y_train)

In [None]:
# Mean cross-validated score of the best C / kernel combination.
sv_cv_model.best_score_

In [None]:
# Refit an SVC with every hyper-parameter selected by the grid search.
# The search covers both C and kernel, so both come from best_params_;
# a hard-coded kernel="linear" would silently discard an "rbf" winner.
svm_tuned = SVC(**sv_cv_model.best_params_).fit(x_train, y_train)
y_pred = svm_tuned.predict(x_test)

In [None]:
# Hold-out accuracy of the predictions currently stored in y_pred (tuned SVC).
accuracy_score(y_test, y_pred)

ANN(ÇOK KATMANLI ALGILAYICILAR)
En küçük hata ile tahmin yapacak katsayıları bulmak

In [None]:
# Multi-layer perceptron with library-default settings, fitted on the training split.
mlpc_model = MLPClassifier().fit(x_train, y_train)  # original note (translated): relu for linear problems, logistic function for classification

In [None]:
# Hold-out accuracy of the default MLP.
y_pred = mlpc_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Search space for the MLP: regularisation strength (alpha) crossed with
# four hidden-layer layouts.
mlpc_params = dict(
    alpha=[1, 5, 0.1, 0.01, 0.03, 0.005, 0.0001],
    hidden_layer_sizes=[(10, 10), (100, 100, 100), (100, 100), (3, 5)],
)

In [None]:
# Unfitted MLP (lbfgs solver, logistic activation) used as the grid-search base estimator.
mlpc = MLPClassifier(solver="lbfgs", activation="logistic")

In [None]:
# Grid search over mlpc_params with 10-fold cross-validation on the training split.
mlp_cv_model = GridSearchCV(mlpc, mlpc_params, cv=10, verbose=2, n_jobs=-1).fit(x_train, y_train)

In [None]:
# Refit the MLP with the hyper-parameters selected by the grid search. The
# solver and activation repeat those of the `mlpc` estimator that was searched:
# the selected alpha / hidden_layer_sizes were scored under that configuration,
# not under MLPClassifier's defaults.
mlpc_tuned = MLPClassifier(solver="lbfgs", activation="logistic",
                           **mlp_cv_model.best_params_).fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the tuned MLP.
y_pred = mlpc_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

CART

In [None]:
# Decision tree with library-default settings: fit on the training split and
# report accuracy on the held-out test rows.
cart_model = DecisionTreeClassifier()
cart_model.fit(x_train, y_train)
y_pred = cart_model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Unfitted decision tree used as the grid-search base estimator.
cart = DecisionTreeClassifier()
# Search space for the decision tree. Keys must be constructor parameter names
# of the estimator being searched: the tree-depth parameter of
# DecisionTreeClassifier is "max_depth" ("depth" is rejected as invalid).
cart_params = {"max_depth": [1, 3, 5, 8, 10],
               "min_samples_split": [2, 3, 5, 10, 20, 50]}
# Grid search over cart_params with 10-fold cross-validation on the training split.
cart_cv_model = GridSearchCV(cart, cart_params, cv=10).fit(x_train, y_train)

In [None]:
# Decision tree with fixed hyper-parameters, fitted on the training split.
# NOTE(review): the values are hard-coded rather than read from
# cart_cv_model.best_params_ - presumably copied from an earlier search run; confirm.
cart_tuned = DecisionTreeClassifier(max_depth=5, min_samples_split=20).fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the tuned decision tree.
y_pred = cart_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

RANDOM FOREST

In [None]:
# Random forest with library-default settings, fitted on the training split.
rf_model = RandomForestClassifier().fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the default random forest.
y_pred = rf_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Unfitted random forest used as the grid-search base estimator.
rf = RandomForestClassifier()

In [None]:
# Search space for the random forest: 4 x 4 x 4 = 64 combinations.
rf_params = dict(
    n_estimators=[100, 200, 500, 1000],
    max_features=[3, 5, 7, 8],
    min_samples_split=[2, 5, 10, 20],
)

In [None]:
# Grid search over rf_params with 10-fold cross-validation on the training split.
rf_cv_model = GridSearchCV(rf, rf_params, cv=10, n_jobs=-1, verbose=2).fit(x_train, y_train)

In [None]:
# Random forest with fixed hyper-parameters, fitted on the training split.
# NOTE(review): the values are hard-coded rather than read from
# rf_cv_model.best_params_ - presumably copied from an earlier search run; confirm.
rf_tuned = RandomForestClassifier(max_features=8, min_samples_split=5, n_estimators=500).fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the tuned random forest.
y_pred = rf_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Değişken önem düzeyleri

# Feature importances of the tuned random forest, labelled with the training
# column names and ordered from most to least important.
feature_imp = pd.Series(rf_tuned.feature_importances_, index=x_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.title("Degisken önem düzeyleri")
plt.ylabel("Degiskenler")
plt.xlabel("Degisken önem skorları")
plt.show()

GBM
Modeller serisi kurulur

In [None]:
# Gradient boosting classifier with library-default settings, fitted on the training split.
gbm_model = GradientBoostingClassifier().fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the default gradient boosting model.
y_pred = gbm_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Unfitted gradient boosting classifier used as the grid-search base estimator.
gbm = GradientBoostingClassifier()

In [None]:
# Search space for gradient boosting: 4 x 4 x 4 = 64 combinations.
gbm_params = dict(
    learning_rate=[0.1, 0.01, 0.001, 0.05],
    n_estimators=[100, 300, 500, 1000],
    max_depth=[2, 3, 5, 8],
)

In [None]:
# Grid search over gbm_params with 10-fold cross-validation on the training split.
# No n_jobs is passed here, unlike the other grid searches in this notebook.
gbm_cv_model = GridSearchCV(gbm, gbm_params, cv=10).fit(x_train, y_train)

In [None]:
# Gradient boosting with fixed hyper-parameters, fitted on the training split.
# NOTE(review): the values are hard-coded rather than read from
# gbm_cv_model.best_params_ - presumably copied from an earlier search run; confirm.
gbm_tuned = GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=500).fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the tuned gradient boosting model.
y_pred = gbm_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Değişken önem düzeyleri

# Feature importances of the tuned gradient boosting model, labelled with the
# training column names and ordered from most to least important.
feature_imp = pd.Series(gbm_tuned.feature_importances_, index=x_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.title("Degisken önem düzeyleri")
plt.ylabel("Degiskenler")
plt.xlabel("Degisken önem skorları")
plt.show()

XGBOOST

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with library-default settings, fitted on the training split.
xgb_model = XGBClassifier().fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the default XGBoost model.
y_pred = xgb_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Unfitted XGBoost classifier used as the grid-search base estimator.
xgb = XGBClassifier()
# Search space for XGBoost: 3 x 3 x 3 x 3 = 81 combinations.
xgb_params = dict(
    n_estimators=[100, 500, 1000],
    subsample=[0.6, 0.8, 1],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.001, 0.01],
)

In [None]:
# Grid search over xgb_params with 10-fold cross-validation on the training split.
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv=10, n_jobs=-1, verbose=2).fit(x_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
xgb_cv_model.best_params_

In [None]:
# XGBoost with fixed hyper-parameters, fitted on the training split.
# NOTE(review): the values are hard-coded rather than read from
# xgb_cv_model.best_params_ - presumably copied from an earlier search run; confirm.
xgb_tuned = XGBClassifier(learning_rate=0.001, max_depth=7, n_estimators=500, subsample=0.6).fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the tuned XGBoost model.
y_pred = xgb_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Değişken önem düzeyleri

# Feature importances of the tuned XGBoost model, labelled with the training
# column names and ordered from most to least important.
feature_imp = pd.Series(xgb_tuned.feature_importances_, index=x_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.title("Degisken önem düzeyleri")
plt.ylabel("Degiskenler")
plt.xlabel("Degisken önem skorları")
plt.show()

LIGHTGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with library-default settings, fitted on the training split.
lgbm_model = LGBMClassifier().fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the default LightGBM model.
y_pred = lgbm_model.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Unfitted LightGBM classifier used as the grid-search base estimator.
lgbm = LGBMClassifier()

In [None]:
# Search space for LightGBM: 3 x 3 x 5 = 45 combinations.
lgbm_params = dict(
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[200, 500, 1000],
    max_depth=[1, 2, 3, 5, 8],
)

In [None]:
# Grid search over lgbm_params with 10-fold cross-validation on the training split.
# The variable is spelled "lbgm" (not "lgbm"); the cell that builds lgbm_tuned
# reads it under this exact name.
lbgm_cv_model = GridSearchCV(lgbm, lgbm_params, cv=10, verbose=2, n_jobs=-1).fit(x_train, y_train)

In [None]:
# Refit LightGBM with the three hyper-parameters selected by the grid search.
lgbm_tuned = LGBMClassifier(
    **{
        param: lbgm_cv_model.best_params_[param]
        for param in ("learning_rate", "max_depth", "n_estimators")
    }
).fit(x_train, y_train)

In [None]:
# Hold-out accuracy of the tuned LightGBM model.
y_pred = lgbm_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

In [None]:
# Değişken önem düzeyleri

# Feature importances of the tuned LightGBM model, labelled with the training
# column names and ordered from most to least important.
feature_imp = pd.Series(lgbm_tuned.feature_importances_, index=x_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.title("Degisken önem düzeyleri")
plt.ylabel("Degiskenler")
plt.xlabel("Degisken önem skorları")
plt.show()