In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_validate

#  from pyspark.sql import SparkSession

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the Kaggle diabetes-prediction CSV into a DataFrame.
DATASET_PATH = '/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv'
df = pd.read_csv(DATASET_PATH)

# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Print the column labels of the loaded frame.
colmn = df.columns
print(colmn)

In [None]:
# Encode the data.
# Only the non-numeric (categorical) columns are label-encoded. Numeric columns
# are left untouched: running LabelEncoder over them would replace every real
# value with its rank among the unique values and distort the correlation
# matrix displayed at the end of this cell.
encoder = LabelEncoder()
colmn = df.columns
categorical_columns = df.select_dtypes(exclude='number').columns
for name in categorical_columns:
    df[name] = encoder.fit_transform(df[name].values)

# Pairwise correlation of all (now fully numeric) columns.
df.corr()

In [None]:
# Histogram of the target column.
df['diabetes'].hist()

# The classes are imbalanced.

In [None]:
# Note: sklearn ships a ready-made `plot_tree` function to visualize the fitted tree.

# Separate the features from the target column.
X = df.drop(columns='diabetes')
y = pd.DataFrame(df['diabetes'])

# Hold out a third of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Fit a depth-limited decision tree on the training split ONLY.
# Fitting on the full X, y would let the model see the test rows and
# inflate every metric that is later computed on X_test.
clf = DecisionTreeClassifier(max_depth=8, random_state=42)
model = clf.fit(X_train, y_train)

In [None]:
# Draw the fitted decision tree.
fig = plt.figure(figsize=(15, 5))
_ = tree.plot_tree(clf,
                   # Plain lists are accepted by every scikit-learn version.
                   feature_names=list(X.columns),
                   # One label per class, in ascending class order (0, 1).
                   # A bare string here would be indexed character by character.
                   class_names=['Non-diabetes', 'diabetes'],
                   filled=True)

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

# Predict the held-out rows with the decision tree.
pred_rt = model.predict(X_test)

# A bare expression in the middle of a cell is not displayed, so print the score.
print('Accuracy:', accuracy_score(y_test, pred_rt))

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat-map.

    The matrix is drawn on the current matplotlib figure; the caller is
    responsible for creating the figure and calling ``plt.show()``.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        If True, every row is divided by its row sum before the matrix is
        printed and plotted.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat-map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so that the colours, the colour-bar and the
    # text annotations all describe the same numbers.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so that the axis labels are taken into account.
    plt.tight_layout()

# Enlarge the default font so the matrix annotations are legible.
font = dict(size=20)
plt.rc('font', **font)

# Confusion matrix of the decision tree on the held-out split.
cnf_matrix = confusion_matrix(y_test, model.predict(X_test))

plt.figure(figsize=(8, 8))
plot_confusion_matrix(
    cnf_matrix,
    classes=['Non-diabetes', 'diabetes'],
    title='Confusion matrix',
)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the decision tree on the held-out split.
report = classification_report(
    y_test,
    model.predict(X_test),
    target_names=['Non-diabetes', 'diabetes'],
)
print(report)

<h4>Gradient boosting</h4>

In [None]:
# Hyper-parameters of the gradient-boosting classifier.
params = {'n_estimators': 200,
          'max_depth': 12,
          'criterion': 'squared_error',
          'learning_rate': 0.03,
          'min_samples_leaf': 16,
          'min_samples_split': 16,
          # Fixed seed so that re-running the notebook reproduces the same model.
          'random_state': 42,
          }

gbr = GradientBoostingClassifier(**params)
# y_train is a one-column frame; flatten it to the 1-D label vector that
# scikit-learn classifiers expect.
gbr.fit(X_train, np.ravel(y_train))

# Predictions of the boosted model on the held-out split.
pred_grd = gbr.predict(X_test)

In [None]:
# Enlarge the default font so the matrix annotations are legible.
font = dict(size=20)
plt.rc('font', **font)

# Confusion matrix of the gradient-boosting model on the held-out split.
cnf_matrix = confusion_matrix(y_test, gbr.predict(X_test))

plt.figure(figsize=(8, 8))
plot_confusion_matrix(
    cnf_matrix,
    classes=['Non-diabetes', 'diabetes'],
    title='Confusion matrix',
)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the gradient-boosting model on the held-out split.
report = classification_report(
    y_test,
    gbr.predict(X_test),
    target_names=['Non-diabetes', 'diabetes'],
)
print(report)

In [None]:
# Pair every feature name with the importance the boosted model assigned to it.
feature_importance = gbr.feature_importances_
feature_importance_dict = {
    column: weight
    for column, weight in zip(X_test.columns, feature_importance)
}

# Rank the (name, importance) pairs from the most to the least important.
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Split the ranked pairs back into two parallel tuples for plotting.
variables, importance = zip(*sorted_feature_importance)

fig = plt.figure(figsize=(16, 4))
plt.bar(variables, importance)
plt.xticks(rotation=75, fontsize=14)
plt.show()

<h3>Dependence of the F1 score on the number of leaf nodes</h3>

In [None]:
# Sweep `max_leaf_nodes` and record the train / validation F1 score for each value.
leath_nodes = np.arange(2, 6000, 400, dtype=np.int16)
train_score = np.zeros(len(leath_nodes), dtype=np.float32)
valid_score = np.zeros(len(leath_nodes), dtype=np.float32)

X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.1, random_state=1)
# Flatten the labels to the 1-D vector that scikit-learn estimators expect.
y_tr_flat = np.ravel(y_tr)

for index, leath in enumerate(leath_nodes):
    # NOTE(review): with max_depth=5 a tree can hold at most 2**5 = 32 leaves,
    # so values of max_leaf_nodes above 32 are presumably not binding -
    # confirm against the scikit-learn documentation before interpreting the curve.
    local_model = GradientBoostingClassifier(
            max_depth=5,
            criterion='squared_error',
            max_leaf_nodes=int(leath),  # plain Python int instead of numpy int16
            random_state=42,            # reproducible across notebook runs
        )
    # The fitted per-fold estimators are never used, so they are not requested.
    cv_results = cross_validate(local_model, X_tr, y_tr_flat, cv=3, scoring='f1')
    print(f'{index + 1}. for {leath} of leaths cv_reesults is {cv_results["test_score"]} ', end='\n')
    # Keep the worst fold as the validation score for this setting.
    valid_score[index] = np.min(cv_results['test_score'])
    # Refit on the whole training part to measure the training F1.
    local_model.fit(X_tr, y_tr_flat)
    train_score[index] = f1_score(y_tr_flat, local_model.predict(X_tr))
print(valid_score)

In [None]:
# Train vs. validation F1 as a function of the leaf-node limit.
curve_fig, curve_ax = plt.subplots(figsize=(12, 4))

curve_ax.plot(leath_nodes, train_score, label='Train')
curve_ax.plot(leath_nodes, valid_score, label='Validation')

curve_ax.set_xlabel('')
curve_ax.set_ylabel('f1')
# The title is Russian for "Number of leaves".
curve_ax.set_title('Кол-во листьев')

curve_ax.grid(True)
curve_ax.legend()
plt.show()

<h3>Dependence of the F1 score on tree depth and number of estimators</h3>

In [None]:
# Sweep tree depth x number of estimators and record train / validation F1.
deep_of_md = np.arange(2, 20, 4, dtype=np.int16)
dp_option = len(deep_of_md)
n_estimators_values = np.arange(1, 150, 5)

# One row per depth, one column per n_estimators value. The arrays are
# pre-sized so that a whole row can be assigned after each inner loop.
train_score = np.zeros((dp_option, len(n_estimators_values)))
valid_score = np.zeros((dp_option, len(n_estimators_values)))

X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.1, random_state=1)
# Flatten the labels to the 1-D vector that scikit-learn estimators expect.
y_tr_flat = np.ravel(y_tr)

for arr_int, deep_gb in enumerate(deep_of_md):

    train_scores_depth = []
    valid_scores_depth = []

    for num_of_est in n_estimators_values:
        local_model = GradientBoostingClassifier(
            max_depth=int(deep_gb),        # plain Python ints instead of numpy scalars
            criterion='squared_error',
            n_estimators=int(num_of_est),
            random_state=42,               # reproducible across notebook runs
        )
        # The fitted per-fold estimators are never used, so they are not requested.
        cv_results = cross_validate(local_model, X_tr, y_tr_flat, cv=3, scoring='f1')
        # Keep the worst fold as the validation score for this setting.
        valid_scores_depth.append(np.min(cv_results['test_score']))
        # Refit on the whole training part to measure the training F1.
        local_model.fit(X_tr, y_tr_flat)
        train_scores_depth.append(f1_score(y_tr_flat, local_model.predict(X_tr)))

    train_score[arr_int] = train_scores_depth
    valid_score[arr_int] = valid_scores_depth
