# Initializing libraries

In [365]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (max_error, mean_absolute_error, mean_squared_error, r2_score,
                             confusion_matrix, accuracy_score, recall_score, precision_score, f1_score)
from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from category_encoders import TargetEncoder
from imblearn.combine import SMOTETomek


# Reading dataset

In [219]:
db = pd.read_csv('diabetes.csv')
db

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


# Encoding dataset

In [220]:
db.dropna(inplace=True)
db.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [221]:
coding_columns = ["gender","smoking_history" ]
OHE = OneHotEncoder(sparse_output=False, drop="first")

OHE

In [222]:
OHE.fit(db[coding_columns])
OHE.transform(db[coding_columns])

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

# Inputing row

In [223]:
encoded = OHE.fit_transform(db[coding_columns])
encoded_df = pd.DataFrame(encoded, columns=OHE.get_feature_names_out(coding_columns))
db = pd.concat([db.drop(coding_columns, axis=1), encoded_df], axis=1)
db

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,54.0,0,0,27.32,6.6,80,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28.0,0,0,27.32,5.7,158,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,36.0,0,0,23.45,5.0,155,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,76.0,1,1,20.14,4.8,155,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,2.0,0,0,17.37,6.5,100,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,66.0,0,0,27.83,5.7,155,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
99998,24.0,0,0,35.42,4.0,100,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Correlation

In [224]:
print(db.corr()['diabetes'].sort_values())

gender_Other                  -0.004090
smoking_history_current        0.019606
smoking_history_not current    0.020734
smoking_history_ever           0.024080
smoking_history_never          0.027267
gender_Male                    0.037666
smoking_history_former         0.097917
heart_disease                  0.171727
hypertension                   0.197823
bmi                            0.214357
age                            0.258008
HbA1c_level                    0.400660
blood_glucose_level            0.419558
diabetes                       1.000000
Name: diabetes, dtype: float64


# Linear regression

In [225]:
x_reg = db.drop("diabetes", axis=1)
y_reg = db['diabetes']


### В машинном обучении StandardScaler (стандартный масштабатор) используется для изменения размера распределения значений так, чтобы среднее значение наблюдаемых значений было равно 0, а стандартное отклонение – 1.

In [226]:
x_train, x_test, y_train, y_test = train_test_split(x_reg, y_reg, test_size=0.2, random_state=89)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)



# Creating model

In [227]:
model = Sequential([
    Input(shape=(x_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(256, activation="relu"),
    Dropout(0.2),
    Dense(128, activation="relu"),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(1)
])

# Learning model

In [228]:
model.compile(optimizer = Adam(learning_rate =0.0001), loss="mse")
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(x_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=0)
y_pred = model.predict(x_test_scaled, verbose =0).flatten()

max_err = max_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Metrics

In [229]:
print("Regression metrics")
print(f"Max error {max_err:.2f}")
print(f"MAE {mae:.2f}")
print(f"MSE {mse:.2f}")
print(f"Coeffient determination R2 {r2:.2f}")

Regression metrics
Max error 1.00
MAE 0.07
MSE 0.03
Coeffient determination R2 0.63


# Classification

# Initializing dataset

In [302]:
df = pd.read_csv("train.csv")
df.dropna(inplace=True)
df.columns
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


# Coding data

In [337]:

encodingcolumns = ["Name", "Sex", "Ticket", "Fare","Cabin", "Embarked"]
encoder = TargetEncoder()
df[encodingcolumns] = encoder.fit_transform(df[encodingcolumns], df["Survived"])
df



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,49.0,0.0,38.0,1,0,109.0,57.0,72.0,0.0
3,4,1,1,70.0,0.0,35.0,1,0,31.0,44.0,48.0,2.0
6,7,0,1,112.0,1.0,54.0,0,0,55.0,41.0,117.0,2.0
10,11,1,3,148.0,0.0,4.0,1,1,120.0,10.0,131.0,2.0
11,12,1,1,27.0,0.0,58.0,0,0,26.0,17.0,43.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,18.0,0.0,47.0,1,1,33.0,43.0,91.0,2.0
872,873,0,1,35.0,1.0,33.0,0,0,89.0,1.0,29.0,2.0
879,880,1,1,140.0,0.0,56.0,0,1,38.0,68.0,61.0,0.0
887,888,1,1,75.0,0.0,19.0,0,0,10.0,23.0,25.0,2.0


In [338]:
x_cls = df.drop("Survived", axis=1)
y_cls = df["Survived"]

# Scaling

In [339]:
x_train_cls, x_test_cls, y_train_cls, y_test_cls = train_test_split(x_cls, y_cls, test_size=0.2, random_state=89)
scaler = StandardScaler()
x_train_scl = scaler.fit_transform(x_train_cls)
x_test_scl = scaler.transform(x_test_cls)

# Training model but with classification

In [None]:
smote = SMOTETomek(random_state=42)
x_train_bal, y_train_bal = smote.fit_resample(x_train_scl, y_train_cls)

model = Sequential([
    Input(shape=(x_train_bal.shape[1],)),
    Dense(300, activation='relu'),
    Dropout(0.5),
    Dense(200, activation='relu'),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(x_train_bal, y_train_bal, epochs=50, batch_size=32, verbose=0)

y_pred_prob = model.predict(x_test_scl, verbose=0).flatten()
y_pred_cls = (y_pred_prob >= 0.39).astype(int)

# Metrics

In [None]:
conf_matrix = confusion_matrix(y_test_cls, y_pred_cls)
acc = accuracy_score(y_test_cls, y_pred_cls)
recall = recall_score(y_test_cls, y_pred_cls, average='binary')
precision = precision_score(y_test_cls, y_pred_cls, average='binary')
f1 = f1_score(y_test_cls, y_pred_cls, average='binary')

print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")


Confusion Matrix:
[[ 9  5]
 [ 4 19]]
Accuracy: 0.7567567567567568
Recall: 0.734472049689441
Precision: 0.7419871794871795
F1 Score: 0.8085106382978723


In [404]:
from sklearn.metrics import f1_score

best_f1 = 0
best_thresh = 0.5
for t in np.arange(0.3, 0.7, 0.01):
    pred = (y_pred_prob >= t).astype(int)
    f1 = f1_score(y_test_cls, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best F1: {best_f1}, Best Threshold: {best_thresh}")


Best F1: 0.84, Best Threshold: 0.39000000000000007
