In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it is pointed at a local copy of the CSV.
DATA_PATH = "C:/Users/vanya/OneDrive/Desktop/VANYA/Clg_Project/diabetes_prediction_dataset.csv"
df = pd.read_csv(DATA_PATH)


In [4]:
# Separate encoders
# One LabelEncoder per categorical column, so each fitted encoder can be
# saved on its own and reused to encode new inputs the same way.
#
# The encoded values are written to a copy instead of back into `df`.
# Overwriting `df` in place made this cell non-idempotent: running it a second
# time re-fitted both encoders on the already-encoded integers, so their
# `classes_` no longer held the original string labels.
df_encoded = df.copy()

le_gender = LabelEncoder()
df_encoded['gender'] = le_gender.fit_transform(df['gender'])

le_smoking = LabelEncoder()
df_encoded['smoking_history'] = le_smoking.fit_transform(df['smoking_history'])

# NOTE(review): LabelEncoder is documented as a target-label encoder and the
# integer codes it assigns carry no meaningful order for nominal categories.
# It is kept here because the fitted encoders are pickled later in this
# notebook; consider OrdinalEncoder/OneHotEncoder inside the pipeline instead.

# Features are every column except the target; the target is 'diabetes'.
X = df_encoded.drop('diabetes', axis=1)
y = df_encoded['diabetes']

In [5]:
# Hold out 20% of the rows for the final evaluation. The split is stratified
# on y and uses a fixed seed so it is repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# Pipelines
# Each candidate model is wrapped in a two-step pipeline: a StandardScaler
# named "scaler" followed by the classifier named "clf". The "clf" step name
# is what the "clf__..." keys of the search grids refer to.
lr_classifier = LogisticRegression(max_iter=500, solver="liblinear", random_state=42)
pipe_lr = Pipeline(steps=[("scaler", StandardScaler()), ("clf", lr_classifier)])

dt_classifier = DecisionTreeClassifier(random_state=42)
pipe_dt = Pipeline(steps=[("scaler", StandardScaler()), ("clf", dt_classifier)])

In [7]:
# Params
# Candidate hyper-parameter values for each pipeline. Every key is prefixed
# with "clf__" so it is routed to the pipeline step named "clf".

# Logistic regression: penalty type, regularisation strength C, solver and
# iteration cap.
param_grid_lr = dict(
    clf__penalty=["l1", "l2"],
    clf__C=[0.01, 0.1, 1, 10, 100],
    clf__solver=["liblinear", "saga"],
    clf__max_iter=[100, 200, 500],
)

# Decision tree: split criterion, depth limit (None = no limit) and the
# minimum sample counts for splitting a node / forming a leaf.
param_grid_dt = dict(
    clf__criterion=["gini", "entropy"],
    clf__max_depth=[None, 10, 20, 30, 50, 70],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)

In [8]:
# Random search
# Both searches share the same settings: 20 sampled parameter combinations,
# 5-fold cross-validation, accuracy as the selection metric, and a fixed seed.
# NOTE(review): accuracy can look good on a majority-class predictor if the
# 'diabetes' classes are imbalanced — confirm the class balance and consider
# a metric such as "f1" or "roc_auc".
search_settings = dict(cv=5, n_iter=20, scoring="accuracy", random_state=42)

search_lr = RandomizedSearchCV(pipe_lr, param_grid_lr, **search_settings)
search_dt = RandomizedSearchCV(pipe_dt, param_grid_dt, **search_settings)

# Fit on the training split only; the test split stays untouched until the
# final evaluation.
search_lr.fit(X_train, y_train)
search_dt.fit(X_train, y_train)


In [9]:
# Choose best
# Keep whichever fitted search reached the higher best cross-validation score.
# On an exact tie the decision-tree search is kept.
if search_lr.best_score_ > search_dt.best_score_:
    best_model = search_lr
else:
    best_model = search_dt

In [10]:
# Summarise the winning search: the refitted pipeline, its best
# cross-validation score, and its accuracy on the held-out test split.
chosen_pipeline = best_model.best_estimator_
holdout_predictions = best_model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, holdout_predictions)

print("Best Model:", chosen_pipeline)
print("Best CV Accuracy:", best_model.best_score_)
print("Test Accuracy:", holdout_accuracy)


Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 DecisionTreeClassifier(max_depth=10, min_samples_split=10,
                                        random_state=42))])
Best CV Accuracy: 0.9716625000000001
Test Accuracy: 0.9715


In [11]:
# Save everything
# Persist the selected pipeline together with the two fitted label encoders,
# so new inputs can be encoded the same way the training data was.
# NOTE(review): absolute, machine-specific output folder — adjust when running
# on another machine.
MODEL_DIR = "C:/Users/vanya/streamlit_models"

# Create the output folder up front so the dumps cannot fail on a missing
# directory after the expensive search above has already been run.
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(best_model.best_estimator_, f"{MODEL_DIR}/diabetes_model.pkl")
joblib.dump(le_gender, f"{MODEL_DIR}/le_gender.pkl")
joblib.dump(le_smoking, f"{MODEL_DIR}/le_smoking.pkl")

['C:/Users/vanya/streamlit_models/le_smoking.pkl']