In [1]:
import pandas as pd

In [4]:
df = pd.read_csv('diabetes_prediction_dataset.csv')

In [5]:
df.shape

(100000, 9)

In [6]:
display(df.head())

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


#### Preprocess

In [7]:
target_column = 'G3'
X = df.drop(columns=[target_column, "address", "G1", "G2"])
y = (df[target_column] >= 10).astype(int)

KeyError: "['G3', 'address', 'G1', 'G2'] not found in axis"

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [None]:
# import matplotlib.pyplot as plt

# df.hist(figsize=(20, 15))
# plt.show()

In [None]:
# import seaborn as sns
# sns.boxplot(data=df, x="freetime", y="G3")

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

category_columns = ["sex", "Mjob", "Fjob", "reason","guardian", "schoolsup", "famsup", "paid","activities", "nursery", "higher", "internet", "romantic", ]
scaling_columns = ["age", "Medu", "Fedu", "traveltime", "studytime", "failures", "famrel", "freetime", "goout", "Dalc", "health", "absences",]

transformer = ColumnTransformer([
    ("new", OneHotEncoder(handle_unknown="ignore", sparse_output=False), category_columns),
    ("scaler", StandardScaler(), scaling_columns)
])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([
    ("transformer", transformer),
    ("classifier", DecisionTreeClassifier())
])



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "classifier": [DecisionTreeClassifier()],
        "classifier__max_depth": [3, 5, 10, 15, None],
        "classifier__min_samples_leaf": [1, 2, 5, 10]
    },
    {
        "classifier": [RandomForestClassifier()],
        "classifier__n_estimators": [10, 50, 100],
        "classifier__min_samples_leaf": [1, 2, 5, 10]
    },
    {
        "classifier": [GaussianNB()],
        "classifier__var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
    }
]

In [None]:
model_search = GridSearchCV(pipeline, param_grid, return_train_score=True, n_jobs=-1)
model_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"[{'classifier': [DecisionTreeClassifier()], 'classifier__max_depth': [3, 5, ...], 'classifier__min_samples_leaf': [1, 2, ...]}, {'classifier': [RandomForestClassifier()], 'classifier__min_samples_leaf': [1, 2, ...], 'classifier__n_estimators': [10, 50, ...]}, ...]"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('new', ...), ('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
model_search.best_params_

{'classifier': RandomForestClassifier(),
 'classifier__min_samples_leaf': 5,
 'classifier__n_estimators': 100}

In [None]:
model_search.best_score_

np.float64(0.8493221338048924)