In [None]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [49]:
# Location of the raw CSV export, relative to the notebook's working directory.
csv_path = "./diabetes_data 2.csv"

# Read the whole table into memory and show the preview.
# NOTE(review): the preview shows two leftover index columns
# ("Unnamed: 0.1", "Unnamed: 0") that come straight from the file.
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0.1,Unnamed: 0,Fasting_Blood_Glucose,Postprandial_Blood_Glucose,HbA1c,Random_Blood_Glucose,BMI,Waist_Circumference,Triglyceride_Levels,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,...,Gestational_Diabetes,PCOS,Hypertension,Physical_Activity,Smoking,Alcohol_Consumption,Obesity,Diet,Sleep_Apnea,Diabetes_Status
0,0,118.690215,111.472598,5.570234,224.721689,29.299897,37.872710,212.064239,93.491951,100.418755,...,Yes,No,Yes,Yes,No,Yes,No,No,No,Positive
1,1,193.592860,172.123161,5.481873,253.236721,35.135808,39.468713,93.096591,106.809528,98.109810,...,Yes,No,No,No,Yes,No,No,Yes,No,Negative
2,2,165.159212,228.400792,9.437527,127.607617,34.004023,47.090948,268.098641,164.812122,56.702794,...,No,No,No,Yes,Yes,No,Yes,No,No,Positive
3,3,147.825603,204.478231,5.497277,213.721043,18.847498,36.800088,203.279060,159.009152,114.580068,...,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Positive
4,4,90.282423,217.115395,5.631698,201.501576,18.731237,47.392994,89.300971,121.557842,89.793054,...,Yes,Yes,No,Yes,No,Yes,No,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,81.731503,132.332546,5.712814,142.634373,31.867784,33.039740,127.180164,102.211989,82.196275,...,Yes,Yes,No,No,Yes,No,Yes,No,No,Negative
2996,2996,101.889512,122.206950,5.423619,178.741817,31.798141,34.648423,139.123233,118.578609,74.112063,...,Yes,No,Yes,Yes,No,Yes,No,No,Yes,Negative
2997,2997,93.294244,130.571657,5.311118,119.174624,24.852150,39.421332,112.996152,128.685327,71.592304,...,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,Negative
2998,2998,100.005629,124.054126,5.145150,158.280116,26.359585,28.989608,122.985922,104.613596,74.805198,...,Yes,No,Yes,Yes,Yes,No,Yes,No,Yes,Negative


In [None]:
# Replace the textual target with integer codes, in place on `dataset`.
# LabelEncoder assigns codes in sorted class order, which matches the previews:
# "Negative" rows become 0 and "Positive" rows become 1.
status_encoder = LabelEncoder()
dataset['Diabetes_Status'] = status_encoder.fit_transform(dataset['Diabetes_Status'])


In [129]:
# Columns routed through the one-hot encoder.
categorical_data = [
    'Family_History_of_Diabetes',
    'Gestational_Diabetes',
    'Physical_Activity',
    'Smoking',
    'Alcohol_Consumption',
    'Obesity',
    'Diet',
    'Sleep_Apnea',
]

# Columns routed through the standard scaler.
numeric_data = [
    'HDL_Cholesterol',
    'Fructosamine_Levels',
    'C_Peptide',
    'Proinsulin_Levels',
    "Insulin_Levels",
    "HbA1c",
]

# One sub-pipeline per column group. The transformer/step names ("cat",
# "encode", "scale") are looked up by name in the modelling cell, so they
# must stay as they are.
one_hot_steps = Pipeline(steps=[
    ("encode", OneHotEncoder(drop="first", sparse_output=False)),
])
scaling_steps = Pipeline(steps=[
    ('scale', StandardScaler()),
])

# Transformed columns come out in the order of this list: the "cat" block
# first, then the "scale" block. Columns named in neither list are not passed
# on (ColumnTransformer's default remainder is "drop").
preprocessing = ColumnTransformer(transformers=[
    ("cat", one_hot_steps, categorical_data),
    ("scale", scaling_steps, numeric_data),
])


In [139]:
# End-to-end model: column preprocessing followed by an XGBoost classifier.
processing = Pipeline(steps=[
    ("preprocess", preprocessing),
    ("model", XGBClassifier(
        max_depth=5,
        n_estimators=200,
        learning_rate=0.8,
    )),
])

# The target is taken to be the last column of the frame; every other column
# is handed to the pipeline, whose ColumnTransformer then selects the columns
# it was configured with by name.
last_col = dataset.columns[-1]
X = dataset[dataset.columns.difference([last_col])]
y = dataset[last_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

processing.fit(X_train, y_train)
prediction = processing.predict(X_test)
print(f' Accuracy: {accuracy_score(y_test, prediction)}, Precision {precision_score(y_test, prediction)}, Recall {recall_score(y_test, prediction)} F1-Score = {f1_score(y_test, prediction)}')

# Names of the one-hot columns produced by the fitted encoder.
fitted_preprocess = processing.named_steps['preprocess']
cat_features = (
    fitted_preprocess
    .named_transformers_['cat']
    .named_steps['encode']
    .get_feature_names_out(categorical_data)
)

# The ColumnTransformer emits its blocks in transformer order ("cat" first,
# then "scale"), so the names must be concatenated in that same order.
# Putting the numeric names first would pair every importance with the
# wrong feature.
all_features = np.concatenate([cat_features, numeric_data])

# Per-feature importances from the fitted XGBoost model inside the pipeline.
importances = processing.named_steps['model'].feature_importances_

# Pair names with importances and rank from most to least important.
feature_importances = pd.DataFrame({
    'feature': all_features,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# Only the last expression of a cell is rendered automatically, so the ranking
# is printed explicitly; the confusion matrix remains the cell's output value.
print(feature_importances.head(15))
confusion_matrix(y_test, prediction)

 Accuracy: 0.8783333333333333, Precision 0.8764705882352941, Recall 0.9057750759878419 F1-Score = 0.890881913303438


array([[229,  42],
       [ 31, 298]])

In [123]:
# Re-display the frame: in the recorded output Diabetes_Status now holds 0/1
# instead of the "Negative"/"Positive" strings shown right after loading.
dataset

Unnamed: 0.1,Unnamed: 0,Fasting_Blood_Glucose,Postprandial_Blood_Glucose,HbA1c,Random_Blood_Glucose,BMI,Waist_Circumference,Triglyceride_Levels,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,...,Gestational_Diabetes,PCOS,Hypertension,Physical_Activity,Smoking,Alcohol_Consumption,Obesity,Diet,Sleep_Apnea,Diabetes_Status
0,0,118.690215,111.472598,5.570234,224.721689,29.299897,37.872710,212.064239,93.491951,100.418755,...,Yes,No,Yes,Yes,No,Yes,No,No,No,1
1,1,193.592860,172.123161,5.481873,253.236721,35.135808,39.468713,93.096591,106.809528,98.109810,...,Yes,No,No,No,Yes,No,No,Yes,No,0
2,2,165.159212,228.400792,9.437527,127.607617,34.004023,47.090948,268.098641,164.812122,56.702794,...,No,No,No,Yes,Yes,No,Yes,No,No,1
3,3,147.825603,204.478231,5.497277,213.721043,18.847498,36.800088,203.279060,159.009152,114.580068,...,Yes,Yes,Yes,No,No,Yes,No,No,Yes,1
4,4,90.282423,217.115395,5.631698,201.501576,18.731237,47.392994,89.300971,121.557842,89.793054,...,Yes,Yes,No,Yes,No,Yes,No,Yes,Yes,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,81.731503,132.332546,5.712814,142.634373,31.867784,33.039740,127.180164,102.211989,82.196275,...,Yes,Yes,No,No,Yes,No,Yes,No,No,0
2996,2996,101.889512,122.206950,5.423619,178.741817,31.798141,34.648423,139.123233,118.578609,74.112063,...,Yes,No,Yes,Yes,No,Yes,No,No,Yes,0
2997,2997,93.294244,130.571657,5.311118,119.174624,24.852150,39.421332,112.996152,128.685327,71.592304,...,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,0
2998,2998,100.005629,124.054126,5.145150,158.280116,26.359585,28.989608,122.985922,104.613596,74.805198,...,Yes,No,Yes,Yes,Yes,No,Yes,No,Yes,0


In [None]:
# Correlation of every numeric column with the encoded target.
# The frame still contains Yes/No string columns; pandas >= 2.0 no longer drops
# them silently in DataFrame.corr() and raises on string data instead, so the
# numeric columns are selected explicitly first.
# Requires the label-encoding cell to have run, so Diabetes_Status is numeric.
dataset.select_dtypes(include="number").corr()["Diabetes_Status"]