In [1]:
# Importing the libraries
import numpy as np # for array operations
import pandas as pd # for working with DataFrames
# import requests, io # for HTTP requests and I/O commands
import matplotlib.pyplot as plt # for data visualization
#%matplotlib inline

# scikit-learn modules
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import classification_report, accuracy_score # for calculating the cost function
from sklearn.tree import DecisionTreeClassifier # f

In [34]:
from sklearn.model_selection import GridSearchCV

In [20]:
#Cargo el archivo parquet
df = pd.read_parquet(r'dataset\diabetes_ok.parquet')

In [21]:
df.sample(6)

Unnamed: 0,Age,BMI,Chol,TG,HDL,LDL,BUN,Diabetes
4245,74,26,4.74,1.19,4.860753,4.860753,6.07,1
985,45,27,4.42,1.14,0.94,3.13,5.29,0
4008,44,28,4.79,1.34,1.54,2.66,3.83,1
1367,29,20,3.94,1.33,1.23,2.1,3.6,0
2934,61,25,5.95,3.19,1.21,3.15,5.9,0
4718,54,25,5.07,2.17,0.93,3.16,5.29,1


In [22]:
df.shape

(5131, 8)

In [23]:
df1 = df.copy()

### Separating the features and the target variable

In [24]:
x = df1.drop(['Diabetes'], axis=1) #Features
y = df1['Diabetes'] #target

### Spliting the data into train set and a test set

In [25]:
# Splitting the dataset into training and testing set (80/20)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 28)

### Check Hiper-p

In [33]:
# Hiper-p search space
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [35]:
decision_tree = DecisionTreeClassifier()

In [36]:
#Grid configuration
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, scoring='accuracy')


In [37]:
# Find the best Hiper-p
grid_search.fit(x_train, y_train)

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "d:\0_Respaldo\0_Proyectos_2024\Diabetes alert\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\0_Respaldo\0_Proyectos_2024\Diabetes alert\.venv\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "d:\0_Respaldo\0_Proyectos_2024\Diabetes alert\.venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "d:\0_Respaldo\0_Proyectos_2024\Diabetes alert\.venv\Lib\site-pac

In [38]:
# Best Hiper-p finded
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

Mejores hiperparámetros encontrados:
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}


In [39]:
# Score result:
print("Precisión del modelo con los mejores hiperparámetros:", grid_search.best_score_)

Precisión del modelo con los mejores hiperparámetros: 0.8128674133269955


### Fitting the model to the training dataset

In [41]:
best_params ={'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}

In [55]:
# Initializing the Decision Tree Regression model
model = DecisionTreeClassifier(**best_params,random_state = 0)

# Fitting the Decision Tree Regression model to the data
feature_names = df1.columns.to_list()
model.fit(x_train, y_train,)

### Calculating the loss after training


In [43]:
# Predicting the target values of the test set
y_pred = model.predict(x_test)


In [44]:
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7828627069133398


In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.90      0.84       638
           1       0.79      0.59      0.67       389

    accuracy                           0.78      1027
   macro avg       0.78      0.74      0.75      1027
weighted avg       0.78      0.78      0.77      1027



### Exporto el modelo

In [46]:
# Model saved
import joblib as jb

In [47]:
jb.dump(model, 'blood_model.pkl')

['blood_model.pkl']

In [48]:
#load model
jb.load('blood_model.pkl')

In [49]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5131 entries, 0 to 5130
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       5131 non-null   int64  
 1   BMI       5131 non-null   int64  
 2   Chol      5131 non-null   float64
 3   TG        5131 non-null   float64
 4   HDL       5131 non-null   float64
 5   LDL       5131 non-null   float64
 6   BUN       5131 non-null   float64
 7   Diabetes  5131 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 320.8 KB


### Make a prediction

In [50]:
Age= 20  
BMI= 20
Chol= 5
TG= 5
HDL= 3
LDL= 3
BUN= 5

In [51]:
test_result =[
    Age,
    BMI,
    Chol,
    TG,
    HDL,
    LDL,
    BUN
]

In [52]:
#Prediction 0=Negative 1=Positive
#W = np.array(v_list)
w_pred = model.predict([test_result])
print([w_pred])
print(int(w_pred[0]))

[array([0], dtype=int64)]
0


