In [1]:
# Incluir las bibliotecas requeridas para el modelo y su métrica
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
#Cargar el archivo desde el repositorio
# URL del archivo CSV público en S3
url = 'https://diabetes-dvcstore.s3.us-east-1.amazonaws.com/files/md5/10/1cada3906fab160ea188043d7f9a1b'
# Carga el archivo directamente en un DataFrame de pandas
df = pd.read_csv(url)

# Ver las primeras filas del DataFrame
df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,BMI,HeartDiseaseorAttack,GenHlth,Age
0,0.0,0.0,0.0,21.0,0.0,1.0,8.0
1,0.0,1.0,1.0,25.0,0.0,3.0,10.0
2,0.0,1.0,0.0,23.0,0.0,2.0,11.0
3,0.0,0.0,1.0,26.0,0.0,1.0,10.0
4,0.0,0.0,1.0,25.0,0.0,1.0,7.0


In [3]:
# Separar las características (X) y la variable objetivo (y)

X = df.drop(columns=['Diabetes_012'])
y = df['Diabetes_012']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_test.head()

Unnamed: 0,HighBP,HighChol,BMI,HeartDiseaseorAttack,GenHlth,Age
43070,1.0,1.0,30.0,1.0,3.0,9.0
15906,0.0,0.0,26.0,0.0,4.0,7.0
66360,1.0,1.0,40.0,0.0,3.0,12.0
7029,0.0,1.0,30.0,0.0,3.0,7.0
8053,1.0,1.0,24.0,0.0,4.0,5.0


In [4]:
# Parametrización del modelo con
# cuadrícula de hiperparámetros
param_grid = {
    'hidden_layer_sizes': [(256,), (300,), (256, 128)],
    'max_iter': [300, 600, 1000],  # Número de iteraciones máximas
    'learning_rate_init': [0.001, 0.01, 0.1]  # Tasa de aprendizaje inicial
}
mlp = MLPClassifier(random_state=64)
# Configura GridSearchCV
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2)

# Entrena el modelo
grid_search.fit(X_train, y_train)  # X e y deben ser tus datos de entrada y salida

# Imprime los mejores parámetros y el puntaje correspondiente
print("Mejores parámetros encontrados: ", grid_search.best_params_)
print("Mejor puntuación: ", grid_search.best_score_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=300; total time=  36.8s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=300; total time=  12.6s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=300; total time=   9.4s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=600; total time=  13.5s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=600; total time=  12.7s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=600; total time=  11.3s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=1000; total time=  13.6s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=1000; total time=  13.5s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.001, max_iter=1000; total time=  11.4s
[CV] END hidden_layer_sizes=(256,), learning_rate_init=0.01, max_iter=300; total 

In [5]:
# Crear el modelo de red neuronal (MLPClassifier)
modelo_nn = MLPClassifier(hidden_layer_sizes=(300,), max_iter=300, random_state=64)
modelo_nn.fit(X_train, y_train)
y_pred = modelo_nn.predict(X_test)

# Calcular la precisión del modelo
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Imprimir los resultados
print(f'Precisión del modelo de redes neuronales: {accuracy * 100:.2f}%')
print(f'Recall por clase: {recall* 100:.3f}%')

Precisión del modelo de redes neuronales: 74.53%
Recall por clase: 84.753%


In [6]:
import joblib
# Guardar el modelo para desplegarlo
joblib.dump(modelo_nn, 'diabetes_NeuralNet.pkl')

['diabetes_NeuralNet.pkl']

In [7]:
import sklearn
print(sklearn.__version__)

1.5.2
