In [None]:
import pandas as pd

# Cargar el CSV principal
df = pd.read_csv("data/Data_Entry_2017.csv")

In [15]:
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Image Index                  112120 non-null  object 
 1   Finding Labels               112120 non-null  object 
 2   Follow-up #                  112120 non-null  int64  
 3   Patient ID                   112120 non-null  int64  
 4   Patient Age                  112120 non-null  int64  
 5   Patient Gender               112120 non-null  object 
 6   View Position                112120 non-null  object 
 7   OriginalImage[Width          112120 non-null  int64  
 8   Height]                      112120 non-null  int64  
 9   OriginalImagePixelSpacing[x  112120 non-null  float64
 10  y]                           112120 non-null  float64
 11  Unnamed: 11                  0 non-null       float64
dtypes: float64(3), int64(5), object(4)
memory usage: 10.3+ MB


In [17]:
entries: set = set()
for index, row in df.iterrows():
    entries.update(row["Finding Labels"].split("|"))
entries

{'Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax'}

In [22]:
# Filtrar solo las imágenes del primer zip
df_subset = df[df["Image Index"].str.startswith("000")].copy()

# Seleccionar las primeras 500
df_subset = df_subset.head(300).reset_index(drop=True)

In [23]:
df_subset["Finding Labels"] = df_subset["Finding Labels"].str.split("|")

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_subset["Finding Labels"])


In [24]:
import cv2
import numpy as np
import os

IMG_DIR = "data"
IMG_SIZE = 224  

def load_image(fname):
    path = os.path.join(IMG_DIR, fname)
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = np.stack([img]*3, axis=-1)  # convertir a 3 canales
    return img / 255.0

X = np.array([load_image(fname) for fname in df_subset["Image Index"]])




In [25]:
import tensorflow as tf

base_model = tf.keras.applications.DenseNet121(
    weights="imagenet", include_top=False, input_shape=(224, 224, 3)
)
x = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
x = tf.keras.layers.Dense(128, activation="relu")(x)
output = tf.keras.layers.Dense(len(mlb.classes_), activation="sigmoid")(x)

model = tf.keras.Model(inputs=base_model.input, outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [26]:
import mlflow
import mlflow.tensorflow

mlflow.set_tracking_uri("file:/home/speranza/Documentos/OPL-proyecto-final/mlruns")
mlflow.set_experiment("modelo_proy_final_5")

<Experiment: artifact_location='file:///home/speranza/Documentos/OPL-proyecto-final/mlruns/612120675996488765', creation_time=1756600125678, experiment_id='612120675996488765', last_update_time=1756600125678, lifecycle_stage='active', name='modelo_proy_final_5', tags={}>

In [28]:
with mlflow.start_run(run_name="entrenamiento_run"):
    # Entrenamiento
    history = model.fit(X, y, epochs=4, batch_size=4, validation_split=0.2)

    # Log de métricas por época
    for epoch in range(len(history.history["loss"])):
        mlflow.log_metric("loss", history.history["loss"][epoch], step=epoch)
        mlflow.log_metric("val_loss", history.history["val_loss"][epoch], step=epoch)
        mlflow.log_metric("accuracy", history.history["accuracy"][epoch], step=epoch)
        mlflow.log_metric("val_accuracy", history.history["val_accuracy"][epoch], step=epoch)

    # Log del modelo
    #mlflow.tensorflow.log_model(model, artifact_path="modelo_tf_final")
    model.save("modelo_final.keras")  # guardas tú el archivo
    mlflow.log_artifact("modelo_final.keras")  # lo registras como artefacto manual


Epoch 1/4
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 2s/step - accuracy: 0.2295 - loss: 0.4037 - val_accuracy: 0.0167 - val_loss: 1.9442
Epoch 2/4
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 2s/step - accuracy: 0.3262 - loss: 0.3082 - val_accuracy: 0.0167 - val_loss: 0.4261
Epoch 3/4
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 2s/step - accuracy: 0.3212 - loss: 0.2771 - val_accuracy: 0.0000e+00 - val_loss: 1.1167
Epoch 4/4
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 2s/step - accuracy: 0.3561 - loss: 0.2762 - val_accuracy: 0.2667 - val_loss: 0.4071


In [29]:
import tensorflow as tf

model = tf.keras.models.load_model("modelo_final.keras")


In [65]:
from tensorflow.keras.preprocessing import image
import numpy as np

# Ruta de la imagen
img_path = "/home/speranza/Documentos/OPL-proyecto-final/notebooks/data/00000032_050.png"

# Cargar y redimensionar
img = image.load_img(img_path, target_size=(224, 224))
img_array = image.img_to_array(img)

# Expandir dimensión para batch (modelo espera batch)
img_array = np.expand_dims(img_array, axis=0)

# Normalizar como hace DenseNet121
img_array = tf.keras.applications.densenet.preprocess_input(img_array)


In [66]:
pred = model.predict(img_array)  # salida: array de probabilidades


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step


In [67]:
# Obtener etiquetas activas
threshold = 0.5  # umbral, puedes ajustarlo
labels_pred = [mlb.classes_[i] for i, p in enumerate(pred[0]) if p > threshold]

print("Etiquetas predichas:", labels_pred)


Etiquetas predichas: ['Emphysema', 'Infiltration']
