In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score

# === 1. Cargar datos ===
X = pd.read_csv("X_ready.csv", index_col='gb_id')
df_labels = pd.read_csv("y_ready.csv", index_col='gb_id')

# === 2. Preparar etiquetas como multietiqueta ===
df_labels['label_list'] = df_labels['label'].str.split(';')

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_labels['label_list'])

# === 3. División train/val ===
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === 4. Definir modelos ===
models = {
    "RandomForest": OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    "LogisticRegression": OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='lbfgs'))
}

# === 5. Entrenar y evaluar ===
for name, model in models.items():
    print(f"\n=== Entrenando {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print(f"\n=== Resultados para {name} ===")
    print("F1 Score (micro):", f1_score(y_val, y_pred, average='micro'))
    print("F1 Score (macro):", f1_score(y_val, y_pred, average='macro'))

    print("\n=== Classification Report ===")
    print(classification_report(y_val, y_pred, target_names=mlb.classes_))



=== Entrenando RandomForest ===

=== Resultados para RandomForest ===
F1 Score (micro): 0.5318892900120337
F1 Score (macro): 0.12979290271959623

=== Classification Report ===
                                      precision    recall  f1-score   support

                             Allergy       0.00      0.00      0.00         2
             Análisis cromatográfico       0.00      0.00      0.00         1
        Biochemical Research Methods       0.00      0.00      0.00         4
    Biochemistry & Molecular Biology       1.00      0.35      0.52        88
                             Biology       0.00      0.00      0.00         5
                          Biophysics       0.00      0.00      0.00         5
Biotechnology & Applied Microbiology       1.00      0.14      0.25         7
    Cardiac & Cardiovascular Systems       0.00      0.00      0.00         0
           Cell & Tissue Engineering       0.00      0.00      0.00         1
                        Cell Biology      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Resultados para LogisticRegression ===
F1 Score (micro): 0.8587155963302753
F1 Score (macro): 0.4526152370691189

=== Classification Report ===
                                      precision    recall  f1-score   support

                             Allergy       0.00      0.00      0.00         2
             Análisis cromatográfico       0.00      0.00      0.00         1
        Biochemical Research Methods       0.00      0.00      0.00         4
    Biochemistry & Molecular Biology       0.87      0.70      0.78        88
                             Biology       0.00      0.00      0.00         5
                          Biophysics       1.00      0.40      0.57         5
Biotechnology & Applied Microbiology       1.00      0.71      0.83         7
    Cardiac & Cardiovascular Systems       0.00      0.00      0.00         0
           Cell & Tissue Engineering       0.00      0.00      0.00         1
                        Cell Biology       1.00      0.91      0.95   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_val, y_pred)
print(f"Multilabel exact match accuracy: {acc:.4f}")

Multilabel exact match accuracy: 0.7214
