# In [None]:  (Jupyter notebook cell prompt left over from the export)
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import re

def preprocess_mic(value):
    """Convert a raw MIC entry into a float, resolving inequality prefixes.

    Entries carrying an inequality sign are replaced by a point estimate:
    values prefixed with ``>``, ``≥`` or ``>=`` are multiplied by 1.5 and
    values prefixed with ``<``, ``≤`` or ``<=`` are multiplied by 0.75.
    Entries without a sign are returned as plain floats.  Surrounding
    whitespace and whitespace between the sign and the number are ignored;
    anything following the leading number is ignored as well.

    Args:
        value: Raw cell content, e.g. ``">8"``, ``"<=0.5"``, ``"4"``, ``2.0``,
            ``"-"`` or a missing value.

    Returns:
        The numeric value as a float, or ``np.nan`` when the entry is
        missing, is the ``"-"`` placeholder, or does not start with a
        parsable number.
    """
    if pd.isna(value):
        return np.nan

    text = str(value).strip()
    if text == "-":
        return np.nan

    # Accept both the single-character signs and the ASCII two-character
    # spellings (">=" / "<="); the latter must come first in the alternation
    # so that ">" does not match on its own and leave "=" dangling.
    match = re.match(r'(>=|<=|[<>≤≥])?\s*([0-9.]+)', text)
    if match is None:
        return np.nan

    sign, digits = match.groups()
    try:
        number = float(digits)
    except ValueError:
        # Digit runs such as "1.2.3" or "." satisfy the regex but are not
        # valid numbers; treat them as missing instead of aborting the run.
        return np.nan

    if sign in ('>', '≥', '>='):
        return number * 1.5  # assume the true value is 50% above the bound
    if sign in ('<', '≤', '<='):
        return number * 0.75  # assume the true value is 25% below the bound
    return number

def preprocess_mutations(df, start_col=11):
    """Encode each row's mutations as binary indicator columns.

    Every column from position ``start_col`` onwards is read as a cell of
    comma-separated mutation names.  Missing cells and ``'-'`` placeholders
    contribute nothing.  All names found in a row are pooled into one set,
    and ``MultiLabelBinarizer`` turns those per-row sets into a 0/1 matrix
    with one column per distinct name.

    NOTE(review): a name that appears in several source columns maps to a
    single indicator; if the source column matters, prefix the names with
    the column label before binarizing — confirm against the data layout.

    Args:
        df: Input table; columns at positions ``start_col`` and later hold
            the mutation strings.
        start_col: Positional index of the first mutation column.  Defaults
            to 11, the offset the original code hard-coded.

    Returns:
        A DataFrame indexed like ``df`` with one 0/1 column per distinct
        mutation name (columns in sorted order, as produced by
        ``MultiLabelBinarizer.classes_``).
    """
    cells = df.iloc[:, start_col:].to_numpy(dtype=object)

    # MultiLabelBinarizer expects an iterable with ONE collection of labels
    # per sample.  Passing a DataFrame directly would iterate its column
    # names instead of its rows, so build the per-row sets explicitly.
    row_labels = []
    for row in cells:
        labels = set()
        for cell in row:
            if pd.isna(cell):
                continue
            for token in str(cell).split(','):
                token = token.strip()
                if token and token != '-':
                    labels.add(token)
        row_labels.append(labels)

    mlb = MultiLabelBinarizer()
    matrix = mlb.fit_transform(row_labels)
    return pd.DataFrame(matrix, columns=mlb.classes_, index=df.index)

# Load the training data
train_df = pd.read_csv("training_data.csv")

# Convert the raw MIC strings (">8", "<=0.5", "-", ...) to numeric values.
# The cleaned columns stay in train_df but are deliberately NOT used as
# model features (see the feature matrix below).
mic_columns = ['IMI', 'AZT', 'FEP', 'MER', 'CIP']
for col in mic_columns:
    train_df[col] = train_df[col].apply(preprocess_mic)

# Encode the S/I/R labels as integers
label_mapping = {'S': 0, 'I': 1, 'R': 2}
label_columns = ['IMI_label', 'AZT_label', 'FEP_label', 'MER_label', 'CIP_label']
for col in label_columns:
    train_df[col] = train_df[col].map(label_mapping)

# .map() turns every value outside label_mapping (including blanks) into
# NaN, and the classifier refuses NaN targets.  Drop those rows up front,
# before the mutation features are built, so X and y stay aligned.
missing_labels = train_df[label_columns].isna().any(axis=1)
if missing_labels.any():
    print(f"Dropping {int(missing_labels.sum())} training rows with missing or unrecognised labels")
    train_df = train_df.loc[~missing_labels].copy()

# Build the binary mutation indicators
mutation_features = preprocess_mutations(train_df)

# Assemble model inputs and targets.
# Only the mutation indicators are used as features: the prediction step
# builds its inputs from mutations alone, so training on extra MIC columns
# would make rf.predict fail on a feature mismatch.
# NOTE(review): the MIC values also look like the measurements the labels
# are derived from, which would make them leaked features — confirm.
X = mutation_features
y = train_df[label_columns].astype(int)

# Split into training and validation sets (80/20, fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-output Random Forest (one output per label column of y)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the validation set.
# classification_report does not support multiclass-multioutput targets and
# raises ValueError when given all label columns at once, so report each
# label column separately.
preds = rf.predict(X_val)
for position, label_col in enumerate(y_val.columns):
    print(f"=== {label_col} ===")
    print(classification_report(y_val[label_col], preds[:, position]))

# Load the test data and derive its mutation indicators.
# NOTE(review): preprocess_mutations reads columns from position 11 onwards;
# this assumes test_data.csv shares the training column layout — confirm.
test_df = pd.read_csv("test_data.csv")
test_mutations = preprocess_mutations(test_df)

# Align the test indicators with the training mutation columns: mutations
# unseen during training are dropped, training-only ones are filled with 0.
X_test = test_mutations.reindex(
    columns=mutation_features.columns,
    fill_value=0,
)

# Predict every label for the test rows (values are the integer codes
# from label_mapping, one column per label)
test_preds = rf.predict(X_test)
prediction_columns = ['IMI_pred', 'AZT_pred', 'FEP_pred', 'MER_pred', 'CIP_pred']
pred_labels = pd.DataFrame(test_preds, columns=prediction_columns)

# Write the ID column next to the predictions
id_column = test_df[['ID']]
output = pd.concat([id_column, pred_labels], axis=1)
output.to_csv("predictions.csv", index=False)