# Inferencia con Ministral3-8B + LoRA (pesos propios)

Este cuaderno carga los pesos LoRA entrenados previamente y realiza inferencia por batch en DEV y TEST.

## Importar librerías

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from dotenv import load_dotenv

from transformers import Mistral3ForConditionalGeneration, FineGrainedFP8Config
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage, SystemMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from peft import PeftModel

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory

# Cargar variables de entorno desde .env
load_dotenv('.env')
# Si no funciona, intentar con ruta relativa al notebook
if not os.getenv("HF_TOKEN"):
    load_dotenv()


In [None]:
# Cargar el token de HuggingFace desde .env
HF_TOKEN = os.getenv("HF_TOKEN", "")
if HF_TOKEN:
    print("✓ HF_TOKEN cargado desde .env")
else:
    print("⚠ HF_TOKEN no encontrado en .env - Continuando sin token")


## Configuración y rutas

In [None]:
BASE_MODEL_NAME = "mistralai/Ministral-3-8B-Instruct-2512"
LORA_WEIGHTS_PATH = "../results_v2/3Ministral8B_LoRA/lora_weights"

MAIN_PATH   = ".."
GROUP_ID    = "BeingChillingWeWillWin"
MODEL_ID    = "3Ministral8B_ft"

TEXT_COLUMN  = "tweet"
LABEL_COLUMN = "task1"

DATA_VAL_PATH  = os.path.join(MAIN_PATH, "preprocessed_data", "val_preprocessed_v2.json")
DATA_TEST_PATH = os.path.join(MAIN_PATH, "preprocessed_data", "test_preprocessed_v2.json")

PREDICTIONS_DIR = os.path.join(MAIN_PATH, "results_v2", "3Ministral8B_LoRA", "predictions")
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

INFER_BATCH_SIZE = 16
MAX_INPUT_LEN    = 256
MAX_NEW_TOKENS   = 5

## Carga de datos

In [3]:
val_df  = pd.read_json(DATA_VAL_PATH)
test_df = pd.read_json(DATA_TEST_PATH)

label_map         = {"NO": 0, "YES": 1}
label_map_inverse = {0: "NO", 1: "YES"}

val_df["label"] = val_df[LABEL_COLUMN].map(label_map)

print(f"Val: {len(val_df)} | Test: {len(test_df)}")
print("\nDistribución VAL:")
print(val_df[LABEL_COLUMN].value_counts())

Val: 910 | Test: 934

Distribución VAL:
task1
NO     505
YES    405
Name: count, dtype: int64


## Carga del modelo base + pesos LoRA

Cargamos el modelo base en BF16 y después montamos los adaptadores LoRA con `PeftModel.from_pretrained`.

In [None]:
# Cargar el tokenizer de Mistral
# Mistral-3 usa el tokenizer v3 de Mistral
tokenizer = MistralTokenizer.v3(is_tekken=True)

print("Cargando modelo base...")
# El modelo viene pre-cuantizado en FP8, usamos su configuración nativa
base_model = Mistral3ForConditionalGeneration.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    quantization_config=FineGrainedFP8Config(dequantize=True),
    token=HF_TOKEN
)

print(f"Cargando pesos LoRA desde: {LORA_WEIGHTS_PATH}")
model = PeftModel.from_pretrained(base_model, LORA_WEIGHTS_PATH)
model.eval()
print("Modelo listo para inferencia.")


Cargando modelo base...


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/531 [00:00<?, ?it/s]

Cargando pesos LoRA desde: ../results/3Ministral8B_LoRA/lora_weights


Modelo listo para inferencia.


## Prompt y función de inferencia por batch

In [None]:
SYSTEM_PROMPT = (
    "You are a text classification assistant. "
    "Your task is to determine whether the following text contains sexism. "
    "Answer with exactly one word: YES or NO."
)

def build_messages(text: str):
    return [
        SystemMessage(content=SYSTEM_PROMPT),
        UserMessage(content=f"Text: {text}\n\nDoes this text contain sexism?"),
    ]

def predict_batch(texts: list, batch_size: int = INFER_BATCH_SIZE, verbose: bool = True) -> list:
    all_preds = []

    for i in range(0, len(texts), batch_size):
        chunk          = texts[i : i + batch_size]
        batch_messages = [build_messages(text) for text in chunk]

        # Tokenizar cada mensaje por separado
        tokenized_list = []
        for messages in batch_messages:
            tokenized = tokenizer.encode_chat_completion(
                ChatCompletionRequest(messages=messages)
            )
            tokenized_list.append(tokenized.tokens)
        
        # Encontrar la longitud máxima
        max_len = max(len(tokens) for tokens in tokenized_list)
        
        # Pad las secuencias
        input_ids = []
        attention_mask = []
        for tokens in tokenized_list:
            padding_length = max_len - len(tokens)
            padded_tokens = tokens + [0] * padding_length
            mask = [1] * len(tokens) + [0] * padding_length
            input_ids.append(padded_tokens)
            attention_mask.append(mask)
        
        input_ids = torch.tensor(input_ids).to("cuda")
        attention_mask = torch.tensor(attention_mask).to("cuda")
        input_len = input_ids.shape[1]

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                pad_token_id=0,
            )

        for output in output_ids:
            decoded = tokenizer.decode(output[input_len:].tolist()).strip().upper()
            if "YES" in decoded:
                all_preds.append("YES")
            elif "NO" in decoded:
                all_preds.append("NO")
            else:
                print(f"[WARN] Respuesta inesperada: '{decoded}' → se asigna NO")
                all_preds.append("NO")

        if verbose:
            print(f"  Procesados {min(i + batch_size, len(texts))}/{len(texts)}...")

    return all_preds

## Inferencia en DEV

In [6]:
print("Realizando inferencia en DEV...")
dev_preds_str = predict_batch(val_df[TEXT_COLUMN].tolist())
dev_preds     = np.array([label_map[p] for p in dev_preds_str])
y_true_dev    = val_df["label"].values

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true_dev, dev_preds, average="binary", zero_division=0
)
acc = accuracy_score(y_true_dev, dev_preds)

print(f"\nMétricas en DEV:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

Realizando inferencia en DEV...


  Procesados 16/910...


  Procesados 32/910...


  Procesados 48/910...


  Procesados 64/910...


  Procesados 80/910...


  Procesados 96/910...


  Procesados 112/910...


  Procesados 128/910...


  Procesados 144/910...


  Procesados 160/910...


  Procesados 176/910...


  Procesados 192/910...


  Procesados 208/910...


  Procesados 224/910...


  Procesados 240/910...


  Procesados 256/910...


  Procesados 272/910...


  Procesados 288/910...


  Procesados 304/910...


  Procesados 320/910...


  Procesados 336/910...


  Procesados 352/910...


  Procesados 368/910...


  Procesados 384/910...


  Procesados 400/910...


  Procesados 416/910...


  Procesados 432/910...


  Procesados 448/910...


  Procesados 464/910...


  Procesados 480/910...


  Procesados 496/910...


  Procesados 512/910...


  Procesados 528/910...


  Procesados 544/910...


  Procesados 560/910...


  Procesados 576/910...


  Procesados 592/910...


  Procesados 608/910...


  Procesados 624/910...


  Procesados 640/910...


  Procesados 656/910...


  Procesados 672/910...


  Procesados 688/910...


  Procesados 704/910...


  Procesados 720/910...


  Procesados 736/910...


  Procesados 752/910...


  Procesados 768/910...


  Procesados 784/910...


  Procesados 800/910...


  Procesados 816/910...


[WARN] Respuesta inesperada: ', IT IS. IT' → se asigna NO
  Procesados 832/910...


  Procesados 848/910...


  Procesados 864/910...


  Procesados 880/910...


  Procesados 896/910...


  Procesados 910/910...

Métricas en DEV:
Accuracy:  0.8451
Precision: 0.8587
Recall:    0.7802
F1-Score:  0.8176


## Evaluación en DEV con PyEvALL

In [7]:
dev_preds_for_pyevall = [
    {"test_case": "EXIST2025", "id": str(id_exist), "value": pred}
    for id_exist, pred in zip(val_df["id_EXIST"].values, dev_preds_str)
]
dev_preds_df   = pd.DataFrame(dev_preds_for_pyevall)
dev_preds_path = os.path.join(PREDICTIONS_DIR, "dev_predictions_temp.json")
with open(dev_preds_path, "w", encoding="utf-8") as f:
    f.write(dev_preds_df.to_json(orient="records"))

dev_gold = [
    {"test_case": "EXIST2025", "id": str(id_exist), "value": label}
    for id_exist, label in zip(val_df["id_EXIST"].values, val_df[LABEL_COLUMN].values)
]
dev_gold_df   = pd.DataFrame(dev_gold)
dev_gold_path = os.path.join(PREDICTIONS_DIR, "dev_gold_temp.json")
with open(dev_gold_path, "w", encoding="utf-8") as f:
    f.write(dev_gold_df.to_json(orient="records"))

evaluator = PyEvALLEvaluation()
metrics   = [MetricFactory.Accuracy.value, MetricFactory.FMeasure.value]
report    = evaluator.evaluate(dev_preds_path, dev_gold_path, metrics)
print("\n=== Evaluación en DEV con PyEvALL ===")
report.print_report()

2026-02-25 18:22:44,704 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['Accuracy', 'FMeasure']


2026-02-25 18:22:44,754 - pyevall.metrics.metrics - INFO -             evaluate() - Executing accuracy evaluation method


2026-02-25 18:22:44,866 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method



=== Evaluación en DEV con PyEvALL ===


{
  "metrics": {
    "Accuracy": {
      "name": "Accuracy",
      "acronym": "Acc",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": 0.845054945054945
        }],
        "average_per_test_case": 0.845054945054945
      }
    },
    "FMeasure": {
      "name": "F-Measure",
      "acronym": "F1",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "classes": {
            "NO": 0.8653295128939827,
            "YES": 0.817593790426908
          },
          "average": 0.8414616516604454
        }],
        "average_per_test_case": 0.8414616516604454
      }
    }
  },
  "files": {
    "dev_predictions_temp.json": {
      "name": "dev_predictions_temp.json",
      "status": "OK",
      "gold": false,
      "description": "Use parameter: report=\"embedded\"!",
      "errors": {}
    },
    "dev

## Inferencia en TEST

In [8]:
print("Realizando inferencia en TEST...")
test_preds_str = predict_batch(test_df[TEXT_COLUMN].tolist())
test_preds     = np.array([label_map[p] for p in test_preds_str])

print(f"\nPredicciones en TEST:")
print(f"Total muestras   : {len(test_preds)}")
print(f"Predicciones YES : {np.sum(test_preds == 1)} ({100*np.mean(test_preds == 1):.2f}%)")
print(f"Predicciones NO  : {np.sum(test_preds == 0)} ({100*np.mean(test_preds == 0):.2f}%)")

Realizando inferencia en TEST...


  Procesados 16/934...


  Procesados 32/934...


  Procesados 48/934...


  Procesados 64/934...


  Procesados 80/934...


  Procesados 96/934...


  Procesados 112/934...


  Procesados 128/934...


  Procesados 144/934...


  Procesados 160/934...


  Procesados 176/934...


  Procesados 192/934...


  Procesados 208/934...


  Procesados 224/934...


  Procesados 240/934...


  Procesados 256/934...


  Procesados 272/934...


  Procesados 288/934...


  Procesados 304/934...


  Procesados 320/934...


  Procesados 336/934...


  Procesados 352/934...


  Procesados 368/934...


  Procesados 384/934...


  Procesados 400/934...


  Procesados 416/934...


  Procesados 432/934...


  Procesados 448/934...


  Procesados 464/934...


  Procesados 480/934...


  Procesados 496/934...


[WARN] Respuesta inesperada: 'SEXISM?' → se asigna NO
  Procesados 512/934...


  Procesados 528/934...


  Procesados 544/934...


  Procesados 560/934...


  Procesados 576/934...


  Procesados 592/934...


  Procesados 608/934...


  Procesados 624/934...


  Procesados 640/934...


  Procesados 656/934...


  Procesados 672/934...


  Procesados 688/934...


  Procesados 704/934...


  Procesados 720/934...


  Procesados 736/934...


  Procesados 752/934...


  Procesados 768/934...


  Procesados 784/934...


  Procesados 800/934...


  Procesados 816/934...


  Procesados 832/934...


  Procesados 848/934...


  Procesados 864/934...


  Procesados 880/934...


  Procesados 896/934...


  Procesados 912/934...


  Procesados 928/934...


  Procesados 934/934...

Predicciones en TEST:
Total muestras   : 934
Predicciones YES : 395 (42.29%)
Predicciones NO  : 539 (57.71%)


## Guardar predicciones TEST en formato PyEvALL

In [9]:
test_preds_for_submission = [
    {"test_case": "EXIST2025", "id": str(id_exist), "value": pred}
    for id_exist, pred in zip(test_df["id_EXIST"].values, test_preds_str)
]

test_preds_df   = pd.DataFrame(test_preds_for_submission)
output_filename = f"{GROUP_ID}_{MODEL_ID}.json"
output_path     = os.path.join(PREDICTIONS_DIR, output_filename)

with open(output_path, "w", encoding="utf-8") as f:
    f.write(test_preds_df.to_json(orient="records"))

print(f"Predicciones guardadas en: {output_path}")

Predicciones guardadas en: ../results/3Ministral8B_LoRA/predictions/BeingChillingWeWillWin_3Ministral8B_ft.json
