In [1]:
import os
from dotenv import load_dotenv
from groq import Groq

import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


In [2]:
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if GROQ_API_KEY is None:
    raise ValueError("‚ùå Missing GROQ_API_KEY in environment variables.")

client = Groq(api_key=GROQ_API_KEY)

def groq_generate(prompt, model):
    """Generic Groq requester with custom model."""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a tweet classifier that detects expressions of hope."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.1,
        max_completion_tokens=80,
    )
    return response.choices[0].message.content.strip()


In [3]:
df = pd.read_csv("SHSD.csv")
print("Dataset loaded:", len(df))
df.head()


Dataset loaded: 19183


Unnamed: 0,text,binary,multiclass
0,todo amor que yo esper√© de la vida lo he encon...,Hope,Generalized Hope
1,Hola #USER# cuando van poner cajas autoservi...,Not Hope,Not Hope
2,#USER# Se√±or Mateu pero este tipo de imagen se...,Not Hope,Not Hope
3,#EspnF90 el var se cre√≥ para ayudar que los √°r...,Not Hope,Not Hope
4,hay un personaje de la primera peli que me rec...,Not Hope,Not Hope


In [4]:
# ============================
# Subset BINARIO (Hope vs Not Hope)
# ============================
subset = df[df['binary'].isin(['Not Hope', 'Hope'])]

subsetBin = (
    subset
    .groupby('binary', group_keys=False)
    .apply(lambda x: x.sample(500, random_state=42))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# ============================
# Subset MULTICLASS
# ============================
subset2 = df[df['multiclass'].isin([
    'Generalized Hope',
    'Realistic Hope',
    'Unrealistic Hope'
])]

subsetMulti = (
    subset2
    .groupby('multiclass', group_keys=False)
    .apply(lambda x: x.sample(334, random_state=42))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Binary subset:", len(subsetBin))
print("Multiclass subset:", len(subsetMulti))


Binary subset: 1000
Multiclass subset: 1002


  .apply(lambda x: x.sample(500, random_state=42))
  .apply(lambda x: x.sample(334, random_state=42))


In [5]:
def clasificarBin_batch(lista_textos, prompt_version=1, df=None, model=None):
    """
    Clasifica una LISTA de textos en:
    - Esperanza
    - No Esperanza

    prompt_version:
    1 ‚Üí Zero-shot
    2 ‚Üí One-shot (1 ejemplo)
    3 ‚Üí Few-shot (hasta 5 ejemplos)

    Retorna SIEMPRE len(lista_textos) etiquetas.
    """

    VALID_LABELS = ["Esperanza", "No Esperanza"]

    definicion_hope = "Esperanza significa expresar optimismo, confianza o el deseo de un futuro mejor."

    # ===================== BLOQUE NUMERADO =====================
    bloque_tweets = "\n".join(
        f"{i+1}. {texto}" for i, texto in enumerate(lista_textos)
    )

    # ===================== EJEMPLOS =====================
    ejemplo_texto = ""
    ejemplos_texto = ""

    if df is not None and "binary" in df.columns and not df.empty:

        # üîπ ONE-SHOT ‚Üí 1 ejemplo
        if prompt_version == 2:
            row = df.sample(1).iloc[0]
            ejemplo_texto = f'"{row["text"]}" ‚Üí {row["binary"]}'

        # üîπ FEW-SHOT ‚Üí hasta 5 ejemplos
        elif prompt_version == 3:
            ejemplos = df.sample(min(5, len(df)))
            ejemplos_texto = "\n".join(
                f'"{row["text"]}" ‚Üí {row["binary"]}'
                for _, row in ejemplos.iterrows()
            )

    # Fallback si no hay df
    if prompt_version == 2 and ejemplo_texto == "":
        ejemplo_texto = '"Las cosas mejorar√°n pronto." ‚Üí Esperanza'

    if prompt_version == 3 and ejemplos_texto == "":
        ejemplos_texto = '"Las cosas mejorar√°n pronto." ‚Üí Esperanza'

    # ===================== PROMPTS =====================

    # ---------- ZERO SHOT ----------
    if prompt_version == 1:
        prompt = f"""
Definici√≥n: {definicion_hope}

Recibir√°s una lista numerada de tuits.
Devuelve EXACTAMENTE UNA ETIQUETA POR L√çNEA, en el mismo orden.
Etiquetas v√°lidas:
- Esperanza
- No Esperanza

Tuits:
{bloque_tweets}

Ahora devuelve SOLO las etiquetas, una por l√≠nea, sin n√∫meros y sin texto adicional:
"""

    # ---------- ONE SHOT (1 ejemplo) ----------
    elif prompt_version == 2:
        prompt = f"""
Definici√≥n: {definicion_hope}

Ejemplo:
{ejemplo_texto}

Recibir√°s una lista numerada de tuits.
Devuelve EXACTAMENTE UNA ETIQUETA POR L√çNEA, en el mismo orden.
Etiquetas v√°lidas:
- Esperanza
- No Esperanza

Tuits:
{bloque_tweets}

Ahora devuelve SOLO las etiquetas, una por l√≠nea, sin n√∫meros y sin texto adicional:
"""

    # ---------- FEW SHOT (5 ejemplos) ----------
    else:
        prompt = f"""
Definici√≥n: {definicion_hope}

Ejemplos:
{ejemplos_texto}

Recibir√°s una lista numerada de tuits.
Devuelve EXACTAMENTE UNA ETIQUETA POR L√çNEA, en el mismo orden.
Etiquetas v√°lidas:
- Esperanza
- No Esperanza

Tuits:
{bloque_tweets}

Ahora devuelve SOLO las etiquetas, una por l√≠nea, sin n√∫meros y sin texto adicional:
"""

    # ===================== CALL MODEL =====================
    raw = groq_generate(prompt, model=model)
    raw_lines = [l.strip() for l in raw.splitlines() if l.strip()]

    # ===================== PARSER ROBUSTO =====================
    labels = []

    for line in raw_lines:
        clean = line.strip()

        if clean == "":
            continue
        if clean.lower().startswith(("tweet", "label", "output")):
            continue
        if ":" in clean:
            clean = clean.split(":")[-1].strip()

        for v in VALID_LABELS:
            if v.lower() == clean.lower():
                labels.append(v)
                break

    # ===================== FALLBACK SEGURO =====================
    if len(labels) < len(lista_textos):
        labels += ["No Esperanza"] * (len(lista_textos) - len(labels))
    elif len(labels) > len(lista_textos):
        labels = labels[:len(lista_textos)]

    return labels


In [6]:
def clasificarMulti_batch(lista_textos, prompt_version=1, df=None, model=None):
    """
    Clasifica textos usando SOLO LAS ETIQUETAS DEL PROMPT:
    - Esperanza Abstracta
    - Esperanza Realista
    - Deseo Fant√°stico

    ‚ùó No aparecen etiquetas del dataset aqu√≠.
    """

    PROMPT_LABELS = [
        "Esperanza Abstracta",
        "Esperanza Realista",
        "Deseo Fant√°stico"
    ]

    # ---------- Dataset para shots (mapeado) ----------
    df_hope = None
    if df is not None and "multiclass" in df.columns:
        df_hope = df.copy()
        df_hope["prompt_label"] = df_hope["multiclass"].map({
            "Generalized Hope": "Esperanza Abstracta",
            "Realistic Hope": "Esperanza Realista",
            "Unrealistic Hope": "Deseo Fant√°stico"
        })
        df_hope = df_hope[df_hope["prompt_label"].notna()]

    bloque_tweets = "\n".join(f"{i+1}. {t}" for i, t in enumerate(lista_textos))

    RULES = """
REGLAS DE CLASIFICACI√ìN:

DESEO FANT√ÅSTICO:
- Resultados imposibles, milagrosos, m√°gicos o irreales.
- Deseos ficticios, c√≥smicos o claramente exagerados.

ESPERANZA REALISTA:
- Esperanza claramente ligada a una situaci√≥n o evento del mundo real.
- Asociada a un contexto concreto (evento, fecha, persona, instituci√≥n).
- Plausible dentro de la realidad normal.

ESPERANZA ABSTRACTA:
- Expresiones generales, emocionales o simb√≥licas de esperanza.
- Bendiciones, oraciones, mensajes de √°nimo u optimismo vago.
- NO claramente asociada a un evento o situaci√≥n espec√≠fica.
"""

    # ---------- PROMPTS ----------
    if prompt_version == 1:  # Zero-shot
        prompt = f"""
Clasifica cada tuit en UNA de las siguientes categor√≠as:
- Esperanza Abstracta
- Esperanza Realista
- Deseo Fant√°stico

{RULES}

Tuits:
{bloque_tweets}

Devuelve SOLO una etiqueta por l√≠nea:
"""

    elif prompt_version == 2:  # One-shot
        ex = df_hope.sample(1).iloc[0] if df_hope is not None else None
        ej_text = ex["text"] if ex is not None else "Espero que un milagro arregle todo de la noche a la ma√±ana."
        ej_label = ex["prompt_label"] if ex is not None else "Deseo Fant√°stico"

        prompt = f"""
Clasifica cada tuit en UNA de las siguientes categor√≠as:
- Esperanza Abstracta
- Esperanza Realista
- Deseo Fant√°stico

{RULES}

Ejemplo:
"{ej_text}" ‚Üí {ej_label}

Tuits:
{bloque_tweets}

Devuelve SOLO una etiqueta por l√≠nea:
"""

    else:  # Few-shot
        ejemplos = []
        if df_hope is not None:
            for lbl in PROMPT_LABELS:
                sub = df_hope[df_hope["prompt_label"] == lbl]
                if not sub.empty:
                    r = sub.sample(1).iloc[0]
                    ejemplos.append(f'"{r["text"]}" ‚Üí {lbl}')

        ejemplos_texto = "\n".join(ejemplos)

        prompt = f"""
Clasifica cada tuit en UNA de las siguientes categor√≠as:
- Esperanza Abstracta
- Esperanza Realista
- Deseo Fant√°stico

{RULES}

Ejemplos:
{ejemplos_texto}

Tuits:
{bloque_tweets}

Devuelve SOLO una etiqueta por l√≠nea:
"""

    raw = groq_generate(prompt, model=model)
    raw_lines = [l.strip() for l in raw.splitlines() if l.strip()]

    def normalize(label):
        t = label.lower()
        if "fant√°stico" in t or "fantastico" in t or "deseo" in t:
            return "Deseo Fant√°stico"
        if "realista" in t:
            return "Esperanza Realista"
        if "abstracta" in t or "abstracto" in t:
            return "Esperanza Abstracta"
        return "Esperanza Abstracta"

    labels = [normalize(l) for l in raw_lines]

    if len(labels) < len(lista_textos):
        labels += ["Esperanza Abstracta"] * (len(lista_textos) - len(labels))
    elif len(labels) > len(lista_textos):
        labels = labels[:len(lista_textos)]

    return labels


In [7]:
    MODELS = {
        # ---------- LLaMA family ----------
        "LLaMA-8B": "llama-3.1-8b-instant",
        "LLaMA-70B": "llama-3.3-70b-versatile",
    
        # ---------- LLaMA-4 (Instruct variants) ----------
        "Maverick-17B": "meta-llama/llama-4-maverick-17b-128e-instruct",
        "Scout-17B": "meta-llama/llama-4-scout-17b-16e-instruct",
    
        # ---------- External / Non-LLaMA ----------
        "Allam-7B": "allam-2-7b",
        "Kimi-K2": "moonshotai/kimi-k2-instruct",
        "Qwen-32B": "qwen/qwen3-32b"
    }


In [8]:
from tqdm.auto import tqdm
from groq import RateLimitError
import pandas as pd

RESULTS = []

BATCH_SIZE = 20
NUM_RECORDS = 200

finBin = subsetBin.sample(NUM_RECORDS).copy()  # ‚úÖ SOLO binary

# ‚õî MODELOS A SALTAR
SKIP_MODELS = []

print("‚è≠Ô∏è Skipping models:", SKIP_MODELS)

# ===================== LOOP PRINCIPAL =====================
for model_name, model_id in tqdm(
    MODELS.items(),
    desc="Modelos (Binary)",
    total=len(MODELS)
):

    if model_name in SKIP_MODELS:
        print(f"\n‚è≠Ô∏è SKIPPING MODEL (hardcoded): {model_name}\n")
        continue

    print(f"\n==========================")
    print(f"Testing MODEL (Binary): {model_name}")
    print("==========================\n")

    try:
        for prompt_version in tqdm(
            [1, 2, 3],
            desc=f"Prompts ({model_name})"
        ):

            col = f"{model_name}_p{prompt_version}"
            if col in finBin.columns:
                print(f"‚è≠Ô∏è {col} already exists, skipping")
                continue

            preds_bin = []
            batches_bin = range(0, len(finBin), BATCH_SIZE)

            for i in tqdm(
                batches_bin,
                desc=f"Binary p{prompt_version} ‚Äî {model_name}",
                leave=False
            ):
                batch = finBin["text"].iloc[i:i+BATCH_SIZE].tolist()
                preds_bin.extend(
                    clasificarBin_batch(
                        batch,
                        prompt_version,
                        df=subsetBin,
                        model=model_id
                    )
                )

            finBin[col] = preds_bin
            print(f"{model_name} ‚Äî Binary Prompt {prompt_version} DONE.")

        # üîπ Guardado parcial por modelo
        finBin.to_csv(f"resultados_binary_{model_name}.csv", index=False)

    except Exception as e:
        print(f"\n‚ö†Ô∏è Error en modelo {model_name}: {e}")
        print("‚è≠Ô∏è Saltando este modelo y continuando.\n")
        continue

# ===================== GUARDADO FINAL =====================
finBin.to_csv("Btempraw.csv", index=False)

print("\n‚úÖ Evaluaci√≥n BINARIA terminada. CSV guardado correctamente.")


‚è≠Ô∏è Skipping models: []


Modelos (Binary):   0%|          | 0/7 [00:00<?, ?it/s]


Testing MODEL (Binary): LLaMA-8B



Prompts (LLaMA-8B):   0%|          | 0/3 [00:00<?, ?it/s]

Binary p1 ‚Äî LLaMA-8B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-8B ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî LLaMA-8B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-8B ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî LLaMA-8B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-8B ‚Äî Binary Prompt 3 DONE.

Testing MODEL (Binary): LLaMA-70B



Prompts (LLaMA-70B):   0%|          | 0/3 [00:00<?, ?it/s]

Binary p1 ‚Äî LLaMA-70B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-70B ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî LLaMA-70B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-70B ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî LLaMA-70B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-70B ‚Äî Binary Prompt 3 DONE.

Testing MODEL (Binary): Maverick-17B



Prompts (Maverick-17B):   0%|          | 0/3 [00:00<?, ?it/s]

Binary p1 ‚Äî Maverick-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Maverick-17B ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî Maverick-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Maverick-17B ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî Maverick-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Maverick-17B ‚Äî Binary Prompt 3 DONE.

Testing MODEL (Binary): Scout-17B



Prompts (Scout-17B):   0%|          | 0/3 [00:00<?, ?it/s]

Binary p1 ‚Äî Scout-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Scout-17B ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî Scout-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Scout-17B ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî Scout-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Allam-7B ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî Allam-7B:   0%|          | 0/10 [00:00<?, ?it/s]

Allam-7B ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî Allam-7B:   0%|          | 0/10 [00:00<?, ?it/s]

Allam-7B ‚Äî Binary Prompt 3 DONE.

Testing MODEL (Binary): Kimi-K2



Prompts (Kimi-K2):   0%|          | 0/3 [00:00<?, ?it/s]

Binary p1 ‚Äî Kimi-K2:   0%|          | 0/10 [00:00<?, ?it/s]

Kimi-K2 ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî Kimi-K2:   0%|          | 0/10 [00:00<?, ?it/s]

Kimi-K2 ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî Kimi-K2:   0%|          | 0/10 [00:00<?, ?it/s]

Kimi-K2 ‚Äî Binary Prompt 3 DONE.

Testing MODEL (Binary): Qwen-32B



Prompts (Qwen-32B):   0%|          | 0/3 [00:00<?, ?it/s]

Binary p1 ‚Äî Qwen-32B:   0%|          | 0/10 [00:00<?, ?it/s]

Qwen-32B ‚Äî Binary Prompt 1 DONE.


Binary p2 ‚Äî Qwen-32B:   0%|          | 0/10 [00:00<?, ?it/s]

Qwen-32B ‚Äî Binary Prompt 2 DONE.


Binary p3 ‚Äî Qwen-32B:   0%|          | 0/10 [00:00<?, ?it/s]

Qwen-32B ‚Äî Binary Prompt 3 DONE.

‚úÖ Evaluaci√≥n BINARIA terminada. CSV guardado correctamente.


In [9]:
from tqdm.auto import tqdm
from groq import RateLimitError
import pandas as pd

RESULTS = []

BATCH_SIZE = 20
NUM_RECORDS = 200

finMulti = subsetMulti.sample(NUM_RECORDS).copy()

SKIP_MODELS = []
print("‚è≠Ô∏è Skipping models:", SKIP_MODELS)

for model_name, model_id in tqdm(MODELS.items(), desc="Modelos", total=len(MODELS)):

    if model_name in SKIP_MODELS:
        print(f"\n‚è≠Ô∏è SKIPPING MODEL (hardcoded): {model_name}\n")
        continue

    print(f"\n==========================")
    print(f"Testing MODEL: {model_name}")
    print("==========================\n")

    try:
        for prompt_version in tqdm([1, 2, 3], desc=f"Prompts ({model_name})"):

            col = f"{model_name}_p{prompt_version}"
            if col in finMulti.columns:
                print(f"‚è≠Ô∏è {col} already exists, skipping")
                continue

            preds_multi = []
            batches_multi = range(0, len(finMulti), BATCH_SIZE)

            for i in tqdm(
                batches_multi,
                desc=f"Multi p{prompt_version} ‚Äî {model_name}",
                leave=False
            ):
                batch = finMulti["text"].iloc[i:i+BATCH_SIZE].tolist()
                preds_multi.extend(
                    clasificarMulti_batch(
                        batch,
                        prompt_version,
                        df=subsetMulti,
                        model=model_id
                    )
                )

            finMulti[col] = preds_multi
            print(f"{model_name} ‚Äî Prompt {prompt_version} DONE.")

        finMulti.to_csv(f"resultados_multiclass_{model_name}.csv", index=False)

    except Exception as e:
        print(f"\n‚ö†Ô∏è Error en modelo {model_name}: {e}")
        print("‚è≠Ô∏è Saltando este modelo y continuando.\n")
        continue

finMulti.to_csv("Mtempraw.csv", index=False)
print("\n‚úÖ Evaluaci√≥n terminada. CSV guardado correctamente.")


‚è≠Ô∏è Skipping models: []


Modelos:   0%|          | 0/7 [00:00<?, ?it/s]


Testing MODEL: LLaMA-8B



Prompts (LLaMA-8B):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî LLaMA-8B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-8B ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî LLaMA-8B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-8B ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî LLaMA-8B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-8B ‚Äî Prompt 3 DONE.

Testing MODEL: LLaMA-70B



Prompts (LLaMA-70B):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî LLaMA-70B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-70B ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî LLaMA-70B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-70B ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî LLaMA-70B:   0%|          | 0/10 [00:00<?, ?it/s]

LLaMA-70B ‚Äî Prompt 3 DONE.

Testing MODEL: Maverick-17B



Prompts (Maverick-17B):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî Maverick-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Maverick-17B ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî Maverick-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Maverick-17B ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî Maverick-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Maverick-17B ‚Äî Prompt 3 DONE.

Testing MODEL: Scout-17B



Prompts (Scout-17B):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî Scout-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Scout-17B ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî Scout-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Scout-17B ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî Scout-17B:   0%|          | 0/10 [00:00<?, ?it/s]

Scout-17B ‚Äî Prompt 3 DONE.

Testing MODEL: Allam-7B



Prompts (Allam-7B):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî Allam-7B:   0%|          | 0/10 [00:00<?, ?it/s]

Allam-7B ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî Allam-7B:   0%|          | 0/10 [00:00<?, ?it/s]

Allam-7B ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî Allam-7B:   0%|          | 0/10 [00:00<?, ?it/s]

Allam-7B ‚Äî Prompt 3 DONE.

Testing MODEL: Kimi-K2



Prompts (Kimi-K2):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî Kimi-K2:   0%|          | 0/10 [00:00<?, ?it/s]

Kimi-K2 ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî Kimi-K2:   0%|          | 0/10 [00:00<?, ?it/s]

Kimi-K2 ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî Kimi-K2:   0%|          | 0/10 [00:00<?, ?it/s]

Kimi-K2 ‚Äî Prompt 3 DONE.

Testing MODEL: Qwen-32B



Prompts (Qwen-32B):   0%|          | 0/3 [00:00<?, ?it/s]

Multi p1 ‚Äî Qwen-32B:   0%|          | 0/10 [00:00<?, ?it/s]

Qwen-32B ‚Äî Prompt 1 DONE.


Multi p2 ‚Äî Qwen-32B:   0%|          | 0/10 [00:00<?, ?it/s]

Qwen-32B ‚Äî Prompt 2 DONE.


Multi p3 ‚Äî Qwen-32B:   0%|          | 0/10 [00:00<?, ?it/s]

Qwen-32B ‚Äî Prompt 3 DONE.

‚úÖ Evaluaci√≥n terminada. CSV guardado correctamente.


In [10]:
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import os

print("===== üß† BINARY CLASSIFICATION (ALL MODELS) =====")

rows = []

def normalize_binary(label):
    """
    Normaliza salidas del modelo (en espa√±ol o ingl√©s)
    y las mapea a:
    - Hope
    - Not Hope
    """
    if not isinstance(label, str):
        return "Not Hope"

    t = label.strip().lower()

    # Espa√±ol
    if "no esperanza" in t:
        return "Not Hope"
    if t == "esperanza":
        return "Hope"
    if "esperanza" in t and "no" not in t:
        return "Hope"

    # Ingl√©s
    if "not hope" in t:
        return "Not Hope"
    if t == "hope":
        return "Hope"
    if "hope" in t and "not" not in t:
        return "Hope"

    return "Not Hope"

os.makedirs("used_prompts", exist_ok=True)

for model_name in MODELS.keys():
    print(f"\n======================")
    print(f"MODEL: {model_name}")
    print("======================")

    for p in [1, 2, 3]:
        col = f"{model_name}_p{p}"

        if col not in finBin.columns:
            print(f"‚è≠Ô∏è Missing {col}, skipping")
            continue

        finBin[f"{col}_norm"] = finBin[col].apply(normalize_binary)

        # Ground truth en ingl√©s
        y_true = finBin["binary"]          # Hope / Not Hope
        y_pred = finBin[f"{col}_norm"]

        acc = accuracy_score(y_true, y_pred)
        report = classification_report(
            y_true,
            y_pred,
            output_dict=True,
            zero_division=0
        )

        macro_f1 = report["macro avg"]["f1-score"]

        invalid_outputs = sum(
            1 for x in finBin[col]
            if not isinstance(x, str)
            or (
                "hope" not in x.lower()
                and "esperanza" not in x.lower()
            )
        )

        print(f"\n--- Prompt {p} ---")
        print(f"Accuracy: {acc*100:.2f}%")
        print(f"Macro F1 (PRIMARY): {macro_f1:.4f}")
        print(classification_report(y_true, y_pred, zero_division=0))

        for label in ["Hope", "Not Hope"]:
            rows.append({
                "Model": model_name,
                "Prompt": p,
                "PromptType": ["zero-shot", "one-shot", "few-shot"][p-1],
                "Label": label,
                "Precision": report[label]["precision"],
                "Recall": report[label]["recall"],
                "F1": report[label]["f1-score"],
                "Support": report[label]["support"],
                "Accuracy": acc,
                "Primary_Macro_F1": macro_f1,
                "InvalidOutputs": invalid_outputs
            })

        rows.append({
            "Model": model_name,
            "Prompt": p,
            "PromptType": ["zero-shot", "one-shot", "few-shot"][p-1],
            "Label": "macro avg",
            "Precision": report["macro avg"]["precision"],
            "Recall": report["macro avg"]["recall"],
            "F1": macro_f1,
            "Support": report["macro avg"]["support"],
            "Accuracy": acc,
            "Primary_Macro_F1": macro_f1,
            "InvalidOutputs": invalid_outputs
        })

        rows.append({
            "Model": model_name,
            "Prompt": p,
            "PromptType": ["zero-shot", "one-shot", "few-shot"][p-1],
            "Label": "weighted avg",
            "Precision": report["weighted avg"]["precision"],
            "Recall": report["weighted avg"]["recall"],
            "F1": report["weighted avg"]["f1-score"],
            "Support": report["weighted avg"]["support"],
            "Accuracy": acc,
            "Primary_Macro_F1": macro_f1,
            "InvalidOutputs": invalid_outputs
        })

        with open(f"used_prompts/binary_{model_name}_prompt{p}.txt", "w", encoding="utf-8") as f:
            f.write(f"MODEL: {model_name}\nPROMPT VERSION: {p}\n")

df_binary_metrics = pd.DataFrame(rows)
df_binary_metrics.to_csv("Btemp.csv", index=False)

print("\n‚úÖ Saved: Btemp.csv")


===== üß† BINARY CLASSIFICATION (ALL MODELS) =====

MODEL: LLaMA-8B

--- Prompt 1 ---
Accuracy: 57.50%
Macro F1 (PRIMARY): 0.5683
              precision    recall  f1-score   support

        Hope       0.56      0.70      0.62       100
    Not Hope       0.60      0.45      0.51       100

    accuracy                           0.57       200
   macro avg       0.58      0.57      0.57       200
weighted avg       0.58      0.57      0.57       200


--- Prompt 2 ---
Accuracy: 56.50%
Macro F1 (PRIMARY): 0.5581
              precision    recall  f1-score   support

        Hope       0.55      0.69      0.61       100
    Not Hope       0.59      0.44      0.50       100

    accuracy                           0.56       200
   macro avg       0.57      0.56      0.56       200
weighted avg       0.57      0.56      0.56       200


--- Prompt 3 ---
Accuracy: 58.50%
Macro F1 (PRIMARY): 0.5812
              precision    recall  f1-score   support

        Hope       0.57      0.68   

In [11]:
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import os

print("===== üé® MULTICLASS CLASSIFICATION (ALL MODELS) =====")

rows = []

# ============================
# Etiquetas finales (INGL√âS)
# ============================
VALID_LABELS = [
    "Generalized Hope",
    "Realistic Hope",
    "Unrealistic Hope"
]

# ============================
# Normalizaci√≥n ROBUSTA
# (Espa√±ol ‚Üí Ingl√©s + Ingl√©s)
# ============================
def normalize_multiclass(label):
    if not isinstance(label, str):
        return "Generalized Hope"

    t = label.strip().lower()

    # ----- Espa√±ol -----
    if "deseo" in t or "fant√°stico" in t or "fantastico" in t:
        return "Unrealistic Hope"
    if "realista" in t:
        return "Realistic Hope"
    if "abstracta" in t or "abstracto" in t:
        return "Generalized Hope"

    # ----- Ingl√©s -----
    if "fantasy" in t or "unrealistic" in t:
        return "Unrealistic Hope"
    if "realistic" in t:
        return "Realistic Hope"
    if "abstract" in t or "generalized" in t:
        return "Generalized Hope"

    return "Generalized Hope"


# ============================
# Guardar prompts usados
# ============================
os.makedirs("used_prompts", exist_ok=True)

# ============================
# LOOP POR MODELO Y PROMPT
# ============================
for model_name in MODELS.keys():
    print(f"\n======================")
    print(f"MODEL: {model_name}")
    print("======================")

    for p in [1, 2, 3]:
        col = f"{model_name}_p{p}"

        if col not in finMulti.columns:
            print(f"‚è≠Ô∏è Missing {col}, skipping")
            continue

        # Normalizar predicciones
        finMulti[f"{col}_norm"] = finMulti[col].apply(normalize_multiclass)

        # Ground truth en INGL√âS
        y_true = finMulti["multiclass"]      # Generalized / Realistic / Unrealistic Hope
        y_pred = finMulti[f"{col}_norm"]

        acc = accuracy_score(y_true, y_pred)
        report = classification_report(
            y_true,
            y_pred,
            output_dict=True,
            zero_division=0
        )

        macro_f1 = report["macro avg"]["f1-score"]

        # Conteo de outputs inv√°lidos (antes de normalizar)
        invalid_outputs = sum(
            1 for x in finMulti[col]
            if not isinstance(x, str)
            or (
                all(
                    kw not in x.lower()
                    for kw in [
                        "abstract", "general",
                        "realistic", "realista",
                        "fantasy", "fant√°stico", "fantastico",
                        "unrealistic", "deseo"
                    ]
                )
            )
        )

        print(f"\n--- Prompt {p} ---")
        print(f"Accuracy: {acc*100:.2f}%")
        print(f"Macro F1 (PRIMARY): {macro_f1:.4f}")
        print(classification_report(y_true, y_pred, zero_division=0))

        # ============================
        # M√©tricas por clase
        # ============================
        for label in VALID_LABELS:
            rows.append({
                "Model": model_name,
                "Prompt": p,
                "PromptType": ["zero-shot", "one-shot", "few-shot"][p-1],
                "Label": label,
                "Precision": report[label]["precision"],
                "Recall": report[label]["recall"],
                "F1": report[label]["f1-score"],
                "Support": report[label]["support"],
                "Accuracy": acc,
                "Primary_Macro_F1": macro_f1,
                "InvalidOutputs": invalid_outputs
            })

        # Macro avg
        rows.append({
            "Model": model_name,
            "Prompt": p,
            "PromptType": ["zero-shot", "one-shot", "few-shot"][p-1],
            "Label": "macro avg",
            "Precision": report["macro avg"]["precision"],
            "Recall": report["macro avg"]["recall"],
            "F1": macro_f1,
            "Support": report["macro avg"]["support"],
            "Accuracy": acc,
            "Primary_Macro_F1": macro_f1,
            "InvalidOutputs": invalid_outputs
        })

        # Weighted avg
        rows.append({
            "Model": model_name,
            "Prompt": p,
            "PromptType": ["zero-shot", "one-shot", "few-shot"][p-1],
            "Label": "weighted avg",
            "Precision": report["weighted avg"]["precision"],
            "Recall": report["weighted avg"]["recall"],
            "F1": report["weighted avg"]["f1-score"],
            "Support": report["weighted avg"]["support"],
            "Accuracy": acc,
            "Primary_Macro_F1": macro_f1,
            "InvalidOutputs": invalid_outputs
        })

        # Guardar prompt usado (referencia metodol√≥gica)
        with open(f"used_prompts/multiclass_{model_name}_prompt{p}.txt", "w", encoding="utf-8") as f:
            f.write(f"MODEL: {model_name}\nPROMPT VERSION: {p}\n")


# =========================
# Guardar CSV final
# =========================
df_multi_metrics = pd.DataFrame(rows)
df_multi_metrics.to_csv(
    "Mtemp.csv",
    index=False
)

print("\n‚úÖ Saved: Mtemp.csv")


===== üé® MULTICLASS CLASSIFICATION (ALL MODELS) =====

MODEL: LLaMA-8B

--- Prompt 1 ---
Accuracy: 45.50%
Macro F1 (PRIMARY): 0.4253
                  precision    recall  f1-score   support

Generalized Hope       0.40      0.75      0.52        72
  Realistic Hope       0.55      0.34      0.42        68
Unrealistic Hope       0.58      0.23      0.33        60

        accuracy                           0.46       200
       macro avg       0.51      0.44      0.43       200
    weighted avg       0.51      0.46      0.43       200


--- Prompt 2 ---
Accuracy: 46.00%
Macro F1 (PRIMARY): 0.4370
                  precision    recall  f1-score   support

Generalized Hope       0.41      0.71      0.52        72
  Realistic Hope       0.53      0.38      0.44        68
Unrealistic Hope       0.58      0.25      0.35        60

        accuracy                           0.46       200
       macro avg       0.51      0.45      0.44       200
    weighted avg       0.50      0.46      0

In [12]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import os

os.makedirs("confusion_matrices", exist_ok=True)

for model_name in MODELS.keys():
    for p in [1, 2, 3]:
        col = f"{model_name}_p{p}"
        if col not in finMulti.columns:
            continue

        y_true = finMulti["multiclass"]
        y_pred = finMulti[col].apply(normalize_multiclass)

        labels = [
            "Generalized Hope",
            "Realistic Hope",
            "Unrealistic Hope"
        ]

        cm = confusion_matrix(y_true, y_pred, labels=labels)
        disp = ConfusionMatrixDisplay(cm, display_labels=labels)

        plt.figure(figsize=(6, 6))
        disp.plot(cmap="Blues", values_format="d")
        plt.title(f"{model_name} ‚Äî Prompt {p}")
        plt.savefig(f"confusion_matrices/{model_name}_prompt{p}.png")
        plt.close()


  fig, ax = plt.subplots()
  plt.figure(figsize=(6, 6))


<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>

<Figure size 600x600 with 0 Axes>