In [None]:
!pip install tqdm together datasets

In [1]:
import json
import time
import requests
import re
import csv
import together
import os
from tqdm import tqdm
from datasets import Dataset

In [None]:
# Read the API key from the environment so the real secret is never stored in the
# notebook. The fallback is only a placeholder, not a usable key.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "nottherealkey")
MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"  # or another model you want to use
# NOTE(review): API_URL is not referenced by the code visible in this notebook.
API_URL = "https://api.together.xyz/inference"
together.api_key = TOGETHER_API_KEY

In [9]:
def extract_choices_from_response(text):
    """Extract the set of answer letters (A/B/C) from a free-text response.

    The text is upper-cased first, so matching is case-insensitive.

    1. If an answer marker is present ("RĂSPUNS" or "RASPUNS", optionally followed
       by "CORECT" and an ASCII or full-width colon), the letters listed right
       after it (separated by whitespace and/or commas) are returned.
    2. Otherwise every standalone A, B or C anywhere in the text is returned.

    Args:
        text: Response text; ``None`` or an empty value yields an empty set.

    Returns:
        set[str]: Upper-case letters among {"A", "B", "C"}; possibly empty.
    """
    if not text:
        return set()
    text = str(text).upper()
    # Non-ASCII characters are written as \u escapes so the pattern does not depend
    # on the encoding of this file (\u0102 = Ă, \uFF1A = full-width colon).
    # Each letter must end at a word boundary; otherwise the first letter of the
    # next word (e.g. "A\nCONFORM ...") would be read as an additional answer.
    match = re.search(
        r"R[\u0102A]SPUNS(?:\s+CORECT)?\s*[:\uFF1A]?\s*([ABC]\b(?:[\s,]+[ABC]\b)*)",
        text,
    )
    if match:
        return set(re.findall(r"[ABC]", match.group(1)))
    # No marker found: fall back to any standalone choice letter in the text.
    return set(re.findall(r"\b[ABC]\b", text))

def evaluate_model_together(dataset, model="meta-llama/Llama-3-70b-chat-hf",
                            max_tokens=128,
                            output_csv_path="evaluare_together_output.csv",
                            log_progress=10,
                            verbose=False,
                            index=0,
                            limit=1000,
                            temperature=0.6,
                            request_delay=1):
    """Query a Together chat model on multiple-choice examples and score exact matches.

    For every example whose position ``i`` satisfies ``index <= i <= limit`` a prompt
    is built from the example's ``image`` and ``text`` fields, sent to the model, and
    the answer letters parsed from the reply are compared with those parsed from the
    example's ``response`` field. One row per processed example is appended to the CSV.

    Args:
        dataset: Sized iterable of mapping-like examples (read via ``.get``) with the
            keys ``text``, ``response`` and optionally ``image``.
        model: Model identifier passed to the chat-completions endpoint.
        max_tokens: Maximum number of tokens requested per completion.
        output_csv_path: CSV file that results are appended to; the header is written
            only when the file is new or empty.
        log_progress: With ``verbose``, details are printed every ``log_progress``
            examples (and for the last example).
        verbose: Print prompt, gold, prediction and raw reply for logged examples.
        index: First example position to evaluate (inclusive).
        limit: Last example position to evaluate (inclusive).
        temperature: Sampling temperature sent with each request.
        request_delay: Seconds to sleep after each successful request.

    Returns:
        tuple: ``(golds, preds, accuracy)`` where ``golds`` and ``preds`` are parallel
        lists of letter sets for every scored example and ``accuracy`` is the share
        of exact set matches. Examples without a parsable gold answer are not scored;
        a reply without a parsable answer is scored as a miss (empty prediction).

    Any exception raised while processing an example is printed and that example is
    skipped (no CSV row, not scored).
    """
    golds, preds = [], []
    exact_matches = 0
    client = together.Together(api_key=TOGETHER_API_KEY)

    # Write the header only for a new or empty file; existing content is appended to.
    needs_header = (not os.path.exists(output_csv_path)
                    or os.path.getsize(output_csv_path) == 0)

    with open(output_csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["index", "prompt", "golds", "preds", "response"])
        if needs_header:
            writer.writeheader()

        for i, ex in enumerate(tqdm(dataset, desc=f"Evaluare {model}", unit="ex")):
            if i < index or i > limit:
                continue
            try:
                # Check whether the example carries an image value.
                image_value = ex.get("image")
                image_str = ""
                if not image_value:
                    print(f"Warning: Exercițiul {i} nu are imagine asociată.")
                else:
                    image_str = str(image_value).strip()
                # `or ""` keeps a missing/None field from turning into the text "None".
                raw_text = ex.get("text") or ""
                text_str = str(raw_text).strip()
                prompt = (
                    image_str + "\n"
                    + text_str + "\n"
                    + "Mai întâi oferă doar litera/literele răspunsului corect sub forma:\nRăspuns: A\n"
                    + "Apoi oferă o explicație succintă pe un rând nou. \n"
                    + "Răspuns:"
                )
                gold = extract_choices_from_response(ex.get("response") or "")

                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature
                )

                decoded = (response.choices[0].message.content or "").strip()
                pred = extract_choices_from_response(decoded)

                if verbose and (i % log_progress == 0 or i == len(dataset) - 1):
                    print(f"\n📘 Prompt [{i}]:", prompt)
                    print("✅ Gold:", gold)
                    print("🧠 Pred:", pred)
                    print("📝 Decoded:", decoded)
                    print("-" * 60)

                # Score every example that has a gold answer. An unparsable reply is
                # an empty prediction and counts as a miss instead of being dropped
                # from the denominator (which would inflate the accuracy).
                if gold:
                    golds.append(gold)
                    preds.append(pred)
                    if gold == pred:
                        exact_matches += 1

                writer.writerow({
                    "index": i,
                    "prompt": raw_text,
                    "golds": ",".join(sorted(gold)) if gold else "",
                    "preds": ",".join(sorted(pred)) if pred else "",
                    "response": decoded
                })
                # Flush after each row so partial results survive an interruption.
                f.flush()

                if request_delay:
                    time.sleep(request_delay)  # Respect the API rate limit

            except Exception as e:
                # Best effort: report the failure and move on to the next example.
                print(f"[❌ EROARE la index {i}] {e}")
                continue

    print(f"\n📝 Evaluare finalizată. CSV salvat la: {output_csv_path}")
    total = len(golds)
    accuracy = exact_matches / total if total else 0.0
    print(f"📊 Acuratețe totală (exact match): {accuracy:.2%}")
    return golds, preds, accuracy


In [None]:
def load_jsonl_as_list(filepath):
    """Read a JSON Lines file into a list with one decoded object per line.

    Blank or whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of a UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: The decoded JSON values in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Load the raw JSONL records and wrap them in a `datasets.Dataset`.
data = load_jsonl_as_list("multimodal.jsonl")
dataset_text_only = Dataset.from_list(data)

# 80/20 train/test split with a fixed seed so the split is reproducible.
split_dataset = dataset_text_only.train_test_split(test_size=0.2, seed=42)
train_ds, test_ds = split_dataset["train"], split_dataset["test"]

# Show the first held-out example as a quick sanity check.
print(test_ds[0])

In [None]:
# Evaluate the configured model on the held-out split; results go to the CSV below.
golds, preds, accuracy = evaluate_model_together(
    dataset=test_ds,
    model=MODEL_NAME,
    max_tokens=128,
    output_csv_path="multimodal_rezultate_llama4_scout.csv",
    log_progress=10,
    verbose=True,
)