## Instalare de dependinte si setup de environment

In [None]:
# Install the third-party packages this notebook imports (skip if the environment already has them).
%pip install datasets transformers peft accelerate bitsandbytes evaluate

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
import json
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from accelerate import infer_auto_device_map
from huggingface_hub import login

In [None]:
# Authenticate against the Hugging Face Hub.
# NOTE(review): "token" is a placeholder; do not commit a real token into the notebook.
login(token="token") # Replace "token" with your actual Hugging Face token

In [None]:
# Shell out to nvidia-smi to inspect the GPU available to this runtime.
!nvidia-smi

## Incarcarea dataset-urilor

In [None]:
def incarca_dataset(hf_repo_id: str, split: str = "train"):
    """
    Load one split of a dataset from the Hugging Face Hub.

    Any exception raised while loading (or while reporting the size) is
    caught, printed, and turned into a ``None`` return value.

    Args:
        hf_repo_id (str): Dataset identifier on the Hugging Face Hub.
        split (str): Name of the split to load (e.g. "train", "test").

    Returns:
        The loaded dataset object, or None when loading failed.
    """
    loaded = None
    try:
        loaded = load_dataset(hf_repo_id, split=split)
        print(f"Dataset √ÆncƒÉrcat cu succes! Nr exemple: {len(loaded)} ")
    except Exception as err:
        print(f"Eroare la √ÆncƒÉrcarea datasetului: {err}")
        return None
    return loaded

### Pentru dataset local

In [None]:
def load_json(filepath):
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        filepath (str): Path to the .json file.

    Returns:
        list | dict: The decoded JSON value.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed


In [None]:
def concat_json_lists(*json_arrays):
    """
    Concatenate any number of JSON lists (lists of dictionaries).

    Args:
        *json_arrays: The lists to merge, each one a list[dict].

    Returns:
        list: A new list holding the dictionaries of every input, in order.

    Raises:
        TypeError: If an argument is not a list or contains a non-dict item.
    """
    combined = []
    for candidate in json_arrays:
        is_list_of_dicts = isinstance(candidate, list) and all(
            isinstance(item, dict) for item in candidate
        )
        if not is_list_of_dicts:
            raise TypeError("Toate elementele trebuie sƒÉ fie liste de dic»õionare JSON.")
        combined += candidate
    return combined


In [None]:
from datasets import Dataset

def json_to_dataset(json_list):
    """
    Build a Hugging Face Dataset from a list of JSON dictionaries.

    Args:
        json_list (list): The rows, given as a list of dictionaries.

    Returns:
        datasets.Dataset: Dataset created with ``Dataset.from_list``.

    Raises:
        ValueError: If the input is not a list made only of dictionaries.
    """
    if isinstance(json_list, list) and all(isinstance(row, dict) for row in json_list):
        return Dataset.from_list(json_list)
    raise ValueError("Inputul trebuie sƒÉ fie o listƒÉ de dic»õionare.")


In [None]:
# Load the two local question files (with and without images).
data_images = load_json("questions_with_images.json")
data_no_images = load_json("questions_without_images.json")

# Concatenate both lists into a single list of questions
merged = concat_json_lists(data_images,data_no_images)
# NOTE(review): this prints the entire merged list, which can flood the cell output.
print(merged)

# Create the Hugging Face datasets: the merged one plus one per source file
auto_dataset_local = json_to_dataset(merged)
auto_dataset_local_images = json_to_dataset(data_images)
auto_dataset_local_no_images = json_to_dataset(data_no_images)

# Show the first 3 examples
print(auto_dataset_local[:3])

In [None]:
# print(auto_dataset[0])

## Preprocesare pentru LLM text-only

In [None]:
def preprocess_llm_text_only(example, include_image_description=True):
    """
    Turn one raw question into a prompt/response pair for text-only LLM fine-tuning.

    Fields that are missing or explicitly ``None`` are treated as empty, so
    rows with absent fields do not crash the preprocessing.

    Args:
        example (dict): A question record. Reads the keys "question",
            "image_description", "answers", "correct_answers" and "explanation".
        include_image_description (bool): Whether to add the image description
            (when one is present) to the prompt.

    Returns:
        dict: {"prompt": ..., "response": ...}. The prompt lists the answers
        labelled A, B, C, ...; the response names the letters of the correct
        answers followed by the explanation.

    Raises:
        ValueError: If the example has more than 26 answers (one per letter).
    """
    # `or` covers both a missing key and an explicit None value.
    question = (example.get("question") or "").strip()
    raw_desc = example.get("image_description")
    image_desc = raw_desc.strip() if isinstance(raw_desc, str) else "fƒÉrƒÉ descriere"
    answers = example.get("answers") or []
    # Normalised once so the membership test below is case/whitespace-insensitive.
    correct = {c.strip().lower() for c in (example.get("correct_answers") or [])}
    explanation = (example.get("explanation") or "").strip()

    if len(answers) > 26:
        raise ValueError("Maximum number of answers exceeded (26)")

    option_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    options = [f"{option_letters[i]}: {ans.upper()}" for i, ans in enumerate(answers)]
    options_str = "\n".join(options)

    prompt = f"√éntrebare: {question}"
    # The placeholder string marks "no description" and must never reach the prompt.
    if include_image_description and image_desc and image_desc.lower() != "fƒÉrƒÉ descriere":
        prompt += f"\nDescriere imagine: {image_desc}"
    prompt += "\nAlege unul sau mai multe rƒÉspunsuri corecte:\n" + options_str

    response_letters = [
        option_letters[i]
        for i, ans in enumerate(answers)
        if ans.strip().lower() in correct
    ]

    # Cap long explanations at 1000 characters, cutting back to the last full stop.
    if len(explanation) > 1000:
        explanation = explanation[:1000].rsplit('.', 1)[0] + "..."

    response = (
        f"RƒÉspuns corect: {', '.join(response_letters)}\n\n"
        f"Explica»õie: {explanation}"
    )

    return {
        "prompt": prompt.strip(),
        "response": response
    }


In [None]:
# Apply the text-only preprocessing to every example and show the first result.
text_only_prompts = auto_dataset_local.map(preprocess_llm_text_only)
print(text_only_prompts[0])

## Preprocesare pentru LLM multimodal

In [None]:
def preprocess_llm_multimodal(example, image_source_key="image_path", include_image_description=False):
    """
    Turn one raw question into an image/prompt/response triple for multimodal fine-tuning.

    Fields that are missing or explicitly ``None`` are treated as empty, so
    rows with absent fields do not crash the preprocessing.

    Args:
        example (dict): A question record. Reads the keys "question",
            "image_description", "answers", "correct_answers", "explanation"
            and the key named by ``image_source_key``.
        image_source_key (str): Key under which the image reference is stored
            (e.g. "image_path").
        include_image_description (bool): Whether to add the image description
            (when one is present) to the prompt.

    Returns:
        dict: {"image": ..., "prompt": ..., "response": ...}. "image" is the
        value found under ``image_source_key`` (None when absent).

    Raises:
        ValueError: If the example has more than 26 answers (one per letter).
    """
    # `or` covers both a missing key and an explicit None value.
    question = (example.get("question") or "").strip()
    image_url = example.get(image_source_key, None)
    raw_desc = example.get("image_description")
    image_desc = raw_desc.strip() if isinstance(raw_desc, str) else "fƒÉrƒÉ descriere"
    answers = example.get("answers") or []
    # Normalised once so the membership test below is case/whitespace-insensitive.
    correct = {c.strip().lower() for c in (example.get("correct_answers") or [])}
    explanation = (example.get("explanation") or "").strip()

    # Same limit as the text-only preprocessing: one letter per answer.
    if len(answers) > 26:
        raise ValueError("Maximum number of answers exceeded (26)")

    option_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    options = [f"{option_letters[i]}: {ans.upper()}" for i, ans in enumerate(answers)]
    options_str = "\n".join(options)

    # The leading context sentence tells the model whether an image accompanies the question.
    if image_url is None:
        context = "Nu este furnizatƒÉ nicio imagine. RƒÉspunde doar pe baza √ÆntrebƒÉrii de mai jos."
    else:
        context = "Imaginea con»õine o situa»õie rutierƒÉ. RƒÉspunde pe baza acesteia »ôi a √ÆntrebƒÉrii de mai jos."

    prompt = f"{context}\n√éntrebare: {question}"
    # The placeholder string marks "no description" and must never reach the prompt.
    if include_image_description and image_desc and image_desc.lower() != "fƒÉrƒÉ descriere":
        prompt += f"\nDescriere imagine: {image_desc}"
    prompt += "\nAlege unul sau mai multe rƒÉspunsuri corecte:\n" + options_str

    response_letters = [
        option_letters[i]
        for i, ans in enumerate(answers)
        if ans.strip().lower() in correct
    ]

    # Cap long explanations at 1000 characters, cutting back to the last full stop.
    if len(explanation) > 1000:
        explanation = explanation[:1000].rsplit('.', 1)[0] + "..."

    response = (
        f"RƒÉspuns corect: {', '.join(response_letters)}\n\n"
        f"Explica»õie: {explanation}"
    )

    return {
        "image": image_url,
        "prompt": prompt.strip(),
        "response": response
    }

In [None]:
# Apply the multimodal preprocessing to every example and show the first result.
multimodal_prompts = auto_dataset_local.map(preprocess_llm_multimodal)
print(multimodal_prompts[0])

## Salvare in format jsonl

In [None]:
def proceseaza_si_salveaza(dataset, mode="text_only", output_path="output.jsonl"):
    """
    Preprocess every example of a dataset and write the results as JSON Lines.

    The mode is validated before any work starts, so an invalid mode is
    rejected even for an empty dataset and never leaves an output file behind.

    Args:
        dataset: Iterable of raw question dictionaries.
        mode (str): "text_only" or "multimodal"; selects the preprocessing function.
        output_path (str): Destination .jsonl file (overwritten if it exists).

    Raises:
        ValueError: If ``mode`` is neither "text_only" nor "multimodal".
    """
    if mode not in ("text_only", "multimodal"):
        raise ValueError("Mode invalid: trebuie 'text_only' sau 'multimodal'")
    preprocess = preprocess_llm_text_only if mode == "text_only" else preprocess_llm_multimodal

    # Everything is preprocessed before the file is opened, so a failure on
    # any example does not leave a truncated output file.
    preprocessed = [preprocess(example) for example in tqdm(dataset)]

    with open(output_path, "w", encoding="utf-8") as f:
        for row in preprocessed:
            # ensure_ascii=False keeps the Romanian diacritics readable in the file.
            json.dump(row, f, ensure_ascii=False)
            f.write("\n")
    print(f"‚úÖ Salvat {len(preprocessed)} exemple √Æn {output_path}")

In [None]:
# Export the same local dataset twice: once per prompt format.
proceseaza_si_salveaza(auto_dataset_local, mode="text_only", output_path="text_only.jsonl")
proceseaza_si_salveaza(auto_dataset_local, mode="multimodal", output_path="multimodal.jsonl")

## Fine-tuning text-only LLMs

### Impartirea dataset-ului in train/test


In [None]:
# from dataset import load_dataset

# dataset_text_only = load_dataset("json", data_files="text_only.jsonl", split="train")
# split_dataset = dataset_text_only.train_test_split(test_size=0.2,seed=42)
# train_ds = split_dataset["train"]
# test_ds = split_dataset["test"]

In [None]:
def load_jsonl_as_list(filepath):
    """
    Read a UTF-8 JSON Lines file into a list of decoded objects.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising a JSON decoding error.

    Args:
        filepath (str): Path to the .jsonl file.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Reload the exported text-only examples and wrap them in a Dataset.
data = load_jsonl_as_list("text_only.jsonl")
dataset_text_only = Dataset.from_list(data)

# 80/20 train/test split; the fixed seed keeps the split reproducible across runs.
split_dataset = dataset_text_only.train_test_split(test_size=0.2, seed=42)
train_ds = split_dataset["train"]
test_ds = split_dataset["test"]


In [None]:
# Sanity check: show the first example of each split.
print(train_ds[0])
print(test_ds[0])

### Fine-tuning pentru LLM-uri text-only


In [None]:
# def analyze_model(model_id="OpenLLM-Ro/RoGemma2-9b-Instruct"):
#     print(f"\n=== AnalizƒÉ pentru modelul: {model_id} ===\n")

#     # √éncarcƒÉ tokenizerul
#     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#     print("[‚úì] Tokenizer √ÆncƒÉrcat.")

#     # √éncearcƒÉ sƒÉ √Æncarce modelul FP16 (pentru a evita probleme la √Ænceput)
#     try:
#         model = AutoModelForCausalLM.from_pretrained(
#             model_id,
#             device_map="cpu",  # evitƒÉ folosirea memoriei GPU
#             torch_dtype=torch.float16,
#             trust_remote_code=True
#         )
#         print("[‚úì] Modelul s-a √ÆncƒÉrcat cu succes √Æn fp16.")
#     except Exception as e:
#         print(f"[‚úó] Eroare la √ÆncƒÉrcarea modelului: {e}")
#         return

#     # Afi»ôeazƒÉ layerele care con»õin "proj" (utile pentru LoRA)
#     print("\n=== Layer(e) relevante pentru LoRA (care con»õin 'proj') ===")
#     for name, module in model.named_modules():
#         if "proj" in name or "query" in name or "value" in name:
#             print("  ‚Ä¢", name)

#     # VerificƒÉ dacƒÉ se poate folosi LoRA (PEFT)
#     try:
#         print("\n=== Verific LoRA compatibility ===")
#         model = prepare_model_for_kbit_training(model)
#         print("[‚úì] Modelul este compatibil cu PEFT »ôi LoRA (dupƒÉ pregƒÉtire).")
#     except Exception as e:
#         print(f"[‚úó] Modelul NU este compatibil cu PEFT direct: {e}")

#     # Test: √Æncercare de inferare device_map automatƒÉ (pentru quantizare + distribuit)
#     try:
#         device_map = infer_auto_device_map(model, no_split_module_classes=["DecoderLayer"], max_memory={"cpu": "10GiB"})
#         print("\n[‚úì] infer_auto_device_map a func»õionat. Modelul poate fi √ÆmpƒÉr»õit.")
#     except Exception as e:
#         print(f"[!] Device map automatƒÉ a e»ôuat: {e}")

#     print("\n=== Sf√¢r»ôitul analizei ===")

# analyze_model("OpenLLM-Ro/RoGemma2-9b-Instruct")  # sau alt model

In [None]:
def load_model(model_id="microsoft/phi-2", use_lora=False, lora_r=8, lora_alpha=16):
    """
    Load a causal LM in 4-bit together with its tokenizer, optionally wrapped with LoRA.

    Args:
        model_id (str): Hugging Face model identifier.
        use_lora (bool): When True, prepare the quantized model for k-bit
            training and attach LoRA adapters to the "q_proj" and "v_proj" modules.
        lora_r (int): LoRA rank passed to ``LoraConfig``.
        lora_alpha (int): LoRA alpha passed to ``LoraConfig``.

    Returns:
        tuple: (model, tokenizer).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # The EOS token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    # Load the quantized model. The 4-bit setting is passed through
    # BitsAndBytesConfig instead of the deprecated bare `load_in_4bit` kwarg.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        # attn_implementation="eager",  # recommended for Gemma2
        offload_folder="./offload"
    )

    if use_lora:
        model = prepare_model_for_kbit_training(model)

        config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            # NOTE(review): these module names must exist in the chosen
            # architecture; confirm when switching model_id.
            target_modules=[
                "q_proj",
                "v_proj",
            ],
            bias="none",
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, config)

    return model, tokenizer

In [None]:
def finetune_llm(model, tokenizer, dataset, output_dir="finetuned", max_steps=200):
    """
    Fine-tune a causal LM on prompt/response pairs and save the result.

    Each example is rendered as the prompt, the answer cue and the response,
    tokenized to a fixed length of 512, and fed to a ``Trainer`` run that
    stops after ``max_steps`` optimisation steps.

    Args:
        model: The model to train.
        tokenizer: Tokenizer matching the model.
        dataset: Dataset whose rows have "prompt" and "response" columns.
        output_dir (str): Directory for checkpoints and the final model/tokenizer.
        max_steps (int): Number of training steps.
    """
    def _encode_example(example):
        text = example["prompt"] + "\nRƒÉspuns:" + " " + example["response"]
        encoded = tokenizer(text, truncation=True, padding="max_length", max_length=512)
        # NOTE(review): the collator below presumably rebuilds labels from
        # input_ids as well - confirm whether this copy is still needed.
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    train_features = dataset.map(_encode_example, remove_columns=dataset.column_names)
    lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    training_kwargs = {
        "output_dir": output_dir,
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 4,
        "learning_rate": 2e-4,
        "logging_steps": 10,
        "save_steps": 50,
        "max_steps": max_steps,
        "fp16": True,
        "save_total_limit": 1,
        "report_to": "none",
    }
    training_args = TrainingArguments(**training_kwargs)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_features,
        data_collator=lm_collator,
    )

    first_param = next(model.parameters())
    print("Model dtype check:", first_param.dtype)
    trainer.train()

    # Persist the trained weights and the tokenizer side by side.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

## Fine-tuning multimodal

## Validare modele

In [None]:
import re
import csv
import os

def extract_choices_from_prompt(text):
    """
    Collect the option labels (A, B or C) that appear as 'A:', 'B:', ... in a prompt.

    Matching is case-insensitive because the text is upper-cased first.
    Returns a set of the labels found (empty when there are none).
    """
    label_pattern = r"\b([ABC])\s*:"
    found = set()
    for hit in re.finditer(label_pattern, text.upper()):
        found.add(hit.group(1))
    return found


# === Helper: Extract choices from a model or human response ===
def extract_choices_from_response(text):
    """
    Extract the answer letters (A, B, C) that follow the Romanian answer cue.

    The text is upper-cased and searched for the cue word (optionally
    followed by "CORECT" and a colon) and then one or more letters separated
    by whitespace or commas. Only standalone letters count: the first letter
    of a longer word (e.g. the C of "CORECT") is not an answer.

    Args:
        text (str): A model generation or an annotated reference response.

    Returns:
        set[str] | None: The letters found after the first matching cue, or
        None when no cue followed by a standalone letter is present.
    """
    text = text.upper()

    # Non-ASCII characters are written as escapes so the pattern survives
    # encoding mishaps: \u0102 is A-with-breve (the cue word upper-cased),
    # \uFF1A is the full-width colon.
    # The \b after each letter keeps the regex from backtracking onto the
    # initial of a following word.
    match = re.search(
        r"R\u0102SPUNS(?:\s+CORECT)?\s*[:\uFF1A]?\s*([ABC]\b(?:[\s,]+[ABC]\b)*)",
        text,
    )
    if match:
        return set(re.findall(r"[ABC]", match.group(1)))
    return None

def evaluate_model(model, tokenizer, dataset, max_new_tokens=128, log_progress=30, verbose=False, output_csv_path="evaluare_output.csv"):
    """
    Generate an answer for every example, score it by exact match and log each row to a CSV.

    For each example the prompt is extended with an instruction to answer
    with letters only, the model generates greedily, and the answer letters
    are parsed from the decoded text with ``extract_choices_from_response``.
    A failure on one example is printed and that example is skipped.

    Args:
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer matching the model.
        dataset: Sized iterable of rows with "prompt" and "response" fields.
        max_new_tokens (int): Generation budget per example.
        log_progress (int): With ``verbose``, print details every this many examples.
        verbose (bool): Print prompt, gold, prediction and decoded text periodically.
        output_csv_path (str): CSV file that rows are appended to; the header
            is written only when the file does not exist yet.

    Returns:
        tuple: (golds, preds, accuracy) where golds/preds are lists of letter
        sets and accuracy is the exact-match ratio over the scored examples.
    """
    model.eval()
    golds, preds = [], []
    exact_matches = 0

    # Append mode lets several runs share one file; checked before opening
    # because opening creates the file.
    file_exists = os.path.exists(output_csv_path)
    with open(output_csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["index", "prompt", "golds", "preds", "response"])
        if not file_exists:
            writer.writeheader()
        for i, ex in enumerate(tqdm(dataset, desc="Evaluare", unit="ex")):
            try:
                # Augment prompt with instruction. The leading newline keeps the
                # instruction from being glued onto the last answer option.
                prompt = (
                    ex["prompt"].strip()
                    + "\nOferƒÉ DOAR litera sau literele rƒÉspunsului corect \n"
                    + "RƒÉspuns:"
                )
                # Extract gold choices from annotated response
                gold = extract_choices_from_response(ex["response"])

                # Tokenize prompt and move every tensor to the model's device
                tokens = tokenizer(prompt, truncation=True, max_length=1024, return_tensors="pt")
                for k, v in tokens.items():
                    if v.dtype in (torch.float32, torch.float64):
                        tokens[k] = v.to(dtype=torch.float16, device=model.device)
                    else:
                        tokens[k] = v.to(model.device)

                # Generate output (greedy decoding)
                with torch.no_grad():
                    output = model.generate(
                        **tokens,
                        max_new_tokens=max_new_tokens,
                        do_sample=False,
                        pad_token_id=tokenizer.pad_token_id
                    )

                # NOTE(review): the whole output sequence is decoded, which
                # presumably still contains the prompt - the parser relies on
                # finding the answer cue in this text.
                decoded = tokenizer.decode(output[0], skip_special_tokens=True)
                pred = extract_choices_from_response(decoded)

                # Logging
                if verbose and (i % log_progress == 0 or i == len(dataset) - 1):
                    print("üìò Prompt:", prompt)
                    print("‚úÖ Gold:", gold)
                    print("üß† Pred:", pred)
                    print("üìù Decoded:", decoded)
                    print("-" * 60)

                # Store results
                # NOTE(review): examples with no parsable gold or prediction
                # are left out of the score, so accuracy is relative to the
                # parsed examples only, not to the whole dataset.
                if gold and pred:
                    golds.append(gold)
                    preds.append(pred)
                    if gold == pred:
                        exact_matches += 1
                writer.writerow({
                    "index": i,
                    "prompt": ex["prompt"],
                    "golds": ",".join(sorted(gold)) if gold else "",
                    "preds": ",".join(sorted(pred)) if pred else "",
                    "response": decoded
                })
                # Flush per row so partial results survive an interrupted run.
                f.flush()

                del tokens, output
                torch.cuda.empty_cache()
            except Exception as e:
                # Best effort: report the failure and move on to the next example.
                print(f"[‚ùå EROARE] {e}")
                continue

    print(f"\nüìù CSV salvat la: {output_csv_path}")

    total = len(golds)
    accuracy = exact_matches / total if total else 0
    print(f"\nüìä Acurate»õe totalƒÉ (exact match): {accuracy:.2%}")
    return golds, preds, accuracy

In [None]:
def stringify_choices(choices):
    """Return the choices as one string: sorted and separated by single spaces."""
    ordered = list(choices)
    ordered.sort()
    return " ".join(ordered)

def plot_confusion(golds, preds, labels=None, title="Confusion Matrix", save_path=None):
    """
    Draw the confusion matrix of gold vs. predicted labels.

    Args:
        golds: Gold labels.
        preds: Predicted labels, aligned with ``golds``.
        labels: Label order used for the matrix and the axis ticks.
        title (str): Title of the figure.
        save_path: When truthy, the figure is also saved there at 300 dpi.
    """
    matrix = confusion_matrix(golds, preds, labels=labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(
        cmap="Blues",
        xticks_rotation=45,
    )
    plt.title(title)
    # Save before show().
    if save_path:
        plt.savefig(save_path, dpi=300)
    plt.show()

def save_results(golds, preds, accuracy, path_csv="results.csv", path_img="confusion.png"):
    """
    Write gold/predicted answers to a CSV file and plot their confusion matrix.

    Args:
        golds: List of gold answer sets.
        preds: List of predicted answer sets, aligned with ``golds``.
        accuracy: Accepted for call compatibility; not used by this function.
        path_csv (str): Output CSV with "Gold" and "Pred" columns.
        path_img (str): Output image path for the confusion matrix.
    """
    # Each answer set becomes a label such as "A" or "A B".
    gold_labels = list(map(stringify_choices, golds))
    pred_labels = list(map(stringify_choices, preds))

    table = pd.DataFrame({"Gold": gold_labels, "Pred": pred_labels})
    table.to_csv(path_csv, index=False)

    # Every label seen on either side becomes an axis entry of the matrix.
    all_labels = sorted({*gold_labels, *pred_labels})
    plot_confusion(gold_labels, pred_labels, labels=all_labels, save_path=path_img)

def compute_accuracy_from_csv(csv_path, verbose=True):
    """
    Compute exact-match accuracy from a results CSV.

    The file must have 'Gold' and 'Pred' columns, or the lower-case
    'golds' and 'preds' columns. Each cell holds answer letters separated
    by spaces and/or commas (e.g. 'A', 'A C', 'B,C'); letter order and case
    do not matter. An empty cell is read as "no answer".

    Args:
        csv_path (str): Path to the CSV file.
        verbose (bool): Print the file name, totals and accuracy.

    Returns:
        float: Share of rows whose gold and predicted letter sets are equal
        (0.0 for a file without rows).

    Raises:
        ValueError: If the required columns are missing.
    """
    df = pd.read_csv(csv_path)

    # Accept the alternative lower-case column names.
    if 'golds' in df.columns and 'preds' in df.columns:
        df = df.rename(columns={'golds': 'Gold', 'preds': 'Pred'})
    if 'Gold' not in df.columns or 'Pred' not in df.columns:
        raise ValueError("CSV must contain 'Gold' and 'Pred' columns.")

    def to_set(cell):
        # Empty cells come back from pandas as NaN; treat them as "no answer"
        # rather than as the literal token "NAN".
        if pd.isna(cell):
            return set()
        tokens = re.split(r"[\s,]+", str(cell).strip().upper())
        return {tok for tok in tokens if tok}

    golds = df['Gold'].map(to_set)
    preds = df['Pred'].map(to_set)

    total = len(df)
    correct = sum(g == p for g, p in zip(golds, preds))
    accuracy = correct / total if total > 0 else 0.0

    if verbose:
        print(f"Fisier:{csv_path}")
        print(f"‚úÖ Total: {total}")
        print(f"‚úÖ Correct (Exact Matches): {correct}")
        print(f"üìä Accuracy: {accuracy:.2%}")

    return accuracy


### Testarea modelului: zero-shot

In [None]:
# Optional cleanup of cached models / offload folders (kept disabled):
# !rm -rf /root/.cache/huggingface
# !rm -rf /content/*model*
# !rm -rf /content/*tokenizer*
# !rm -rf ./offload
# NOTE(review): use_lora=True attaches LoRA adapters even though this section is the zero-shot test.
model_zero, tok_zero = load_model("OpenLLM-Ro/RoLlama3.1-8b-Instruct", use_lora=True)
# save_results(golds_0, preds_0, acc_0, "zero_results.csv", "zero_confusiont.png")


In [None]:
# def run_single_prompt(prompt, model, tokenizer, max_new_tokens=16):
#     model.eval()
#     device = next(model.parameters()).device  # detecteazƒÉ device-ul real (GPU / CPU / disk offload)
#     if device.type == "cpu":
#       print("‚ö†Ô∏è Aten»õie: modelul este pe CPU ‚Äî inferen»õa va fi foarte lentƒÉ.")
#     else:
#       print("‚úÖ Modelul este pe GPU:", device)
#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
#     inputs = {k: v.to(device) for k, v in inputs.items()}

#     with torch.no_grad():
#         output = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=False,
#             pad_token_id=tokenizer.pad_token_id
#         )

#     decoded = tokenizer.decode(output[0], skip_special_tokens=True)
#     print(decoded)
#     return decoded
# print(model_zero.hf_device_map)
# prompt = "√éntrebare: Ce semnificƒÉ indicatorul STOP?\nA: Oprire\nB: Prioritate\nC: Pericol\nRƒÉspuns:"
# run_single_prompt(prompt, model_zero, tok_zero)

In [None]:
import gc

# Run the garbage collector, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# # print(model_zero)
# golds_0, preds_0, acc_0 = evaluate_model(model_zero, tok_zero, test_ds, max_new_tokens=16, verbose = True)
# save_results(golds_0, preds_0, acc_0, "zero_results_OpenLLM-Ro_RoLlama3.1-8b-Instruct.csv", "zero_confusion_OpenLLM-Ro_RoLlama3.1-8b-Instruct.png")

### Testarea modelului: fine-tuning

In [None]:
# model_zero, tok_zero = load_model("OpenLLM-Ro/RoLlama3-8b-Instruct", use_lora=True)
# Fine-tune on the training split, then evaluate the same model object on
# the test split, logging per-example results to a CSV file.
finetune_llm(model_zero, tok_zero, train_ds, max_steps=200)
golds_ft, preds_ft, acc_ft = evaluate_model(model_zero, tok_zero, test_ds, verbose = True, max_new_tokens=200, output_csv_path="results_RoLlama3.1_finetune.csv")
# save_results(golds_ft, preds_ft, acc_ft, "finetune_results.csv", "finetune_confusion.png")

In [None]:
# Re-run the evaluation with a much smaller generation budget (10 new tokens), logging to a separate CSV.
golds_ft, preds_ft, acc_ft = evaluate_model(model_zero, tok_zero, test_ds, verbose = True, max_new_tokens=10, output_csv_path="results_RoLlama3.1_finetune_test.csv")

In [None]:
# compute_accuracy_from_csv("zero_results.csv")
# compute_accuracy_from_csv("finetune_results.csv")