In [None]:
# Print the active gcloud CLI configuration; presumably a sanity check of the
# account/project before Vertex AI is initialised below — TODO confirm intent.
!gcloud config list

In [None]:
!pip install datasets

In [1]:
from vertexai import init
from vertexai.preview.generative_models import GenerativeModel

# Vertex AI configuration, gathered in one place so it is easy to spot and change.
PROJECT_ID = "gen-lang-client-0947040378"
LOCATION = "us-central1"
MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Initialise the SDK for the project/region, then create the model handle that
# the evaluation and judging cells below call `generate_content` on.
init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel(MODEL_NAME)

In [3]:
import json
import re
import pandas as pd
from tqdm import tqdm
import glob
import time
from vertexai.preview.generative_models import GenerativeModel

def extract_choices_from_prompt(text):
    """Return the set of option letters (A/B/C) that label a choice in ``text``.

    A letter counts as a label when, after upper-casing the text, it starts at
    a word boundary and is followed by optional whitespace and a colon
    (e.g. ``"a: ..."`` or ``"B : ..."``). Returns an empty set if none is found.
    """
    option_label = re.compile(r"\b([ABC])\s*:")
    normalized = text.upper()
    return {found.group(1) for found in option_label.finditer(normalized)}

def extract_choices_from_response(text):
    """Extract the answer letters that follow a "Răspuns" marker in ``text``.

    The text is upper-cased, then searched for ``RĂSPUNS`` (optionally followed
    by ``CORECT`` and/or a colon) and a list of letters A/B/C separated by
    whitespace or commas.

    Each letter must be a standalone token (word boundary on its right).
    Without that, the first letter of the following word was swallowed as an
    answer: "Răspuns: B\\nConform ..." produced {"B", "C"} and
    "Răspuns: Corect ..." produced {"C"}.

    Args:
        text: Raw response text (model output or gold answer).

    Returns:
        A set of letters such as ``{"A", "C"}``, or ``None`` when no
        well-formed answer marker is present.
    """
    text = text.upper()
    match = re.search(
        r"RĂSPUNS(?:\s+CORECT)?\s*[:：]?\s*([ABC]\b(?:[\s,]+[ABC]\b)*)",
        text,
    )
    if match:
        return set(re.findall(r"[ABC]", match.group(1)))
    return None

def evaluate_gemini(model, dataset, max_examples=None, verbose=False, csv_path=None,multimodal=False):
    """Run ``model`` over a QA dataset and score exact-match accuracy on the answer letters.

    For each example a prompt is built from ``ex["prompt"]`` plus fixed answer
    instructions, sent through ``model.generate_content``, and the letters
    parsed from the reply are compared (as sets) with the letters parsed from
    ``ex["response"]``.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.
        dataset: Iterable of mapping-like examples with ``"prompt"`` and
            ``"response"`` keys (and optionally ``"image"``).
        max_examples: Stop after this many examples; a falsy value (``None``
            or ``0``) means no limit.
        verbose: Print prompt, gold, prediction and raw reply for each example.
        csv_path: If given, write the per-example results to this CSV file.
        multimodal: If true, prepend the string form of ``ex["image"]`` and a
            newline to the prompt. The image is passed as text, not as an
            image part.

    Returns:
        ``(results, accuracy)`` where ``results`` is a list of dicts with keys
        ``index``, ``prompt``, ``golds``, ``preds``, ``gemini_raw`` and
        ``accuracy`` is exact matches divided by the number of examples that
        completed without an exception (0 if none did).
    """
    # Single source of truth for the instructions appended to every question,
    # so the text-only and multimodal prompts cannot drift apart.
    answer_instructions = (
        "\nMai întâi oferă doar litera/literele răspunsului corect sub forma:\nRăspuns: A\n"
        + "Apoi oferă o explicație succintă pe un rând nou, fără 'Let's think'.\n"
        + "Răspuns:"
    )

    results = []
    exact_matches = 0
    failed = 0

    for i, ex in enumerate(tqdm(dataset, desc="Evaluare QA", unit="ex")):
        if max_examples and i >= max_examples:
            break
        try:
            question = ex["prompt"].strip()
            prompt = question + answer_instructions
            if multimodal:
                # A present-but-None image must not become the literal text
                # "None" in the prompt, which str(None) would produce.
                image_value = ex.get("image")
                image_text = "" if image_value is None else str(image_value).strip()
                prompt = image_text + "\n" + prompt

            gold = extract_choices_from_response(ex["response"])

            response = model.generate_content(prompt)

            # Guard against a reply whose text is missing or None; any error
            # raised while reading it is handled by the except below.
            text = (getattr(response, "text", "") or "").strip()
            pred = extract_choices_from_response(text)

            if verbose:
                print(f"\n📘 Prompt: {prompt}")
                print(f"✅ Gold: {gold}")
                print(f"🧠 Pred: {pred}")
                print(f"📝 Gemini raw: {text}")

            # An unparseable gold or prediction never counts as a match.
            if gold and pred and gold == pred:
                exact_matches += 1

            results.append({
                "index": str(i),
                "prompt": question,
                "golds": ", ".join(sorted(gold)) if gold else "",
                "preds": ", ".join(sorted(pred)) if pred else "",
                "gemini_raw": text
            })

        except Exception as e:
            # Best effort: log and move on so one bad example or API error
            # does not abort the whole run.
            failed += 1
            print(f"[❌ EROARE] {e}")
            continue

    # Accuracy is computed over successfully processed examples only.
    accuracy = exact_matches / len(results) if results else 0
    print(f"\n📊 Acuratețe totală (exact match): {accuracy:.2%}")
    if failed:
        # Make the reduced denominator visible instead of silently dropping rows.
        print(f"⚠️ Exemple omise din cauza erorilor: {failed}")

    # Export CSV
    if csv_path:
        df = pd.DataFrame(results)
        df.to_csv(csv_path, index=False, encoding="utf-8")
        print(f"📁 Rezultate salvate în: {csv_path}")

    return results, accuracy


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
# Read the JSONL corpus: one JSON object per line.
with open("multimodal.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. an extra empty line at the end of the file),
    # which json.loads would reject with a JSONDecodeError.
    dataset = [json.loads(line) for line in f if line.strip()]

# The file is closed here; everything below works on the in-memory records.
# NOTE(review): the name says "text_only" but the source file is multimodal.jsonl.
dataset_text_only = Dataset.from_list(dataset)

# Fixed seed so the 80/20 split (and the test indices used later) is reproducible.
split_dataset = dataset_text_only.train_test_split(test_size=0.2, seed=42)
train_ds = split_dataset["train"]
test_ds = split_dataset["test"]
print(test_ds[1])

In [None]:
# Evaluate the model on the held-out test split.
# - max_examples=250 caps the number of examples sent to the model.
# - multimodal=True prepends each example's "image" field, as text, to the prompt.
# - verbose=True prints the prompt, gold answer, prediction and raw reply per example.
# Per-example results are written to the CSV below; the judging cell later picks
# it up through its "*.csv" glob.
results, acc = evaluate_gemini(
    model,
    dataset=test_ds,
    max_examples=250,
    verbose=True,
    multimodal=True,
    csv_path="gemini2.5_flash_qa_results.csv",
)

In [None]:
# Re-run the evaluation on a hand-picked subset of test rows.
# NOTE(review): presumably these are rows that failed or were skipped in the full
# run ("completion" in the CSV name suggests so) — confirm.
# The "index" column written by evaluate_gemini is the position within this
# filtered subset (0..4), not the original test_ds index listed here.
selected_indices = [38,51,63,111,199]
filtered_test_ds = test_ds.select(selected_indices)
# This overwrites `results` and `acc` from the full run above.
# max_examples=250 has no effect here because only 5 rows are selected.
results, acc = evaluate_gemini(
    model,
    dataset=filtered_test_ds,
    max_examples=250,
    verbose=True,
    multimodal=True,
    csv_path="completion_gemini2.5_flash_qa_results.csv",
)

In [5]:
def create_judging_prompt(row):
    """Build the Romanian LLM-as-judge prompt for one evaluated example.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'prompt'``, ``'golds'``, ``'preds'`` and ``'response'``.

    Returns:
        The prompt string. It starts and ends with a newline and asks the
        judge to reply with a "Scor (între 0 și 5)" line and a "Justificare"
        line.
    """
    question = row['prompt']
    gold_answer = row['golds']
    predicted_answer = row['preds']
    explanation = row['response']

    # One list entry per output line; the empty first/last entries produce the
    # leading and trailing newline of the prompt. Trailing spaces are intentional
    # and kept exactly as in the prompt text sent to the judge.
    prompt_lines = [
        "",
        "Ești un evaluator expert în QA. Îți dau un task QA:",
        "",
        f"Întrebare: {question}",
        f"Răspuns corect: {gold_answer}",
        f"Răspuns prezis: {predicted_answer}",
        f"Explicație: {explanation}",
        "",
        "Evalueaza NUMAI calitatea explicației. Este explicația în concordanță cu răspunsul prezis? Dar cu cel corect? Are legătură cu întrebarea ",
        "",
        "Judecă în acest format:",
        "- Scor (între 0 și 5): ",
        "- Justificare:",
        "",
    ]
    return "\n".join(prompt_lines)

def extract_score(judgment_text):
    """Parse the numeric score out of a judge reply.

    The judging prompt built by ``create_judging_prompt`` asks for a line of
    the form ``- Scor (între 0 și 5): N``. The previous pattern only matched
    the English ``Score (0 to 5): N``, so replies in the requested Romanian
    format always yielded ``None``.

    Accepted forms (first match wins):
      * the original English pattern, tried first so text it already parsed
        keeps producing the same score;
      * ``Scor`` / ``Score`` / ``Scorul`` (any case), an optional parenthesised
        range, optional ``:``, ``*``, ``-`` or whitespace, then one digit 0-5
        that is not followed by another digit.

    Args:
        judgment_text: Raw text returned by the judge model.

    Returns:
        The score as an ``int``, or ``None`` if no score is found or the
        input is not a string.
    """
    if not isinstance(judgment_text, str):
        return None

    patterns = (
        # Original English format.
        r'Score\s*\(?\d\s*to\s*5\)?:?\s*(\d)',
        # Romanian format requested by the prompt; tolerates markdown bold and
        # skips the "(între 0 și 5)" range so its digits are not read as the score.
        r'(?i)\bscor(?:e|ul)?\b\s*(?:\([^)\n]*\))?[\s:：*\-]*([0-5])(?!\d)',
    )
    for pattern in patterns:
        match = re.search(pattern, judgment_text)
        if match:
            return int(match.group(1))
    return None

In [None]:
# Judge the explanation stored in every results CSV in the working directory and
# write the judge's reply and parsed score back into the same file.
model_files = glob.glob("*.csv")

# Columns create_judging_prompt reads from each row.
required_columns = {"prompt", "golds", "preds", "response"}

for file in model_files:
    df = pd.read_csv(file)
    if 'gemini_raw' in df.columns:
        # evaluate_gemini stores the raw reply as 'gemini_raw'; the judging
        # prompt expects it under 'response'.
        df = df.rename(columns={'gemini_raw': 'response'})

    # "*.csv" also matches unrelated files; skip them instead of failing on every row.
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        print(f"Skipping {file}: missing columns {sorted(missing_columns)}")
        continue

    judgments = []
    scores = []

    for i, row in df.iterrows():
        # Placeholders keep both lists aligned with df even when a row fails.
        # Previously a single exception left the lists shorter than df, so the
        # column assignment below raised ValueError and all judgments were lost.
        judgment = None
        score = None
        try:
            prompt = create_judging_prompt(row)
            response = model.generate_content(prompt)
            judgment = response.text.strip()
            score = extract_score(judgment)
            print(f'Completed row {i} for file {file}')
        except Exception as e:
            print(f"Exception occured: {e}")
        judgments.append(judgment)
        scores.append(score)

    df["judgment"] = judgments
    df["score"] = scores
    df.to_csv(file, index=False)
    print(f"Updated: {file}")
