In [None]:
import json
import re
import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report
import anthropic

    # Cautarea raspunsului corect
def extrage_litera_raspuns(text):
    """Extract the answer letter (a-f) from a model reply or a reference solution.

    Two formats are recognised, in this order:

    1. ``Răspuns corect: x`` -- matched case-insensitively, with or without
       the diacritic on the first vowel (``Raspuns corect: x``).
    2. ``\\boxed{x}`` -- LaTeX-style final answer, used only as a fallback.

    The first occurrence in the text wins.

    Args:
        text: The text to scan. Anything that is not a string is treated as
            "no answer found" instead of raising.

    Returns:
        The letter in lowercase, or ``"?"`` when neither format is present.
    """
    if not isinstance(text, str):
        return "?"
    # `[ăa]?` accepts both "Răspuns" and the diacritic-free "Raspuns"; the
    # optional vowel / optional "s" keep every spelling the previous pattern
    # accepted, so this is a strict superset of the old behaviour.
    pattern = r"r[ăa]?s?puns\s+corect\s*:\s*([a-f])\b"
    match = re.search(pattern, text, flags=re.IGNORECASE)
    if not match:
        match = re.search(r"\\boxed\{([a-fA-F])\}", text)
    return match.group(1).lower() if match else "?"

    # Incarcare shots
def incarca_shots(cale_fisier):
    """Build the few-shot context string from a JSONL file of example chats.

    Each non-blank line must be a JSON object with a ``"messages"`` list whose
    items carry ``"role"`` and ``"content"``. For every example, the first
    ``user`` message and the first ``chatbot``/``assistant`` message (roles are
    compared case-insensitively) are stripped and joined with a newline.
    Examples missing either side are skipped.

    Args:
        cale_fisier: Path to the JSONL file, or a falsy value (``None``/``""``)
            for zero-shot runs.

    Returns:
        The examples joined by blank lines, or ``""`` when no path is given or
        no complete example is found.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not cale_fisier:
        return ""
    with open(cale_fisier, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at the end of the file)
        # are not valid JSON, so skip them rather than crash.
        shots_data = [json.loads(line) for line in f if line.strip()]

    roluri_asistent = {"chatbot", "assistant"}
    exemple = []
    for ex in shots_data:
        messages = ex.get("messages", [])
        # `.get` keeps a message without "role"/"content" from aborting the
        # whole load; such a message simply never matches.
        user_msg = next(
            (m.get("content", "") for m in messages
             if str(m.get("role", "")).lower() == "user"),
            "",
        )
        bot_msg = next(
            (m.get("content", "") for m in messages
             if str(m.get("role", "")).lower() in roluri_asistent),
            "",
        )
        if user_msg and bot_msg:
            exemple.append(f"{user_msg.strip()}\n{bot_msg.strip()}")
    return "\n\n".join(exemple)

    # Functia de evaluare a modelului
def evalueaza_modelul_chat_claude(test_file_path, model_name, api_key, label="Set", output_csv=True, shots_file=None, silent=False):
    """Evaluate an Anthropic chat model on a multiple-choice JSONL test set.

    For every sample the first ``user`` message becomes the question and the
    first ``chatbot``/``assistant`` message is parsed for the reference letter.
    The question (optionally preceded by the few-shot context) is sent to the
    model one request at a time, and the letter parsed from the reply is
    compared with the reference. Samples whose reference or reply cannot be
    parsed get the label ``"?"``.

    Args:
        test_file_path: Path to the JSONL file with the samples to evaluate.
        model_name: Model identifier passed to the Messages API.
        api_key: Anthropic API key.
        label: Name of the run; used in printed messages, the progress bar and
            the output file name (``rezultate_<label>.csv``, lower-cased, with
            spaces replaced by underscores).
        output_csv: When true, write prompts, labels, predictions and raw
            outputs to the CSV file in the current working directory.
        shots_file: Optional JSONL file with few-shot examples; ``None`` means
            zero-shot.
        silent: When true, suppress every ``print`` issued by this function
            (the tqdm progress bar is still shown).

    Returns:
        The accuracy over all samples, as computed by ``accuracy_score``.

    Raises:
        OSError: If an input file cannot be opened.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    client = anthropic.Anthropic(api_key=api_key)
    instructiuni = "\n\nInstrucțiuni: Alege litera răspunsului corect (a–f). Răspunde exact la ultima problemă în formatul: Răspuns corect: x"
    context_fewshot = incarca_shots(shots_file)

    with open(test_file_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline); json.loads rejects them.
        test_data = [json.loads(line) for line in f if line.strip()]

    user_prompts = []
    full_prompts = []
    true_labels = []
    predictions = []
    outputs_raw = []

    for idx, sample in enumerate(test_data):
        messages = sample.get("messages", [])
        user_msg = next((m["content"] for m in messages if m["role"].lower() == "user"), "")
        bot_msg = next((m["content"] for m in messages if m["role"].lower() in {"chatbot", "assistant"}), "")
        full_prompt = f"{context_fewshot}\n\n{user_msg.strip()}{instructiuni}" if context_fewshot else f"{user_msg.strip()}{instructiuni}"

        # Show the first assembled prompt as a sanity check, unless the caller
        # asked for a silent run.
        if idx == 0 and not silent:
            print("\n=== Exemplu prompt complet trimis la model (primul) ===\n")
            print(full_prompt)
            print("\n=== Sfârșit prompt ===\n")

        full_prompts.append(full_prompt)
        user_prompts.append(user_msg.strip())
        true_labels.append(extrage_litera_raspuns(bot_msg))

    for prompt in tqdm(full_prompts, desc=f"Evaluare {label}"):
        try:
            response = client.messages.create(
                model=model_name,
                max_tokens=1024,
                temperature=0,
                system="Ești un asistent care rezolvă exerciții de matematică pentru admitere.",
                messages=[{"role": "user", "content": prompt}]
            )
            output = response.content[0].text.strip()
            predicted = extrage_litera_raspuns(output)
        except Exception as e:
            # Best effort: a failed request is recorded as an unanswered
            # question so one error does not abort the whole evaluation.
            output = "??"
            predicted = "?"
            if not silent:
                print("Eroare:", e)

        outputs_raw.append(output)
        predictions.append(predicted)

    df = pd.DataFrame({
        "prompt": user_prompts,
        "true": true_labels,
        "pred": predictions,
        "output": outputs_raw
    })
    df["correct"] = df["true"] == df["pred"]

    # Computed unconditionally: it is the return value, so it must exist even
    # when `silent` suppresses the report below.
    acc = accuracy_score(df["true"], df["pred"])

    if not silent:
        print(f"\nAcuratețe ({label}): {acc:.2%}")
        print("\n=== Raport ===")
        print(classification_report(df["true"], df["pred"], zero_division=0))
        print("Distribuție predicții:", dict(Counter(df["pred"])))

    if output_csv:
        csv_name = f"rezultate_{label.lower().replace(' ', '_')}.csv"
        # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
        df.to_csv(csv_name, index=False, encoding="utf-8-sig")
        if not silent:
            print(f"Fișier CSV salvat: {csv_name}")

    return acc

In [None]:
# Settings: load the API key from the local env file and choose the model.
import os

from dotenv import load_dotenv

load_dotenv("Chei.env")

# None when CLAUDE_KEY is not defined in the environment / env file.
API_KEY = os.environ.get("CLAUDE_KEY")
MODEL_NAME = "claude-3-opus-20240229"

In [None]:
# Zero-shot run on the test split.
# `label` determines the name of the output CSV file; `shots_file` is either
# None (zero-shot) or the path of the JSONL file holding the few-shot examples.
evalueaza_modelul_chat_claude(
    "fine_tune_test_full.jsonl",
    MODEL_NAME,
    API_KEY,
    label="Zero-shot Test",
    shots_file=None,
)

In [None]:
# Zero-shot run on the validation split.
# `label` determines the name of the output CSV file; `shots_file` is either
# None (zero-shot) or the path of the JSONL file holding the few-shot examples.
evalueaza_modelul_chat_claude(
    "fine_tune_validation_full.jsonl",
    MODEL_NAME,
    API_KEY,
    label="Zero-shot Validare",
    shots_file=None,
)

In [None]:
# Bar chart for the test split: number of questions per reference answer,
# overlaid with the number of those answered correctly.
# NOTE(review): this reads a "few-shot" results file, while the evaluation
# cells in this notebook use a "Zero-shot ..." label -- confirm the input file.
import matplotlib.pyplot as plt

df = pd.read_csv("rezultate_few-shot_test.csv")
df["correct"] = df["true"].eq(df["pred"])

# Counts per reference letter, sorted alphabetically by letter.
label_counts = df["true"].value_counts().sort_index()
correct_counts = df.loc[df["correct"], "true"].value_counts().sort_index()

plt.figure(figsize=(8, 5))
plt.bar(x=label_counts.index, height=label_counts.values, alpha=0.5, label="Total")
plt.bar(x=correct_counts.index, height=correct_counts.values, alpha=0.8, label="Corecte")
plt.title("Performanță model - set Test")
plt.xlabel("Variantă de răspuns")
plt.ylabel("Număr")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Bar chart for the validation split: number of questions per reference
# answer, overlaid with the number of those answered correctly.
# NOTE(review): this reads a "few-shot" results file, while the evaluation
# cells in this notebook use a "Zero-shot ..." label -- confirm the input file.
import matplotlib.pyplot as plt

df = pd.read_csv("rezultate_few-shot_validare.csv")
df["correct"] = df["true"].eq(df["pred"])

# Counts per reference letter, sorted alphabetically by letter.
label_counts = df["true"].value_counts().sort_index()
correct_counts = df.loc[df["correct"], "true"].value_counts().sort_index()

plt.figure(figsize=(8, 5))
plt.bar(x=label_counts.index, height=label_counts.values, alpha=0.5, label="Total")
plt.bar(x=correct_counts.index, height=correct_counts.values, alpha=0.8, label="Corecte")
plt.title("Performanță model - set Validare")
plt.xlabel("Variantă de răspuns")
plt.ylabel("Număr")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Test split: totals and correct answers per letter, plus a "?" bar counting
# the predictions from which no answer letter could be extracted.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("rezultate_zero-shot-test.csv")
df["correct"] = df["true"].eq(df["pred"])

# All answer letters a-f, and the same list extended with "?".
litere = list("abcdef")
toate_literele = litere + ["?"]

# Questions per reference letter; letters that never occur are shown as 0.
true_counts = df["true"].value_counts().reindex(litere, fill_value=0)

# Correct predictions (true == pred) per reference letter.
correct_counts = df.loc[df["correct"], "true"].value_counts().reindex(litere, fill_value=0)

# Predictions recorded as "?".
num_unknown = df["pred"].eq("?").sum()

# Append a "?" slot to both series. In the "Total" series it holds the number
# of "?" predictions; in the "Corecte" series it stays 0.
true_counts_ext = pd.concat([true_counts, pd.Series({"?": 0})])
correct_counts_ext = pd.concat([correct_counts, pd.Series({"?": 0})])
total_counts_ext = true_counts_ext.copy()
total_counts_ext["?"] = num_unknown

plt.figure(figsize=(9, 5))
plt.bar(x=total_counts_ext.index, height=total_counts_ext.values, alpha=0.5, label="Total")
plt.bar(x=correct_counts_ext.index, height=correct_counts_ext.values, alpha=0.8, label="Corecte")
plt.title("Performanță model – Test")
plt.xlabel("Variantă de răspuns (a–f + ?)")
plt.ylabel("Număr de întrebări")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Validation split: totals and correct answers per letter, plus a "?" bar
# counting the predictions from which no answer letter could be extracted.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("rezultate_zero-shot-validare.csv")
df["correct"] = df["true"] == df["pred"]

# All answer letters a-f, and the same list extended with "?".
litere = list("abcdef")
toate_literele = litere + ["?"]

# Questions per reference letter; letters that never occur are shown as 0.
true_counts = df["true"].value_counts().reindex(litere, fill_value=0)

# Correct predictions (true == pred) per reference letter.
correct_counts = df[df["correct"]]["true"].value_counts().reindex(litere, fill_value=0)

# Predictions recorded as "?".
num_unknown = (df["pred"] == "?").sum()

# Append a "?" slot to both series. In the "Total" series it holds the number
# of "?" predictions; in the "Corecte" series it stays 0.
true_counts_ext = pd.concat([true_counts, pd.Series({"?": 0})])
correct_counts_ext = pd.concat([correct_counts, pd.Series({"?": 0})])
total_counts_ext = true_counts_ext.copy()
total_counts_ext["?"] = num_unknown

# Plot
plt.figure(figsize=(9, 5))
plt.bar(total_counts_ext.index, total_counts_ext.values, alpha=0.5, label="Total")
plt.bar(correct_counts_ext.index, correct_counts_ext.values, alpha=0.8, label="Corecte")
# The title previously said "Test" (copy-paste from the test-set cell) even
# though this cell plots the validation results.
plt.title("Performanță model – Validare")
plt.xlabel("Variantă de răspuns (a–f + ?)")
plt.ylabel("Număr de întrebări")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Calculez acuratetea pe dificultate pentru fiecare set
import pandas as pd

def calculeaza_acuratete_pe_dificultate(cale_csv, eticheta_set):
    """Return the mean accuracy per difficulty level for one results CSV.

    The difficulty is the letter A, B or C captured from the ``prompt`` column
    by the pattern ``<digits>.[<digits>]<letter>.``; rows whose prompt does not
    match are left out of the grouping. A ``correct`` column is derived from
    ``true == pred`` when the CSV does not already contain one.

    Args:
        cale_csv: Path to a results CSV with ``prompt``, ``true`` and ``pred``
            columns.
        eticheta_set: Name of the data set, used in the accuracy column header.

    Returns:
        A DataFrame with the columns ``Dificultate`` and
        ``Acuratețe <eticheta_set>``.
    """
    rezultate = pd.read_csv(cale_csv)
    rezultate["dificultate"] = rezultate["prompt"].str.extract(
        r"\d+\.(?:\d+)?([ABC])\.", expand=False
    )
    if "correct" not in rezultate.columns:
        rezultate["correct"] = rezultate["true"].eq(rezultate["pred"])

    medii = rezultate.groupby("dificultate")["correct"].mean()
    return (
        medii.rename_axis("Dificultate")
        .rename(f"Acuratețe {eticheta_set}")
        .reset_index()
    )

# Per-difficulty accuracy of both splits side by side; the outer join keeps a
# difficulty level even when it appears in only one of the two splits.
acuratete_test = calculeaza_acuratete_pe_dificultate(
    cale_csv="rezultate_zero-shot-test.csv", eticheta_set="Test"
)
acuratete_val = calculeaza_acuratete_pe_dificultate(
    cale_csv="rezultate_zero-shot-validare.csv", eticheta_set="Validare"
)

df_comparatie = acuratete_test.merge(acuratete_val, on="Dificultate", how="outer")
print(df_comparatie)

In [None]:
# Accuracy per chapter on the test split.
df_test = pd.read_csv("rezultate_zero-shot-test.csv")
df_all = pd.read_csv("exercitii_extrase_structurat_rezolvari.csv")

# The exercise id is the first whitespace-delimited token of the prompt with a
# trailing dot removed; both id columns are cast to str so the join keys match.
df_test["id"] = (
    df_test["prompt"]
    .str.extract(r"^(\S+)", expand=False)
    .str.replace(r"\.$", "", regex=True)
    .astype(str)
)
df_all["id"] = df_all["id"].astype(str)

# Left join: every evaluated question is kept, with its chapter when the id is
# found in the exercise table.
df_merged = df_test.merge(df_all[["id", "capitol"]], on="id", how="left")
if "correct" not in df_merged.columns:
    df_merged["correct"] = df_merged["true"].eq(df_merged["pred"])

accuracy_by_chapter = (
    df_merged.groupby("capitol")["correct"]
    .mean()
    .rename_axis("Capitol")
    .rename("Acuratețe")
    .reset_index()
)
print(accuracy_by_chapter)

In [None]:
# Accuracy per chapter on the test split.
# NOTE(review): this cell repeats the computation of the preceding cell on the
# same input files -- presumably one of the two can be removed; confirm.
df_test = pd.read_csv("rezultate_zero-shot-test.csv")
df_all = pd.read_csv("exercitii_extrase_structurat_rezolvari.csv")

# Exercise id = first whitespace-delimited token of the prompt, without a
# trailing dot; ids are compared as strings on both sides of the join.
df_test["id"] = (
    df_test["prompt"]
    .str.extract(r"^(\S+)", expand=False)
    .str.replace(r"\.$", "", regex=True)
    .astype(str)
)
df_all["id"] = df_all["id"].astype(str)

df_merged = df_test.merge(df_all[["id", "capitol"]], on="id", how="left")
if "correct" not in df_merged.columns:
    df_merged["correct"] = df_merged["true"].eq(df_merged["pred"])

accuracy_by_chapter = (
    df_merged.groupby("capitol")["correct"]
    .mean()
    .rename_axis("Capitol")
    .rename("Acuratețe")
    .reset_index()
)
print(accuracy_by_chapter)

In [None]:
# Accuracy per chapter on the validation split.
df_validare = pd.read_csv("rezultate_zero-shot-validare.csv")
df_all = pd.read_csv("exercitii_extrase_structurat_rezolvari.csv")

# The exercise id is the first whitespace-delimited token of the prompt with a
# trailing dot removed; both id columns are cast to str so the join keys match.
df_validare["id"] = (
    df_validare["prompt"]
    .str.extract(r"^(\S+)", expand=False)
    .str.replace(r"\.$", "", regex=True)
    .astype(str)
)
df_all["id"] = df_all["id"].astype(str)

# Left join: every evaluated question is kept, with its chapter when the id is
# found in the exercise table.
df_merged = df_validare.merge(df_all[["id", "capitol"]], on="id", how="left")
if "correct" not in df_merged.columns:
    df_merged["correct"] = df_merged["true"].eq(df_merged["pred"])

accuracy_by_chapter = (
    df_merged.groupby("capitol")["correct"]
    .mean()
    .rename_axis("Capitol")
    .rename("Acuratețe")
    .reset_index()
)
print(accuracy_by_chapter)

In [None]:
# Confusion matrix for the test split.
# The names used below are imported here because no other cell in this
# notebook imports `confusion_matrix` or `sns`.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

df_test = pd.read_csv("rezultate_zero-shot-test.csv")

# Compute the confusion matrix. The reference column is named "true" (lower
# case) in the results CSV; "?" is included as a class so unparsed answers get
# their own row/column.
cm = confusion_matrix(df_test["true"], df_test["pred"], labels=["a", "b", "c", "d", "e", "f", "?"])

# Figure settings
plt.figure(figsize=(4, 3.5))
sns.set(style="white", font_scale=0.9)

# Heatmap: rows are reference answers, columns are predictions.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    cbar=True,
    linewidths=0.5,
    linecolor="black",
    xticklabels=list("abcdef?"),
    yticklabels=list("abcdef?"),
    annot_kws={"size": 10, "weight": "bold", "color": "black"}
)

# Axis labels and title
ax.set_xlabel("Predicție", fontsize=10)
ax.set_ylabel("Adevărat", fontsize=10)
ax.set_title("Matrice de confuzie – set Test", fontsize=11, weight="bold")

# Layout and save
plt.tight_layout()
plt.savefig("matrice_confuzie_test.png", dpi=300)
plt.show()

In [None]:
# Confusion matrix for the validation split.
# The names used below are imported here because no other cell in this
# notebook imports `confusion_matrix` or `sns`.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

df_validare = pd.read_csv("rezultate_zero-shot-validare.csv")

# Compute the confusion matrix. The reference column is named "true" (lower
# case) in the results CSV; "?" is included as a class so unparsed answers get
# their own row/column.
cm = confusion_matrix(df_validare["true"], df_validare["pred"], labels=["a", "b", "c", "d", "e", "f", "?"])

# Figure settings
plt.figure(figsize=(4, 3.5))
sns.set(style="white", font_scale=0.9)

# Heatmap: rows are reference answers, columns are predictions.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    cbar=True,
    linewidths=0.5,
    linecolor="black",
    xticklabels=list("abcdef?"),
    yticklabels=list("abcdef?"),
    annot_kws={"size": 10, "weight": "bold", "color": "black"}
)

# Axis labels and title (the title previously said "set Test", copied from
# the test-set cell).
ax.set_xlabel("Predicție", fontsize=10)
ax.set_ylabel("Adevărat", fontsize=10)
ax.set_title("Matrice de confuzie – set Validare", fontsize=11, weight="bold")

# Layout and save. A separate file name is used so this figure no longer
# overwrites the test-set image "matrice_confuzie_test.png".
plt.tight_layout()
plt.savefig("matrice_confuzie_validare.png", dpi=300)
plt.show()