In [None]:
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
from openai import OpenAI

# Load the few-shot examples
def incarca_shots(cale_fisier):
    """Load few-shot examples from a JSONL file.

    Args:
        cale_fisier: Path to a UTF-8 JSONL file with one JSON record per line,
            or ``None`` when no few-shot examples are used.

    Returns:
        A list with one parsed record per non-blank line, in file order;
        an empty list when ``cale_fisier`` is ``None``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if cale_fisier is None:
        return []
    with open(cale_fisier, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at the end of the file)
        # are not JSON documents; skip them instead of crashing on them.
        return [json.loads(line) for line in f if line.strip()]

# Extract the answer letter from a text
def extrage_raspuns_corect(text):
    """Extract the chosen answer letter (a-f) from a free-text answer.

    The text is searched first for a ``Răspuns corect: <letter>`` marker and,
    if that is absent, for a LaTeX ``\\boxed{<letter>}``. The first occurrence
    wins.

    Args:
        text: The answer text; ``None`` or an empty string is accepted.

    Returns:
        The letter in lower case, or ``"?"`` when no answer can be identified.
    """
    if not text:
        return "?"

    # - "ă" is accepted precomposed, decomposed (a + U+0306) or without the
    #   diacritic, since replies do not always keep Romanian diacritics.
    # - Decoration between the colon and the letter (spaces, "*", "$", "(" ...)
    #   is skipped by [^a-zA-Z]*.
    # - The negative lookahead rejects a letter that merely starts a word;
    #   without it "\boxed{c}" would yield "b" and "este b" would yield "e".
    #   [^\W\d_] is "any Unicode letter", so "b)", "b." and "_b_" still match.
    match = re.search(
        r"[Rr](?:ă|a\u0306?)spuns\s+corect\s*:\s*[^a-zA-Z]*([a-fA-F])(?![^\W\d_])",
        text,
    )
    if not match:
        match = re.search(r"\\boxed\{([a-fA-F])\}", text)
    return match.group(1).lower() if match else "?"

# Model evaluation function
def evalueaza_modelul(test_file_path, model_name, api_key, label="Set", output_csv=True, shots_file=None):
    """Evaluate a chat model on a multiple-choice JSONL set and report the results.

    Each sample's user prompt is sent to the OpenAI chat-completions endpoint,
    optionally preceded by few-shot messages. The answer letter parsed from the
    reply is compared with the letter parsed from the reference answer.
    Accuracy and a classification report are printed, a bar chart of
    total vs. correct answers per letter is shown, and the per-sample table is
    optionally written to CSV.

    Args:
        test_file_path: Path to a UTF-8 JSONL file. Every record needs a
            ``"messages"`` list whose item 1 is the user prompt and item 2 the
            reference assistant answer (item 0 is presumably the system
            message -- TODO confirm against the data files).
        model_name: Model identifier passed to the API.
        api_key: API key used to build the ``OpenAI`` client.
        label: Name of the run; used in the progress bar, the printed report,
            the chart title and the CSV file name.
        output_csv: When true, save the table as
            ``rezultate_<label lower-cased, spaces replaced by "_">.csv``.
        shots_file: Optional JSONL file with few-shot examples, each record
            holding a ``"messages"`` list; ``None`` disables few-shot prompting.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        ``prompt_original``, ``prompt_model``, ``output``, ``true``, ``pred``
        and ``correct``.
    """
    client = OpenAI(api_key=api_key)

    with open(test_file_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline); they are not JSON records.
        test_data = [json.loads(line) for line in f if line.strip()]

    shots = incarca_shots(shots_file)

    # Flatten all few-shot conversations into one message list
    shots_messages = []
    for shot in shots:
        shots_messages.extend(shot["messages"])

    has_system = any(m["role"] == "system" for m in shots_messages)

    # The part of the conversation that precedes the question is the same for
    # every sample, so build it once. A default system message is added only
    # when the few-shot examples do not bring their own.
    base_messages = []
    if not has_system:
        base_messages.append({
            "role": "system",
            "content": "Ești un asistent care rezolvă exerciții de matematică pentru admitere și oferă explicații clare."
        })
    base_messages.extend(shots_messages)

    # Collect prompts and reference labels from the data set
    true_labels = []
    user_prompts = []
    assistant_outputs = []

    for sample in test_data:
        user_prompt = sample["messages"][1]["content"]
        assistant_answer = sample["messages"][2]["content"]
        true_label = extrage_raspuns_corect(assistant_answer)

        user_prompts.append(user_prompt)
        true_labels.append(true_label)
        assistant_outputs.append(assistant_answer)

    # Query the model
    pred_labels = []
    prompturi_generate = []
    raspunsuri_generate = []

    for prompt in tqdm(user_prompts, desc=f"Evaluare {label}"):
        # Per-sample results start from the failure placeholders and are
        # appended exactly once after the try block. Appending inside the try
        # and again in the handler would leave the three lists with different
        # lengths whenever the request fails after the prompt was recorded.
        prompt_text = "N/A"
        reply = "Eroare"
        predicted = "?"
        try:
            prompt_instr = prompt.strip() + "\n\nInstrucțiuni: Alege litera răspunsului corect (a–f). Răspunde exact în formatul: Răspuns corect: x"
            messages = base_messages + [{"role": "user", "content": prompt_instr}]

            # Readable transcript of what was sent, kept for the results table
            prompt_text = "\n".join(f"{m['role']}: {m['content']}" for m in messages)

            response = client.chat.completions.create(
                model=model_name,
                messages=messages
            )
            reply = response.choices[0].message.content
            predicted = extrage_raspuns_corect(reply) or "?"

        except Exception as e:
            # Best effort: a failed sample is reported and scored as "?" so
            # that one bad request does not abort the whole evaluation.
            print("Eroare:", e)
            reply = "Eroare"
            predicted = "?"

        prompturi_generate.append(prompt_text)
        raspunsuri_generate.append(reply)
        pred_labels.append(predicted)

    # Assemble the per-sample table
    df_results = pd.DataFrame({
        "prompt_original": user_prompts,
        "prompt_model": prompturi_generate,
        "output": raspunsuri_generate,
        "true": true_labels,
        "pred": pred_labels
    })
    df_results["correct"] = df_results["true"] == df_results["pred"]

    # Metrics and reports
    acc = accuracy_score(df_results["true"], df_results["pred"])
    print(f"\nAcuratețe ({label}): {acc:.2%}")
    print("\n=== Raport ===")
    print(classification_report(df_results["true"], df_results["pred"], zero_division=0))

    # Chart: per reference letter, total samples vs. correctly answered ones.
    # Reindexing on a-f keeps all six bars even for letters that never occur.
    toate_literele = list("abcdef")
    true_counts = df_results["true"].value_counts().reindex(toate_literele, fill_value=0)
    correct_counts = df_results[df_results["correct"]].groupby("true")["correct"].count().reindex(toate_literele, fill_value=0)

    plt.figure(figsize=(8, 5))
    plt.bar(true_counts.index, true_counts.values, alpha=0.5, label="Total")
    plt.bar(correct_counts.index, correct_counts.values, alpha=0.8, label="Corecte")
    plt.title(f"Performanță GPT ({label})")
    plt.xlabel("Variantă de răspuns")
    plt.ylabel("Număr")
    plt.legend()
    plt.tight_layout()
    plt.show()

    if output_csv:
        csv_name = f"rezultate_{label.lower().replace(' ', '_')}.csv"
        df_results.to_csv(csv_name, index=False, encoding="utf-8-sig")
        print(f"Rezultatele au fost salvate în: {csv_name}")

    return df_results

In [None]:
# Settings: the API key is read from a local env file, not stored in the notebook
import os
from dotenv import load_dotenv

load_dotenv("Chei.env")  # env file holding the keys
API_KEY = os.environ.get("OPENAI_API_KEY1")  # None when the variable is not set
MODEL_NAME = "gpt-4.1"  # e.g. GPT-3.5-turbo, GPT-4.1, fine-tuned GPT models

In [None]:
# Evaluate on the test set
rez_test = evalueaza_modelul(
    test_file_path="fine_tune_full/fine_tune_test_full.jsonl",
    model_name=MODEL_NAME,
    api_key=API_KEY,
    label="Few-shot-test",
    shots_file="few-shots.jsonl",
)
# label      - determines the name of the output CSV file
# shots_file - None, or the JSONL file holding the few-shot examples

In [None]:
# Evaluate on the validation set
rez_valid = evalueaza_modelul(
    test_file_path="fine_tune_full/fine_tune_validation_full.jsonl",
    model_name=MODEL_NAME,
    api_key=API_KEY,
    label="Few-shot-validation",
    shots_file="few-shots.jsonl",
)
# label      - determines the name of the output CSV file
# shots_file - None, or the JSONL file holding the few-shot examples

In [None]:
# Overall accuracy per set: the mean of the `correct` column of each results table
acc_valid, acc_test = (rez["correct"].mean() for rez in (rez_valid, rez_test))

print(f"Acuratețe Validare: {acc_valid:.2%}")
print(f"Acuratețe Test:     {acc_test:.2%}")

In [None]:
# Calculez acuratetea pe dificultate pentru fiecare set
import pandas as pd

def calculeaza_acuratete_pe_dificultate(cale_csv, eticheta_set):
    """Compute accuracy per difficulty level from a saved results CSV.

    The difficulty is the letter A, B or C captured by the first occurrence of
    ``<digits>.<optional digits><letter>.`` in ``prompt_original`` (the search
    is not anchored to the start of the prompt). Rows without such a match get
    no difficulty and are not part of any group.

    Args:
        cale_csv: Path to a CSV with a ``prompt_original`` column and either a
            ``correct`` column or both ``true`` and ``pred`` columns.
        eticheta_set: Name of the set, used in the accuracy column header.

    Returns:
        pandas.DataFrame with the columns ``Dificultate`` and
        ``Acuratețe <eticheta_set>``, one row per difficulty found.
    """
    rezultate = pd.read_csv(cale_csv)

    # Single capture group -> with expand=False the result is a Series
    tipar_dificultate = r"\d+\.(?:\d+)?([ABC])\."
    rezultate["dificultate"] = rezultate["prompt_original"].str.extract(
        tipar_dificultate, expand=False
    )

    # Derive correctness only when the file does not already provide it
    if "correct" not in rezultate.columns:
        rezultate["correct"] = rezultate["true"] == rezultate["pred"]

    # Mean of `correct` within each difficulty group
    medii = rezultate.groupby("dificultate")["correct"].mean()
    tabel = medii.reset_index()
    tabel.columns = ["Dificultate", f"Acuratețe {eticheta_set}"]
    return tabel

# Per-difficulty accuracy for each saved results file
acuratete_test = calculeaza_acuratete_pe_dificultate(
    cale_csv="rezultate_few-shot-test.csv", eticheta_set="Test"
)
acuratete_val = calculeaza_acuratete_pe_dificultate(
    cale_csv="rezultate_few-shot-validation.csv", eticheta_set="Validare"
)

# Side-by-side comparison; the outer join keeps a difficulty present in only one set
df_comparatie = acuratete_test.merge(acuratete_val, on="Dificultate", how="outer")
print(df_comparatie)

In [None]:
#Acuratete pe capitol pentru test
import pandas as pd

# Per-sample results of the test run and the exercise table that carries the chapters
df_test = pd.read_csv("rezultate_few-shot-test.csv")
df_all = pd.read_csv("exercitii_extrase_structurat_rezolvari.csv")

# The exercise id is the first whitespace-delimited token of the prompt with
# its trailing dot removed. Both id columns are cast to str so the join keys
# compare as text.
df_test["id"] = (
    df_test["prompt_original"]
    .str.extract(r"^(\S+)", expand=False)
    .str.replace(r"\.$", "", regex=True)
    .astype(str)
)
df_all["id"] = df_all["id"].astype(str)

# Left join: every evaluated sample is kept and gets its chapter when the id is found
df_merged = pd.merge(df_test, df_all[["id", "capitol"]], on="id", how="left")

# Derive correctness only when the CSV did not already provide it
if "correct" not in df_merged.columns:
    df_merged["correct"] = df_merged["true"] == df_merged["pred"]

# Mean of `correct` within each chapter
accuracy_by_chapter = df_merged.groupby("capitol")["correct"].mean().reset_index()
accuracy_by_chapter.columns = ["Capitol", "Acuratețe"]

print(accuracy_by_chapter)

In [None]:
#Acuratete pe capitol pentru validare
import pandas as pd

# Per-sample results of the validation run (the variable is still called
# df_test) and the exercise table that carries the chapters
df_test = pd.read_csv("rezultate_few-shot-validation.csv")
df_all = pd.read_csv("exercitii_extrase_structurat_rezolvari.csv")

# The exercise id is the first whitespace-delimited token of the prompt with
# its trailing dot removed. Both id columns are cast to str so the join keys
# compare as text.
df_test["id"] = (
    df_test["prompt_original"]
    .str.extract(r"^(\S+)", expand=False)
    .str.replace(r"\.$", "", regex=True)
    .astype(str)
)
df_all["id"] = df_all["id"].astype(str)

# Left join: every evaluated sample is kept and gets its chapter when the id is found
df_merged = pd.merge(df_test, df_all[["id", "capitol"]], on="id", how="left")

# Derive correctness only when the CSV did not already provide it
if "correct" not in df_merged.columns:
    df_merged["correct"] = df_merged["true"] == df_merged["pred"]

# Mean of `correct` within each chapter
accuracy_by_chapter = df_merged.groupby("capitol")["correct"].mean().reset_index()
accuracy_by_chapter.columns = ["Capitol", "Acuratețe"]

print(accuracy_by_chapter)

In [None]:
# Confusion matrix for the test set, built from the saved results CSV
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

df_test = pd.read_csv("rezultate_few-shot-test.csv")

# Compute the confusion matrix over the six answer letters (true labels first,
# predictions second). Per scikit-learn's `labels` semantics, samples whose
# label is not in the list (e.g. "?") are not counted.
cm = confusion_matrix(df_test["true"], df_test["pred"], labels=["a", "b", "c", "d", "e", "f"])

# Figure settings
# NOTE(review): sns.set() runs after plt.figure(); confirm the theme is applied
# to this figure as intended.
plt.figure(figsize=(4, 3.5))  # small size, suitable for printing
sns.set(style="white", font_scale=0.9)

# Heatmap with integer cell annotations; tick labels are the answer letters
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    cbar=True,
    linewidths=0.5,
    linecolor="black",
    xticklabels=list("abcdef"),
    yticklabels=list("abcdef"),
    annot_kws={"size": 10, "weight": "bold", "color": "black"}
)

# Axis labels and title (x = prediction, y = true label)
ax.set_xlabel("Predicție", fontsize=10)
ax.set_ylabel("Adevărat", fontsize=10)
ax.set_title("Matrice de confuzie – Răspunsuri a–f", fontsize=11, weight="bold")

# Layout and export; the PNG is written before the figure is shown
plt.tight_layout()
plt.savefig("matrice_confuzie_test.png", dpi=300)
plt.show()

In [None]:
# Confusion matrix for the validation set, built from the saved results CSV
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Note: the variable is named df_test but holds the validation results
df_test = pd.read_csv("rezultate_few-shot-validation.csv")

# Compute the confusion matrix over the six answer letters (true labels first,
# predictions second). Per scikit-learn's `labels` semantics, samples whose
# label is not in the list (e.g. "?") are not counted.
cm = confusion_matrix(df_test["true"], df_test["pred"], labels=["a", "b", "c", "d", "e", "f"])

# Figure settings
# NOTE(review): sns.set() runs after plt.figure(); confirm the theme is applied
# to this figure as intended.
plt.figure(figsize=(4, 3.5))  # small size, suitable for printing
sns.set(style="white", font_scale=0.9)

# Heatmap with integer cell annotations; tick labels are the answer letters
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    cbar=True,
    linewidths=0.5,
    linecolor="black",
    xticklabels=list("abcdef"),
    yticklabels=list("abcdef"),
    annot_kws={"size": 10, "weight": "bold", "color": "black"}
)

# Axis labels and title (x = prediction, y = true label)
ax.set_xlabel("Predicție", fontsize=10)
ax.set_ylabel("Adevărat", fontsize=10)
ax.set_title("Matrice de confuzie – Răspunsuri a–f", fontsize=11, weight="bold")

# Layout and export; the PNG is written before the figure is shown
plt.tight_layout()
plt.savefig("matrice_confuzie_validare.png", dpi=300)
plt.show()