In [None]:
!pip install datasets



In [None]:
# Clean the Hugging Face dataset
import re
from datasets import load_dataset, Dataset, Features, Value
import time

# --- cleaning functions ---

def curata_problem_statement(text):
    """Normalize a problem statement taken from the 'prompt' column.

    Non-string input yields an empty string. Otherwise the text is
    lower-cased, every '$' is removed, each run of whitespace is collapsed to
    a single space and the ends are trimmed. If the result ends with '.' or
    '?', that single character is dropped and the text is trimmed again.
    """
    if isinstance(text, str):
        squeezed = re.sub(r'\s+', ' ', text.lower().replace('$', '')).strip()
        if squeezed.endswith(('.', '?')):
            # Only one trailing punctuation mark is removed, not a whole run.
            return squeezed[:-1].strip()
        return squeezed
    return ""

def curata_python_solution(text):
    """Trim a Python solution taken from the 'completion' column.

    Returns the text without leading/trailing whitespace, or an empty string
    when the input is not a ``str``.
    """
    return text.strip() if isinstance(text, str) else ""

# --- deduplication helper ---

def get_unique_indices(data_list):
    """Return the indices of the first occurrence of each distinct element.

    The indices come back in ascending order. Elements must be hashable.
    """
    first_position = {}
    for position, element in enumerate(data_list):
        # setdefault stores the index only the first time an element is seen;
        # dicts keep insertion order, so the values stay sorted by index.
        first_position.setdefault(element, position)
    return list(first_position.values())

# --- dataset processing ---

# 1. Load the dataset without passing 'features', so the library infers the columns itself.
print("1. Incarcare set de date")
start_time = time.time()
dataset = None
try:
    # First attempt: request the 'train' split explicitly, with inferred features.
    dataset = load_dataset("sdiazlor/math-python-reasoning-dataset", split="train")
    print("   Încărcat cu succes split='train'.")
except Exception as e:
    print(f"   Nu am putut încărca split='train', încerc fără split specificat (probabil default='train'). Eroare: {e}")
    try:
        # Second attempt: load without naming a split.
        dataset = load_dataset("sdiazlor/math-python-reasoning-dataset")
        # Without a split the result may be a dict of splits; keep only 'train' if present.
        if isinstance(dataset, dict) and "train" in dataset:
             print("   Dataset încărcat ca dict, selectez split='train'.")
             dataset = dataset['train']
        elif isinstance(dataset, dict):
             print(f"   Dataset încărcat ca dict, dar nu conține split='train'. Splituri disponibile: {list(dataset.keys())}")
             dataset = None
        else:
             print("Dataset încărcat cu succes (probabil default='train').")

    except Exception as e2:
        print(f"   Nu am putut încărca setul de date. Eroare finală: {e2}")
        # exit() # uncomment to stop the whole script when loading fails

# Everything below runs only when a dataset was loaded; otherwise only the error is printed.
if dataset is None:
    print("\n!!! EROARE FATALĂ: Nu s-a putut încărca setul de date. Verifică conexiunea și numele setului de date.")
else:
    # NOTE: this duration covers both load attempts when the first one failed.
    load_time = time.time() - start_time
    print(f"   Set de date incărcat în {load_time:.2f} secunde.")
    print(f"   Numar initial de exemple: {len(dataset)}")
    print(f"   Coloane detectate: {dataset.column_names}")
    print(f"   Exemplu date originale (folosind coloanele reale):\n{dataset[0]}\n")

    # 2. Apply the cleaning functions with .map():
    # 'prompt' -> 'problem_cleaned', 'completion' -> 'solution_cleaned'.
    print("2. Curatare date (prompt -> problem_cleaned, completion -> solution_cleaned)...")
    start_time = time.time()
    dataset_curatat = dataset.map(
        lambda exemplu: {
            'problem_cleaned': curata_problem_statement(exemplu['prompt']),
            'solution_cleaned': curata_python_solution(exemplu['completion'])
        },
        num_proc=4, # number of worker processes; can be tuned
        remove_columns=dataset.column_names  # keep only the two cleaned columns
    )
    map_time = time.time() - start_time
    print(f"   Curatare aplicata in {map_time:.2f} secunde.")
    print(f"   Coloane după curatare: {dataset_curatat.column_names}")
    print(f"   Exemplu date după curatare:\n{dataset_curatat[0]}\n")

    # 3. Find and drop duplicates, comparing rows by 'problem_cleaned' only.
    print("3. Identificare si eliminare duplicate bazate pe 'problem_cleaned'...")
    start_time = time.time()

    # Pull out the column used as the deduplication key.
    probleme_curatate_lista = dataset_curatat['problem_cleaned']

    # Indices of the first occurrence of every distinct cleaned problem.
    indici_unici = get_unique_indices(probleme_curatate_lista)

    num_duplicates = len(dataset_curatat) - len(indici_unici)
    dedup_time = time.time() - start_time
    print(f"   Identificare indici unici în {dedup_time:.2f} secunde.")
    print(f"   Număr de duplicate eliminate: {num_duplicates}")

    # 4. Keep only the unique rows to build the final dataset.
    print("4. Creare set de date final deduplicat...")
    start_time = time.time()
    dataset_final = dataset_curatat.select(indici_unici)
    select_time = time.time() - start_time
    print(f"   Selectare rânduri unice în {select_time:.2f} secunde.")
    print(f"   Număr final de exemple: {len(dataset_final)}")

    print("\n--- Procesare completă ---")
    print(f"Numar inițial de probleme: {len(dataset)}") # length of the original, uncleaned dataset
    print(f"Numar final de probleme unice (după curatare și deduplicare): {len(dataset_final)}")



    # Show one sample row: the fifth if there are more than four rows, else the first.
    print("\nExemplu din setul de date final:")
    if len(dataset_final) > 4:
        print(dataset_final[4])
    elif len(dataset_final) > 0:
        print(dataset_final[0])
    else:
        print("Dataset final este gol.")


    # --- save as CSV ---
    output_csv_filename = "math_python_dataset_curatat.csv"
    print(f"\nSalvare set de date curățat ca fișier CSV: '{output_csv_filename}'...")
    try:
        # index=False keeps the dataset's internal row index out of the CSV columns.
        dataset_final.to_csv(output_csv_filename, index=False, encoding='utf-8')
        print(f"   Set de date salvat cu succes ca '{output_csv_filename}'.")
        # In Colab the file shows up in the left-hand "Files" panel after a short delay.
    except Exception as e:
        print(f"   EROARE la salvarea fișierului CSV: {e}")
    # Alternative kept for reference: persist with save_to_disk instead of CSV.
    # dataset_final.save_to_disk("./math_python_dataset_curatat")

1. Incarcare set de date


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 1.12M/1.12M [00:00<00:00, 3.58MB/s]


Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/500 [00:00<?, ? examples/s]

   Nu am putut încărca split='train', încerc fără split specificat (probabil default='train'). Eroare: Loading a dataset cached in a LocalFileSystem is not supported.
   Nu am putut încărca setul de date. Eroare finală: Loading a dataset cached in a LocalFileSystem is not supported.

!!! EROARE FATALĂ: Nu s-a putut încărca setul de date. Verifică conexiunea și numele setului de date.


In [None]:
#alegerea modelului

In [None]:
!pip install datasets sentence-transformers faiss-cpu torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [None]:
import pandas as pd

# Load the cleaned problem statements from the CSV written by the cleaning step.
csv_filename = "math_python_dataset_curatat.csv"
try:
    df = pd.read_csv(csv_filename)
    # Keep only the cleaned problems as a plain list of strings.
    # pandas reads empty CSV cells back as NaN (a float), so blanks are replaced
    # with "" and every value is cast to str; this matches how the evaluation
    # cell later in the notebook loads the same column.
    lista_probleme_curatate = df['problem_cleaned'].fillna("").astype(str).tolist()
    print(f"Am încărcat {len(lista_probleme_curatate)} probleme din coloana 'problem_cleaned'.")
    # print("Primele 5 probleme:", lista_probleme_curatate[:5])
except FileNotFoundError:
    print(f"EROARE: Fișierul '{csv_filename}' nu a fost găsit.")
    exit()
except KeyError:
    print(f"EROARE: Coloana 'problem_cleaned' nu a fost găsită în '{csv_filename}'.")
    exit()
except Exception as e:
    print(f"EROARE la citirea CSV-ului: {e}")
    exit()

Am încărcat 491 probleme din coloana 'problem_cleaned'.


In [None]:
# Names of the embedding models to compare; the evaluation cell below assigns the same list again.
modele_de_testat = ['all-MiniLM-L6-v2', 'BAAI/bge-base-en-v1.5']

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time
import ast

# Evaluation configuration.
dataset_csv_filename = "math_python_dataset_curatat.csv"
modele_de_testat = ['all-MiniLM-L6-v2', 'BAAI/bge-base-en-v1.5']
K_SEARCH = 5 # number of nearest neighbours each search returns

print(f"1. Încărcare probleme din '{dataset_csv_filename}'...")
try:
    df = pd.read_csv(dataset_csv_filename)
    lista_probleme_curatate = df['problem_cleaned'].fillna("").astype(str).tolist() # force strings; NaN becomes ""
    if not lista_probleme_curatate:
         # Raised inside the try, so it is reported by the generic handler below.
         raise ValueError("Lista de probleme este goală după încărcare.")
    print(f"   Am încărcat {len(lista_probleme_curatate)} probleme.")
except FileNotFoundError:
    print(f"EROARE: Fișierul '{dataset_csv_filename}' nu a fost găsit.")
    exit()
except KeyError:
    print(f"EROARE: Coloana 'problem_cleaned' nu a fost găsită în '{dataset_csv_filename}'.")
    exit()
except Exception as e:
    print(f"EROARE la citirea CSV-ului principal: {e}")
    exit()

print("\n2. Definire/Încărcare set de referință...")


# Hand-written reference set. Each entry names a query row ("query_index") and the
# row indices considered relevant for it ("expected_indices"); indices refer to
# positions in lista_probleme_curatate.
# NOTE(review): the evaluation loop reads only "query_index" and "expected_indices";
# "query_text" is not read anywhere in the visible code.
# NOTE(review): the recorded run reports a 0% hit rate for both models, so these
# indices look like placeholders rather than labels taken from the CSV - confirm.
set_referinta = [
    { "query_index": 10, "query_text": "what is the sum of 5 and 7", "expected_indices": [25, 150] },
    { "query_index": 45, "query_text": "multiply 6 by 3", "expected_indices": [90] },
    { "query_index": 82, "query_text": "if x = 10 and y = 4 what is x - y", "expected_indices": [120, 30] },
    { "query_index": 15, "query_text": "calculate area of rectangle length 8 width 2", "expected_indices": [200] },

]
if not set_referinta:
     print("AVERTISMENT: Setul de referință este gol. Evaluarea nu va fi posibilă.")
else:
     print(f"   Set de referință definit/încărcat cu {len(set_referinta)} intrări.")


print(f"\n3. Testare modele de embedding (k={K_SEARCH})...")

# Per-model metrics keyed by model name; the summary table reads this dict.
results = {}

for model_name in modele_de_testat:
    print(f"\n--- Testare Model: {model_name} ---")
    results[model_name] = {
        "encoding_time": None,
        "avg_search_time": None,
        "hits_at_k": 0,
        "total_queries": len(set_referinta) if set_referinta else 0,
        "precision_at_k_sum": 0.0,
        "recall_at_k_sum": 0.0
    }

    # A failure for one model is reported and the loop moves on to the next one.
    try:
        print("   Încărcare model...")
        start_load = time.time()
        model = SentenceTransformer(model_name)
        print(f"   Model încărcat în {time.time() - start_load:.2f} sec.")

        print(f"   Generare embeddings pentru {len(lista_probleme_curatate)} probleme...")
        start_encode = time.time()
        embeddings = model.encode(lista_probleme_curatate, show_progress_bar=True, normalize_embeddings=True)
        encoding_time = time.time() - start_encode
        results[model_name]["encoding_time"] = encoding_time
        print(f"   Embeddings generate în {encoding_time:.2f} sec.")

        embedding_dim = embeddings.shape[1]

        # Inner product over normalised vectors, so scores rank by cosine similarity.
        index = faiss.IndexFlatIP(embedding_dim)
        index.add(np.array(embeddings).astype('float32'))
        print(f"   Index FAISS creat (dim={embedding_dim}, {index.ntotal} vectori).")

        if not set_referinta:
            print("   Omitere evaluare din cauza setului de referință gol.")
            continue

        print("   Rulare căutări pe setul de referință...")
        total_search_time = 0
        num_searches = 0

        # Every query is itself a row of the index and is discarded from its own
        # results. Fetch one extra neighbour so that K_SEARCH candidates remain
        # afterwards; fetching exactly K_SEARCH would score the @K metrics over
        # only K_SEARCH - 1 candidates.
        fetch_k = max(1, min(K_SEARCH + 1, index.ntotal))

        for item_ref in set_referinta:
            query_idx = item_ref["query_index"]

            if query_idx < 0 or query_idx >= len(lista_probleme_curatate):
                print(f"Avertisment: query_index {query_idx} invalid. Omitere.")
                # Skipped entries must not count in the summary denominators.
                results[model_name]["total_queries"] -= 1
                continue

            query_text = lista_probleme_curatate[query_idx]
            expected_indices = set(item_ref["expected_indices"])
            num_expected = len(expected_indices)

            # The timed section covers encoding the query plus the index lookup.
            start_search_single = time.time()
            query_embedding = model.encode([query_text], normalize_embeddings=True)
            _, neighbor_ids = index.search(np.array(query_embedding).astype('float32'), fetch_k)
            total_search_time += (time.time() - start_search_single)
            num_searches += 1

            # Keep rank order, drop the query itself and FAISS's -1 padding
            # (returned when fewer than fetch_k vectors exist), then cut to K.
            ranked_neighbors = [
                int(idx) for idx in neighbor_ids[0]
                if idx != -1 and idx != query_idx
            ][:K_SEARCH]

            relevant_found = expected_indices.intersection(ranked_neighbors)
            num_relevant_found = len(relevant_found)

            if num_relevant_found > 0:
                results[model_name]["hits_at_k"] += 1

            precision_k = num_relevant_found / K_SEARCH if K_SEARCH > 0 else 0.0
            results[model_name]["precision_at_k_sum"] += precision_k

            recall_k = num_relevant_found / num_expected if num_expected > 0 else 0.0
            results[model_name]["recall_at_k_sum"] += recall_k

        if num_searches > 0:
            results[model_name]["avg_search_time"] = total_search_time / num_searches
            avg_precision = results[model_name]["precision_at_k_sum"] / num_searches
            avg_recall = results[model_name]["recall_at_k_sum"] / num_searches
            hit_rate = results[model_name]["hits_at_k"] / num_searches

            print(f"   Evaluare finalizată pentru {model_name}:")
            print(f"      Timp mediu căutare: {results[model_name]['avg_search_time']:.4f} sec")
            print(f"      Hit Rate@{K_SEARCH}:   {hit_rate:.2%}")
            print(f"      Precision@{K_SEARCH}:  {avg_precision:.4f}")
            print(f"      Recall@{K_SEARCH}:     {avg_recall:.4f}")
        else:
            print("   Nu s-au rulat căutări (set de referință gol sau invalid).")

    except Exception as e:
        print(f"EROARE în timpul procesării modelului {model_name}: {e}")

# Print one table row per evaluated model, using the metrics collected in `results`.
print("\n--- Rezumat Comparativ ---")
if results:
    print(f"Metrici calculate pentru top {K_SEARCH} rezultate returnate.")
    print("-" * 80)
    print(f"{'Model':<25} | {'Encoding (s)':<12} | {'Avg Search (s)':<14} | {'Hit Rate':<10} | {'Precision':<10} | {'Recall':<10}")
    print("-" * 80)
    for model_name, metrics in results.items():
        # Timings stay "N/A" when the model failed before they were recorded.
        enc_time = "N/A" if metrics['encoding_time'] is None else f"{metrics['encoding_time']:.2f}"
        search_time = "N/A" if metrics['avg_search_time'] is None else f"{metrics['avg_search_time']:.4f}"

        # Quality metrics are shown only when at least one search actually ran.
        hit_rate_str = precision_str = recall_str = "N/A"
        if metrics['avg_search_time'] is not None and metrics['total_queries'] > 0:
            hit_rate_val = metrics['hits_at_k'] / metrics['total_queries']
            precision_val = metrics['precision_at_k_sum'] / metrics['total_queries']
            recall_val = metrics['recall_at_k_sum'] / metrics['total_queries']
            hit_rate_str = f"{hit_rate_val:.2%}"
            precision_str = f"{precision_val:.4f}"
            recall_str = f"{recall_val:.4f}"

        print(f"{model_name:<25} | {enc_time:<12} | {search_time:<14} | {hit_rate_str:<10} | {precision_str:<10} | {recall_str:<10}")
    print("-" * 80)
else:
    print("Nu există rezultate de comparat.")


1. Încărcare probleme din 'math_python_dataset_curatat.csv'...
   Am încărcat 491 probleme.

2. Definire/Încărcare set de referință...
   Set de referință definit/încărcat cu 4 intrări.

3. Testare modele de embedding (k=5)...

--- Testare Model: all-MiniLM-L6-v2 ---
   Încărcare model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   Model încărcat în 10.53 sec.
   Generare embeddings pentru 491 probleme...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

   Embeddings generate în 24.38 sec.
   Index FAISS creat (dim=384, 491 vectori).
   Rulare căutări pe setul de referință...
   Evaluare finalizată pentru all-MiniLM-L6-v2:
      Timp mediu căutare: 0.0220 sec
      Hit Rate@5:   0.00%
      Precision@5:  0.0000
      Recall@5:     0.0000

--- Testare Model: BAAI/bge-base-en-v1.5 ---
   Încărcare model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   Model încărcat în 6.51 sec.
   Generare embeddings pentru 491 probleme...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

   Embeddings generate în 225.48 sec.
   Index FAISS creat (dim=768, 491 vectori).
   Rulare căutări pe setul de referință...
   Evaluare finalizată pentru BAAI/bge-base-en-v1.5:
      Timp mediu căutare: 0.1212 sec
      Hit Rate@5:   0.00%
      Precision@5:  0.0000
      Recall@5:     0.0000

--- Rezumat Comparativ ---
Metrici calculate pentru top 5 rezultate returnate.
--------------------------------------------------------------------------------
Model                     | Encoding (s) | Avg Search (s) | Hit Rate   | Precision  | Recall    
--------------------------------------------------------------------------------
all-MiniLM-L6-v2          | 24.38        | 0.0220         | 0.00%      | 0.0000     | 0.0000    
BAAI/bge-base-en-v1.5     | 225.48       | 0.1212         | 0.00%      | 0.0000     | 0.0000    
--------------------------------------------------------------------------------


In [None]:
!pip install langchain langchain-community sentence-transformers chromadb pandas openpyxl

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting chromadb
  Downloading chromadb-1.0.11-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.2.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collectin

In [None]:
import pandas as pd
import time
import os
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Input CSV and the columns passed to the CSV loader further down.
dataset_csv_filename = "math_python_dataset_curatat.csv"
SOURCE_COLUMN = "problem_cleaned"
METADATA_COLUMNS = ["solution_cleaned"]

# Embedding model and the on-disk location / collection name of the Chroma store.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
CHROMA_DB_PATH = "./chroma_db_math"
COLLECTION_NAME = "math_problems"


# Pick the device for the embedding model: CUDA when PyTorch reports it, otherwise CPU.
try:
    import torch
    if torch.cuda.is_available():
        DEVICE = "cuda"
        print("INFO: Se va folosi GPU (cuda).")
    else:
        DEVICE = "cpu"
        print("INFO: Se va folosi CPU.")
except ImportError:
    DEVICE = "cpu"
    print("INFO: PyTorch nu este instalat, se va folosi CPU.")

# Echo the effective configuration.
print("--- Start Creare/Încărcare Vector Store ChromaDB ---")
print(f"Folosind fișierul CSV: '{dataset_csv_filename}'")
print(f"Coloana sursă pentru embedding: '{SOURCE_COLUMN}'")
print(f"Coloane metadate: {METADATA_COLUMNS}")
print(f"Model embedding: '{EMBEDDING_MODEL_NAME}' pe dispozitivul '{DEVICE}'")
print(f"Calea bazei de date ChromaDB: '{CHROMA_DB_PATH}'")
print(f"Nume colecție: '{COLLECTION_NAME}'")


# Step 1: fail early when the CSV or any required column is missing.
print("\n1. Verificare fișier CSV și coloane...")
if not os.path.exists(dataset_csv_filename):
    print(f"EROARE FATALĂ: Fișierul CSV '{dataset_csv_filename}' nu a fost găsit.")
    exit()

try:
    # One data row is enough to obtain the header.
    df_check = pd.read_csv(dataset_csv_filename, nrows=1)
    required_cols = [SOURCE_COLUMN] + METADATA_COLUMNS
    missing_cols = [col for col in required_cols if col not in df_check.columns]
    if missing_cols:
        raise KeyError(f"Coloane lipsă în CSV: {missing_cols}")
    print("   Fișierul CSV și coloanele necesare există.")
except KeyError as e:
    print(f"EROARE FATALĂ: {e}")
    print("   Verificați numele coloanelor în script (SOURCE_COLUMN, METADATA_COLUMNS) și în fișierul CSV.")
    exit()
except Exception as e:
    print(f"EROARE FATALĂ la verificarea CSV-ului: {e}")
    exit()


# Step 2: build the embedding function used for both indexing and querying.
print(f"\n2. Inițializare model de embedding '{EMBEDDING_MODEL_NAME}'...")
start_time = time.time()
try:

    embedding_function = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': DEVICE},
        encode_kwargs={'normalize_embeddings': True}
    )
    print(f"   Model de embedding inițializat. Va rula pe {DEVICE}.")
except Exception as e:
    print(f"EROARE FATALĂ la inițializarea modelului de embedding: {e}")
    print("   Verificați conexiunea la internet și dacă modelul există pe Hugging Face Hub.")
    exit()
embed_init_time = time.time() - start_time
print(f"   Timp inițializare model: {embed_init_time:.2f} secunde.")


# Step 3: reuse the persisted Chroma store when one exists, otherwise build it from the CSV.
vectorstore = None
if os.path.exists(CHROMA_DB_PATH):
    print(f"\n3. Se încearcă încărcarea bazei de date existente din '{CHROMA_DB_PATH}'...")
    try:
        vectorstore = Chroma(
            persist_directory=CHROMA_DB_PATH,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME
        )
        existing_count = vectorstore._collection.count()
        print(f"   Baza de date ChromaDB încărcată cu succes. Conține {existing_count} documente.")
        if existing_count == 0:
            # The directory can exist while the collection is empty (for example
            # after an interrupted earlier run). Treat that as "not loaded" so the
            # store is populated below instead of being searched empty.
            print("   Colecția existentă este goală. Se va popula din CSV.")
            vectorstore = None
    except Exception as e:
        print(f"   EROARE la încărcarea bazei de date existente: {e}")
        print("   Se va încerca crearea unei baze de date noi.")
        vectorstore = None
else:
    print(f"\n3. Directorul '{CHROMA_DB_PATH}' nu există. Se va crea o bază de date nouă.")

if vectorstore is None:
    print(f"\n3. Creare bază de date nouă ChromaDB în '{CHROMA_DB_PATH}'...")
    print("   Încărcare documente din CSV...")
    start_time = time.time()
    try:
        # One document per CSV row; METADATA_COLUMNS are stored as metadata.
        loader = CSVLoader(
            file_path=dataset_csv_filename,
            source_column=SOURCE_COLUMN,
            metadata_columns=METADATA_COLUMNS,
            encoding='utf-8'
        )
        documents = loader.load()

        if not documents:
            raise ValueError("Nu s-au încărcat documente din CSV. Verificați fișierul.")

        print(f"   Am încărcat {len(documents)} documente din CSV.")

        data_to_index = documents

        print("   Generare embeddings și indexare în ChromaDB (acest pas poate dura)...")
        vectorstore = Chroma.from_documents(
            documents=data_to_index,
            embedding=embedding_function,
            persist_directory=CHROMA_DB_PATH,
            collection_name=COLLECTION_NAME
        )
        print(f"   Baza de date ChromaDB creată și salvată cu succes. Conține {vectorstore._collection.count()} documente.")

    except Exception as e:
        print(f"EROARE FATALĂ la încărcarea documentelor sau crearea bazei ChromaDB: {e}")
        exit()
    create_db_time = time.time() - start_time
    print(f"   Timp creare bază de date: {create_db_time:.2f} secunde.")


# Step 4: smoke-test the store with one fixed query and print the top matches.
print("\n4. Testare Căutare Semantică...")
if vectorstore:
    test_query = "solve linear equation 3x + 5 = 11"
    k_results = 3
    print(f"   Query de test: '{test_query}'")
    print(f"   Se caută cele mai similare {k_results} documente...")
    start_time = time.time()
    try:
        search_results = vectorstore.similarity_search(test_query, k=k_results)
        search_time = time.time() - start_time

        if search_results:
            print(f"   {len(search_results)} rezultate găsite în {search_time:.4f} secunde:")
            for i, doc in enumerate(search_results):

                print(f"      Rezultat {i+1}:")
                # Only the first 150 characters of the document text are shown.
                # NOTE(review): in the recorded output page_content starts with
                # "problem_cleaned: ", so the indexed text appears to include the
                # column name - confirm whether that prefix is intended.
                print(f"         Text (din '{SOURCE_COLUMN}'): {doc.page_content[:150]}...")
                print(f"         Metadate: {doc.metadata}")
        else:
            print("      Nu s-au găsit rezultate similare pentru query-ul de test.")

    except Exception as e:
        print(f"   EROARE la efectuarea căutării semantice: {e}")
else:
    print("   EROARE: Vector store-ul nu a fost inițializat corect. Testarea căutării nu este posibilă.")

print("\n--- Proces Creare/Încărcare Vector Store ChromaDB Finalizat ---")

INFO: Se va folosi CPU.
--- Start Creare/Încărcare Vector Store ChromaDB ---
Folosind fișierul CSV: 'math_python_dataset_curatat.csv'
Coloana sursă pentru embedding: 'problem_cleaned'
Coloane metadate: ['solution_cleaned']
Model embedding: 'all-MiniLM-L6-v2' pe dispozitivul 'cpu'
Calea bazei de date ChromaDB: './chroma_db_math'
Nume colecție: 'math_problems'

1. Verificare fișier CSV și coloane...
   Fișierul CSV și coloanele necesare există.

2. Inițializare model de embedding 'all-MiniLM-L6-v2'...


  embedding_function = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   Model de embedding inițializat. Va rula pe cpu.
   Timp inițializare model: 42.58 secunde.

3. Directorul './chroma_db_math' nu există. Se va crea o bază de date nouă.

3. Creare bază de date nouă ChromaDB în './chroma_db_math'...
   Încărcare documente din CSV...
   Am încărcat 491 documente din CSV.
   Generare embeddings și indexare în ChromaDB (acest pas poate dura)...
   Baza de date ChromaDB creată și salvată cu succes. Conține 491 documente.
   Timp creare bază de date: 25.90 secunde.

4. Testare Căutare Semantică...
   Query de test: 'solve linear equation 3x + 5 = 11'
   Se caută cele mai similare 3 documente...
   3 rezultate găsite în 0.0290 secunde:
      Rezultat 1:
         Text (din 'problem_cleaned'): problem_cleaned: let's start with a simple math problem. what is the solution to 3x + 10 = 20...
         Metadate: {'solution_cleaned': "<think>\nOkay, so I have this equation: 3x plus 10 equals 20. Hmm, let me try to solve for x. First, maybe I should get all the term

In [None]:
#Pasul 5

In [None]:

# Download and run the Ollama install script.
!curl -fsSL https://ollama.com/install.sh | sh

# Start the Ollama server in the background; stdout and stderr go to ollama.log.
!nohup ollama serve > ollama.log 2>&1 &


import time
# NOTE(review): a fixed 5-second pause does not confirm the server is actually up;
# check ollama.log if the commands below fail.
print("Așteptare 5 secunde pentru pornirea serverului Ollama...")
time.sleep(5)
print("Serverul Ollama ar trebui să fie pornit.")

# List the locally available models (empty in the recorded run).
!ollama list

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Așteptare 5 secunde pentru pornirea serverului Ollama...
Serverul Ollama ar trebui să fie pornit.
NAME    ID    SIZE    MODIFIED 


In [None]:

# Pull the deepseek-coder 6.7B model into the local Ollama store.
# NOTE(review): the "Model descărcat." message is printed unconditionally;
# the exit status of `ollama pull` is not checked.
print("Descărcare model deepseek-coder:6.7b (poate dura)...")
!ollama pull deepseek-coder:6.7b
print("Model descărcat.")

# List local models after the pull.
!ollama list

Descărcare model deepseek-coder:6.7b (poate dura)...
[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 59bb50d8116b...   0% ▕▏    0 B/3.8 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 59bb50d8116b...   0% ▕▏  90 KB/3.8 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 59bb50d8116b...   0% ▕▏ 6.7 MB/3.8 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 59bb50d8116b...   1% ▕▏  34 MB/3.8 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 59bb50d8116b...   2% ▕▏  67 MB/3.8 GB                  [K[?25h[?2026l[?2026h[?25l[

In [None]:
# Pull the mistral 7B model into the local Ollama store.
# NOTE(review): the message below is printed even if the pull fails.
!ollama pull mistral:7b
print("Model descărcat.")

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling ff82381e2bea...   0% ▕▏    0 B/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling ff82381e2bea...   0% ▕▏    0 B/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling ff82381e2bea...   0% ▕▏ 1.6 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling ff82381e2bea...   1% ▕▏  39 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling ff82381e2bea...   2% ▕▏  68 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling ff82381e2bea...   3% ▕▏ 108 MB/4.1 GB                  [K[?25h[?2026l

In [None]:
# Delete the two previously pulled models from the local Ollama store.
!ollama rm deepseek-coder:6.7b


!ollama rm mistral:7b


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hdeleted 'deepseek-coder:6.7b'
[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hdeleted 'mistral:7b'


In [None]:
# Pull the `phi` model into the local Ollama store.
# NOTE(review): the message below is printed even if the pull fails.
!ollama pull phi
print("Model descărcat.")

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 04778965089b...   0% ▕▏    0 B/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 04778965089b...   0% ▕▏    0 B/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 04778965089b...   0% ▕▏ 4.0 MB/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 04778965089b...   3% ▕▏  44 MB/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 04778965089b...   5% ▕▏  74 MB/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 04778965089b...   7% ▕▏ 119 MB/1.6 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 04778

In [None]:
# List local Ollama models.
# NOTE(review): the recorded output is "ollama: command not found", so this
# cell was presumably run in a runtime where the install cell had not been
# executed -- rerun the install cell first.
!ollama list

/bin/bash: line 1: ollama: command not found


In [None]:
# Install the LangChain OpenAI integration imported by the RAG cell below
# (`from langchain_openai import ChatOpenAI`).
!pip install langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.3.16-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain-openai)
  Downloading langchain_core-0.3.59-py3-none-any.whl.metadata (5.9 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading langchain_openai-0.3.16-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.59-py3-none-any.whl (437 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.7/437.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling coll

In [None]:
import pandas as pd
import time
import os
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# --- Configuration ---------------------------------------------------------
# NOTE: CSV_FILE_PATH and TEST_QUESTION_INDEX are not referenced in this cell.
CSV_FILE_PATH = "math_python_dataset_curatat.csv"
CHROMA_DB_PATH = "./chroma_db_math"
COLLECTION_NAME = "math_problems"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
TEST_QUESTION_INDEX = 5

# Run the embedding model on the GPU when PyTorch is installed and reports a
# CUDA device; otherwise fall back to the CPU.
try:
    import torch
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    DEVICE = "cpu"

# Fatal setup failures raise SystemExit with an explanatory message instead of
# calling a bare exit(): the cause is shown to the user, and execution of the
# cell is guaranteed to stop at the failing step (inside an IPython kernel
# exit() only requests a kernel shutdown and gives no diagnostic).
try:
    embedding_function = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': DEVICE},
        encode_kwargs={'normalize_embeddings': True},
    )
except Exception as e:
    raise SystemExit(
        f"Eroare la inițializarea modelului de embedding "
        f"'{EMBEDDING_MODEL_NAME}': {e}"
    ) from e

# This cell only opens an existing persisted database; it never builds one.
if not os.path.exists(CHROMA_DB_PATH):
    raise SystemExit(
        f"Directorul '{CHROMA_DB_PATH}' nu există. "
        f"Creați mai întâi baza de date ChromaDB."
    )

try:
    vectorstore = Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=embedding_function,
        collection_name=COLLECTION_NAME,
    )
except Exception as e:
    raise SystemExit(
        f"Eroare la încărcarea bazei de date ChromaDB din "
        f"'{CHROMA_DB_PATH}': {e}"
    ) from e

# Similarity search returning the 3 closest stored documents per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# --- LLM initialisation ------------------------------------------------------
# Resolve the OpenAI API key (Colab secrets first, process environment when
# not running in Colab), then build the chat model once. Any failure is
# remembered and reported on exit instead of being silently swallowed.
OPENAI_MODEL_NAME = "gpt-3.5-turbo"

llm = None
openai_api_key = None
_llm_init_error = None

try:
    from google.colab import userdata
except ImportError:
    # Not running in Colab: rely on a key already present in the environment.
    openai_api_key = os.environ.get("OPENAI_API_KEY")
else:
    try:
        openai_api_key = userdata.get('OPENAI_API_KEY')
    except Exception as e:
        # The secret lookup itself failed; keep the cause for the final message.
        _llm_init_error = e
    if openai_api_key:
        # Export the key so ChatOpenAI can pick it up from the environment.
        os.environ["OPENAI_API_KEY"] = openai_api_key

if openai_api_key:
    try:
        llm = ChatOpenAI(model_name=OPENAI_MODEL_NAME, temperature=0.1)
    except Exception as e:
        _llm_init_error = e

if llm is None:
    _reason = (
        _llm_init_error
        if _llm_init_error is not None
        else "cheia OPENAI_API_KEY lipsește"
    )
    # SystemExit (not a bare exit()) so the reason is displayed and the cell
    # stops here rather than continuing with llm set to None.
    raise SystemExit(f"Nu s-a putut inițializa modelul OpenAI: {_reason}")

# Prompt sent to the LLM. It has two placeholders: {context}, filled with the
# formatted retrieved examples, and {question}, filled with the user's problem
# statement. The text asks for Python code using NumPy or SymPy.
prompt_template = """
You are an expert Python programmer specialized in mathematical libraries like NumPy and SymPy.
Use the provided context (examples of similar math problems and their Python solutions) to generate Python code that solves or represents the given mathematical problem described in natural language.

Context:
---------------------
{context}
---------------------

Mathematical Problem / Question:
{question}

Instructions:
1. Analyze the problem description.
2. Use relevant examples from the context if helpful.
3. Generate concise, runnable Python code using NumPy or SymPy.
4. Add minimal comments if necessary.
5. If you cannot solve it, state that clearly.

Python Code:
"""

# Wrap the raw template; the declared input variables match its placeholders.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

def format_docs(docs):
    """Render retrieved documents as numbered examples for the prompt context.

    Each document contributes its ``page_content`` as the problem description
    and its ``solution_cleaned`` metadata entry (``'N/A'`` when the key is
    absent) as a fenced Python solution. Examples are numbered from 1 and
    separated by a blank line.

    Returns:
        The joined examples, or a fixed fallback sentence when ``docs``
        contains no documents.
    """
    def render(number, doc):
        problem = doc.page_content
        answer = doc.metadata.get('solution_cleaned', 'N/A')
        return (
            f"Example {number}:\n"
            f"Problem Description: {problem}\n"
            f"Solution:\n```python\n{answer}\n```"
        )

    examples = [render(number, doc) for number, doc in enumerate(docs, start=1)]
    if not examples:
        return "No relevant examples found in the context."
    return "\n\n".join(examples)

# Retrieval-augmented generation pipeline. The input question is sent to the
# retriever, whose documents are rendered by format_docs into "context", and
# is also passed through unchanged as "question". Both fill the prompt, which
# goes to the LLM; the reply is then parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# --- Ask for one problem, run the RAG chain on it, and report the outcome ----

# Read the question interactively. A closed stdin, Ctrl-C, or a blank answer
# ends the cell with an explicit message instead of a silent exit().
try:
    single_test_question = input(">>> Introduceți problema matematică (în limbaj natural) și apăsați Enter: ")
except (EOFError, KeyboardInterrupt):
    raise SystemExit("Nu a fost introdusă nicio întrebare.") from None

single_test_question = single_test_question.strip()
if not single_test_question:
    raise SystemExit("Întrebarea este goală; nu s-a generat nimic.")

start_testing_time = time.time()
generated_code = None
error_message = None
invoke_time = 0.0

start_invoke_time = time.time()
try:
    generated_code = rag_chain.invoke(single_test_question)
except Exception as e:
    # Some exceptions stringify to "", which would make the failure look like
    # "no code returned" below; fall back to repr(e) so it is still reported.
    error_message = str(e) or repr(e)
finally:
    # Measure the call duration on both the success and the failure path.
    invoke_time = time.time() - start_invoke_time

single_result = {
    "question": single_test_question,
    "generated_code": generated_code.strip() if generated_code else None,
    "error": error_message,
    "time_taken_invoke": invoke_time
}

end_testing_time = time.time()
total_testing_time = end_testing_time - start_testing_time

# single_result is always populated at this point, so it is reported directly.
print(f"Întrebare: {single_result['question']}")
if single_result['error']:
    print(f"EROARE: {single_result['error']}")
elif single_result['generated_code']:
    print(f"Cod Generat:\n```python\n{single_result['generated_code']}\n```")
else:
    print("Nu a fost returnat niciun cod.")
print(f"Timp Generare: {single_result['time_taken_invoke']:.2f}s")


ModuleNotFoundError: No module named 'langchain_openai'

In [None]:
# Upgrade the LangChain packages together, including langchain-openai, which
# the RAG cell above failed to import (ModuleNotFoundError in its output).
!pip install --upgrade langchain langchain-core langchain-community langchain-openai

Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Downloading langchain-0.3.25-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.3.24
    Uninstalling langchain-0.3.24:
      Successfully uninstalled langchain-0.3.24
Successfully installed langchain-0.3.25


Collecting langchain-openai
  Downloading langchain_openai-0.3.13-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.52 (from langchain-openai)
  Downloading langchain_core-0.3.52-py3-none-any.whl.metadata (5.9 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading langchain_openai-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.52-py3-none-any.whl (433 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.6/433.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling colle