In [None]:
!pip install pymupdf
!pip install sentence-transformers
!pip install chromadb
!pip install transformers accelerate einops bitsandbytes

import torch

# Release cached GPU memory and reset the peak-usage counters left by earlier runs.
# Guarded so the cell also runs on a runtime without CUDA, where resetting the
# CUDA statistics would otherwise raise.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print("GPU memory cleared.")
else:
    print("No CUDA device available; nothing to clear.")


Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.6
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-many

In [None]:
import os

from google.colab import drive

# Mount Google Drive so the source PDFs are reachable from the Colab runtime.
drive.mount('/content/drive')

# Location of the PDFs inside the mounted Drive.
PDF_DIR = "/content/drive/MyDrive/RAG_PDFS"

# Working folders of the project, created relative to the current directory.
for project_dir in ("data/chunks", "data/dataset", "db/chroma"):
    os.makedirs(project_dir, exist_ok=True)

print("Rutas listas.")


ValueError: mount failed

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Extract all the text of a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the plain text of every page, concatenated in
        page order with no separator between pages.
    """
    # The context manager closes the document (and its file handle) even if a
    # page fails to extract; the original left the document open.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of repeated "+=".
        return "".join(page.get_text("text") for page in doc)


In [None]:
import re

def clean_text(text):
    """
    Basic text cleanup: turns line breaks into spaces, collapses every run of
    whitespace into a single space and trims both ends.
    """
    without_newlines = " ".join(text.split("\n"))
    collapsed = re.sub(r"\s+", " ", without_newlines)
    return collapsed.strip()

In [None]:
def chunk_text(text, chunk_size=350, overlap=75):
    """
    Split text into fixed-size chunks of words with overlap.

    Args:
        text: Text to split; words are separated on whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of chunks, each one a string of space-joined words. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If chunk_size or overlap are out of range. Without this
            check, overlap >= chunk_size would never advance and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, stop: any further chunk would only
        # repeat the overlap tail that this chunk already contains.
        if end >= len(words):
            break

    return chunks


In [None]:
import json
from tqdm import tqdm
import os

# Build the chunk dataset: one record per chunk of every PDF found in PDF_DIR.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the dataset order reproducible between runs. Filtering before the loop
# also makes the progress bar count only the files that are actually processed.
pdf_files = sorted(
    name for name in os.listdir(PDF_DIR) if name.lower().endswith(".pdf")
)

dataset = []

for pdf in tqdm(pdf_files):
    pdf_path = os.path.join(PDF_DIR, pdf)

    # extract -> clean -> chunk
    chunks = chunk_text(clean_text(extract_text_from_pdf(pdf_path)))

    # chunk_id combines the file name and the chunk position inside that file.
    dataset.extend(
        {"doc_id": pdf, "chunk_id": f"{pdf}_{i}", "text": ch}
        for i, ch in enumerate(chunks)
    )

# Persist the dataset as JSON Lines (one record per line); ensure_ascii=False
# keeps accented characters readable in the file.
with open("data/dataset/dataset.jsonl", "w", encoding="utf-8") as f:
    for item in dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Dataset creado con", len(dataset), "chunks.")


In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model identifier, kept in a constant so it is easy to swap.
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Modelo BGE-M3 cargado.")


In [None]:
import chromadb
from chromadb.config import Settings

# On-disk location of the vector store and name of the collection it holds.
CHROMA_PATH = "db/chroma"
COLLECTION_NAME = "dane_docs"

# Persistent client with anonymized telemetry switched off.
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(path=CHROMA_PATH, settings=chroma_settings)

# Reuse the collection when it already exists; the index is configured for
# cosine distance.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)

print("ChromaDB lista.")


In [None]:
from tqdm import tqdm

# Embed every chunk of the dataset and store it in the Chroma collection.
#
# Changes with respect to a one-chunk-at-a-time loop followed by a single add():
#   * texts are encoded in batches, which is much faster on a GPU;
#   * records are written in bounded batches, so a large dataset does not exceed
#     the maximum batch size accepted by Chroma, and an empty dataset is a no-op;
#   * upsert() replaces records whose id already exists, so re-running the cell
#     after re-chunking does not leave stale text/embeddings in the collection.
EMBED_BATCH_SIZE = 32    # texts per forward pass of the embedding model
INDEX_BATCH_SIZE = 512   # records per write to the collection

ids = [item["chunk_id"] for item in dataset]
texts = [item["text"] for item in dataset]
metadatas = [{"doc_id": item["doc_id"]} for item in dataset]
embeddings = []

for start in tqdm(range(0, len(dataset), INDEX_BATCH_SIZE)):
    stop = start + INDEX_BATCH_SIZE

    batch_embeddings = model.encode(
        texts[start:stop],
        batch_size=EMBED_BATCH_SIZE,
        normalize_embeddings=True,
        show_progress_bar=False,  # tqdm above already reports progress
    ).tolist()
    embeddings.extend(batch_embeddings)

    collection.upsert(
        ids=ids[start:stop],
        embeddings=batch_embeddings,
        documents=texts[start:stop],
        metadatas=metadatas[start:stop],
    )

print("Embeddings cargados:", len(ids))


In [None]:
def search(query, k=5):
    """
    Retrieve the k stored chunks closest to the query.

    The query is embedded with the same model (and normalization) used for the
    chunks, and the raw result of the collection query is returned unchanged.
    """
    query_vector = model.encode(query, normalize_embeddings=True)
    return collection.query(
        query_embeddings=[query_vector.tolist()],
        n_results=k,
    )

In [None]:
# Quick retrieval check with a sample question; as the last expression of the
# cell, the raw query result is displayed.
search("¿Cómo calcula el DANE la tasa de desempleo?")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hugging Face identifier of the instruction-tuned model used to write answers.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit quantization settings: NF4 quantization type, float16 for computation.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Tokenizer that matches the model; used to encode prompts and decode outputs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Per-device memory limits handed to the model loader.
# Adjust these values to the GPU memory and system RAM actually available.
# Current values: 14GB for GPU 0, 30GB for CPU.
max_memory = {0: "14GB", "cpu": "30GB"}

# Load the quantized model; device_map="auto" leaves weight placement to the
# loader, within the max_memory limits above.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    max_memory=max_memory, # Explicit memory limits for accelerate
    dtype=torch.float16 # `dtype` is used instead of the deprecated `torch_dtype`
)

def rag_answer(query, k=5):
    """
    Answer a question with the LLM, grounded on the retrieved chunks.

    Args:
        query: Question in natural language.
        k: Number of chunks to retrieve and place in the prompt.

    Returns:
        The generated answer text only (up to 100 new tokens), with surrounding
        whitespace stripped. The prompt and the retrieved context are not
        echoed back.
    """
    results = search(query, k)
    # "documents" holds one list of chunks per query; a single query was sent.
    retrieved_chunks = results["documents"][0]

    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
Eres un asistente experto en economía laboral de Colombia.
Usa SOLO la información recuperada del DANE.

Pregunta:
{query}

Información recuperada:
{context}

Respuesta:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    output = llm.generate(**inputs, max_new_tokens=100)

    # generate() returns the prompt tokens followed by the continuation.
    # Decoding the whole sequence would return the full prompt plus the answer,
    # so only the tokens after the prompt are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    answer_tokens = output[0][prompt_length:]

    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

In [None]:
import torch

def rag_answer_safe(query, k=3):
    """
    Memory-conscious variant of the RAG answer: fewer chunks, a shorter
    generation and an explicit release of the GPU cache afterwards.

    Args:
        query: Question in natural language.
        k: Number of chunks to retrieve and place in the prompt.

    Returns:
        The generated answer text only (up to 50 new tokens), with surrounding
        whitespace stripped. The prompt and the retrieved context are not
        echoed back.
    """
    # 1. Retrieve the supporting chunks ("documents" has one list per query).
    results = search(query, k)
    retrieved_chunks = results["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    # 2. Build the prompt.
    prompt = f"""
Eres un asistente experto en economía laboral de Colombia.
Usa SOLO la información recuperada del DANE.

Pregunta:
{query}

Información recuperada:
{context}

Respuesta:
"""

    # 3. Generate with gradient tracking disabled.
    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
        output = llm.generate(**inputs, max_new_tokens=50)

        # generate() returns the prompt tokens followed by the continuation;
        # decode only the continuation so the answer does not repeat the prompt.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        ).strip()

    # 4. Drop the tensor references and release the cached GPU memory.
    del inputs
    del output
    torch.cuda.empty_cache()

    return answer


In [None]:


# End-to-end check of the RAG pipeline with a methodology question.
rag_answer_safe("¿Cómo calcula el DANE la tasa de desempleo?")

In [None]:
# End-to-end check with a factual question about a specific month.
rag_answer_safe("¿Cual fue la tasa de desempleo en septiembre 2025?")