In [0]:
%pip install transformers pymupdf scikit-learn hf_xet

Python interpreter will be restarted.
Collecting hf_xet
  Downloading hf_xet-1.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (54.0 MB)
Installing collected packages: hf-xet
Successfully installed hf-xet-1.0.5
Python interpreter will be restarted.


In [0]:
%pip install torch

Python interpreter will be restarted.
Collecting torch
  Downloading torch-2.7.0-cp39-cp39-manylinux_2_28_x86_64.whl (865.2 MB)
Collecting nvidia-cufft-cu12==11.3.0.4
  Downloading nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (200.2 MB)
Collecting nvidia-nvjitlink-cu12==12.6.85
  Downloading nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (19.7 MB)
Collecting nvidia-cuda-cupti-cu12==12.6.80
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.9 MB)
Collecting nvidia-cuda-runtime-cu12==12.6.77
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (897 kB)
Collecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
Collecting nvidia-curand-cu12==10.3.7.77
  Downloading nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (56.3 MB)
Collecting typing-extensions>=4.10.0
 

In [0]:
import fitz  # PyMuPDF
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch


In [0]:
# Simulated extracted text (pretend it came from a PDF).
# Each string is one standalone passage. This list is the corpus that the
# notebook embeds with embed_texts() and later searches with
# search_similar_texts().
texts = [
    "Machine learning is a field of artificial intelligence that focuses on teaching computers to learn patterns.",
    "Databricks is a cloud platform for big data and machine learning workflows.",
    "Transformers are a type of deep learning model based on self-attention mechanisms.",
    "Retrieval-augmented generation improves LLMs by letting them fetch facts from a knowledge base.",
    "PyTorch is an open-source machine learning framework based on the Torch library."
]


In [0]:
def embed_texts(texts, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
    """Encode texts into mean-pooled transformer embeddings.

    Args:
        texts: Iterable of strings to embed. A bare string is treated as a
            single text rather than being iterated character by character.
        model_name: Hugging Face model identifier used for both the tokenizer
            and the encoder.
        batch_size: Number of texts encoded per forward pass. Must be >= 1.

    Returns:
        A 2-D ``numpy`` array with one row per input text, in input order.
        For empty input the result has shape ``(0, hidden_size)`` so it can
        still be passed to pairwise-similarity code.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            token_states = model(**inputs).last_hidden_state

            # Texts in a batch are padded to a common length, so the mean must
            # be taken over real tokens only. For a single unpadded text this
            # is the same as a plain mean over the sequence dimension.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
            summed = (token_states * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled_batches.append((summed / token_counts).numpy())

    return np.concatenate(pooled_batches, axis=0)


In [0]:
# Embed the whole corpus once up front; the resulting array (one row per entry
# in `texts`) is reused for every query passed to search_similar_texts().
embeddings = embed_texts(texts)

In [0]:
def search_similar_texts(query, texts, embeddings, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=3):
    """Return the passages most similar to ``query`` by cosine similarity.

    Args:
        query: The search string.
        texts: Sequence of passages; ``texts[i]`` corresponds to
            ``embeddings[i]``.
        embeddings: 2-D array of passage embeddings, one row per passage.
        model_name: Hugging Face model identifier used to embed the query.
            It should match the model that produced ``embeddings``.
        top_k: Maximum number of passages to return.

    Returns:
        A list of at most ``top_k`` passages, most similar first. An empty
        list is returned when ``top_k`` is not positive or ``texts`` is empty.
    """
    # Guard the degenerate cases explicitly. A slice such as `[-0:]` selects
    # the whole array, so top_k=0 would otherwise return every passage.
    if top_k <= 0 or len(texts) == 0:
        return []

    # Embed the query through embed_texts so that query and corpus vectors are
    # always tokenised and pooled by the same code path.
    query_embedding = np.asarray(embed_texts([query], model_name=model_name)).reshape(1, -1)

    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Sort ascending, reverse to descending, then keep the first top_k indices.
    ranked_indices = similarities.argsort()[::-1][:top_k]

    return [texts[i] for i in ranked_indices]


In [0]:
# Run one example query against the embedded corpus and list the retrieved
# passages, best match first.
query = "What does Databricks do?"
top_results = search_similar_texts(query, texts, embeddings)

print("Top Retrieved Passages:")
for i, passage in enumerate(top_results):
    # One print call emits the numbered passage followed by the divider line.
    print(f"{i+1}. {passage}", "-" * 50, sep="\n")


Top Retrieved Passages:
1. Databricks is a cloud platform for big data and machine learning workflows.
--------------------------------------------------
2. PyTorch is an open-source machine learning framework based on the Torch library.
--------------------------------------------------
3. Machine learning is a field of artificial intelligence that focuses on teaching computers to learn patterns.
--------------------------------------------------


In [0]:
from transformers import pipeline

# Build a question-answering pipeline backed by a small DistilBERT checkpoint.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
)

# The retrieved passages, joined with spaces, form the context that the
# pipeline answers the query from.
context = " ".join(top_results)
result = qa_pipeline(question=query, context=context)

print("Generated Answer:", result['answer'], sep="\n")


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


Generated Answer:
a cloud platform for big data and machine learning workflows
