In [1]:
import sys
import torch
import fitz
import re
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM
# path to user functions
sys.path.append("../Src")
import read_pdf

# reload user functions
from importlib import reload

# PDF File Path
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/Matryoshka Representation Learning.pdf" 

# LLM Local Paths
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct"

# use mps if available, else cuda, else cpu
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: mps


In [2]:
# Read PDF and extract text
text = read_pdf.get_text_from_pdf(PDF_FILE_PATH)

# print original length
print(len(text))
text = textwrap.fill(text, width=180)
print(text)

Reading full text from '/Users/sir/Downloads/Data/PDF/test/Matryoshka Representation Learning.pdf'...
Extraction stop found immediately before: 'References'
Successfully extracted and cleaned 39716 characters.
39716
Learned representations are a central component in modern ML systems, serving a multitude of downstream tasks. When training such representations, it is often the case that
computational and statistical constraints for each downstream task are unknown. In this context, rigid fixedcapacity representations can be either over or underaccommodating to the
task at hand. This leads us to ask: can we design a flexible representation that can adapt to multiple downstream tasks with varying computational resources? Our main contribution
is Matryoshka Representation Learning (MRL) which encodes information at different granularities and allows a single embedding to adapt to the computational constraints of
downstream tasks. MRL minimally modifies existing representation learning pipe

In [3]:
# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
print(f"Loading Generator: {LLM_PATH}")

# This line will now work correctly
generator_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH, 
    device_map=DEVICE, # Automatically map to your M3 GPU
    dtype=torch.bfloat16, # Use bfloat16 for M3
    trust_remote_code=True
)
generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

Loading Generator: /Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct


In [4]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Clearly summarize the following text:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=700,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.06,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


--- GENERATED SUMMARY ---
The text appears to be a research paper on a new representation learning
approach called Matryoshka Representation Learning (MRL). Here's a summary of
the main points:  **Introduction**  The paper introduces Matryoshka
Representation Learning (MRL), a new approach to representation learning that
encodes information at multiple granularities in a single embedding vector. This
allows for adaptive deployment of representations across various tasks, such as
classification and retrieval.  **Background**  The authors discuss the
limitations of existing representation learning approaches, which often require
multiple neural networks with varying capacities to achieve good performance.
They argue that this leads to high computational costs and memory requirements.
**Methodology**  MRL is a novel approach that encodes information at multiple
granularities in a single embedding vector. The approach involves learning a set
of nested representations, where each dimension