In [1]:
import sys
import torch
import fitz
import re
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM
# path to user functions
sys.path.append("../Src")
import read_pdf

# reload user functions
from importlib import reload

# PDF File Path
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/Matryoshka Representation Learning.pdf" 

# LLM Local Paths
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct"

# use mps if available, else cuda, else cpu
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: mps


In [2]:
# Read PDF and extract text
text = read_pdf.get_text_from_pdf(PDF_FILE_PATH)

# print original length
print(len(text))
text = textwrap.fill(text, width=180)
print(text)

Reading full text from '/Users/sir/Downloads/Data/PDF/test/Matryoshka Representation Learning.pdf'...
Extraction stop found immediately before: 'References'
Successfully extracted and cleaned 39716 characters.
39716
Learned representations are a central component in modern ML systems, serving a multitude of downstream tasks. When training such representations, it is often the case that
computational and statistical constraints for each downstream task are unknown. In this context, rigid fixedcapacity representations can be either over or underaccommodating to the
task at hand. This leads us to ask: can we design a flexible representation that can adapt to multiple downstream tasks with varying computational resources? Our main contribution
is Matryoshka Representation Learning (MRL) which encodes information at different granularities and allows a single embedding to adapt to the computational constraints of
downstream tasks. MRL minimally modifies existing representation learning pipe

In [14]:
# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
print(f"Loading Generator: {LLM_PATH}")

# This line will now work correctly
generator_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH, 
    device_map=DEVICE, # Automatically map to your M3 GPU
    dtype=torch.bfloat16, # Use bfloat16 for M3
    trust_remote_code=True
)
generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)


# Check model context length
print("Model context length:", generator_model.config.max_position_embeddings)

Loading Generator: /Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct
Model context length: 131072


In [None]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Clearly summarize the following text in detail in a concise manner and 
avoid personal opinions or commentary including extraneous information:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=10000,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.06,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


--- GENERATED SUMMARY ---
Here is a concise summary of the text in detail:  **Introduction**  The text
discusses the concept of Matryoshka Representation Learning (MRL), a method for
learning flexible and adaptive representations that can be used for various
downstream tasks, such as classification and retrieval. MRL is a technique that
encodes information at multiple granularities in a single embedding vector,
enabling adaptive deployment in various environments.  **Background**  The text
reviews the existing representation learning techniques, including supervised
and unsupervised learning paradigms, and their limitations. It highlights the
need for a flexible and adaptive representation learning approach that can
handle varying computational resources and accuracy constraints.
**Methodology**  The text introduces Matryoshka Representation Learning (MRL), a
method that encodes information at different granularities in a single embedding
vector. The method is based on the idea of nes

In [None]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Summarize the following text in no more than ten sentences, using a neutral 
and objective tone. Make sure the summary is clear, concise, and avoids personal 
opinions or commentary

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=10000,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.06,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


--- GENERATED SUMMARY ---
Here is a summary of the text in ten sentences:  Matryoshka Representation
Learning (MRL) is a method for learning flexible representations that can adapt
to multiple downstream tasks with varying computational resources. The method
encodes information at different granularities and allows for a single embedding
to adapt to the computational constraints of downstream tasks. MRL minimizes the
computational cost of inference and deployment by learning coarsetofine
representations that are as accurate as independently trained low-dimensional
representations. The method is designed to be adaptable to various
representation learning frameworks, including supervised and unsupervised
learning. MRL can be used for largescale adaptive classification and retrieval,
and has been evaluated on several benchmarks, including ImageNet-1K and
ImageNet-4K. The method has been shown to be more accurate than fixed-feature
representations, even at lower dimensions. MRL has also b

### New Paper

In [10]:
# PDF File Path
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/Unlearn Dataset Bias in Natural Language Inference.pdf" 

# Read PDF and extract text
text = read_pdf.get_text_from_pdf(PDF_FILE_PATH)

# print original length
print(len(text))
text = textwrap.fill(text, width=180)
print(text)

Reading full text from '/Users/sir/Downloads/Data/PDF/test/Unlearn Dataset Bias in Natural Language Inference.pdf'...
Extraction stop found immediately before: 'References'
Successfully extracted and cleaned 36017 characters.
36017
Statistical natural language inference (NLI) models are susceptible to learning dataset bias: superﬁcial cues that happen to associate with the label on a particular dataset, but
are not useful in general, e.g., negation words indicate contradiction. As exposed by several recent challenge datasets, these models perform poorly when such association is
absent, e.g., predicting that “I love dogs.” contradicts “I don’t love cats.”. Our goal is to design learning algorithms that guard against known dataset bias. We formalize the
concept of dataset bias under the framework of distribution shift and present a simple debiasing algorithm based on residual ﬁtting, which we call DRiFt. We ﬁrst learn a biased
model that only uses features that are known to relate to dat

In [11]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Summarize the following text in no more than ten sentences, using a neutral 
and objective tone. Make sure the summary is clear, concise, and avoids personal 
opinions or commentary

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=700,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.06,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


--- GENERATED SUMMARY ---
Here is a summary of the text in ten sentences:  The paper discusses the problem
of dataset bias in natural language inference (NLI) models, which can lead to
poor performance on challenging datasets. Dataset bias occurs when the model is
trained on a dataset that is not representative of the real-world data
distribution. The authors propose a debiasing algorithm called DRiFt, which
learns a debiased model by first learning a biased model on a training dataset
and then removing the biased features from the debiased model. The debiased
model is then used to predict the test data. The authors analyze the behavior of
DRiFt using the cross-entropy loss function and show that it adjusts the
gradient on each example depending on how well it is predicted by the biased
model. The authors also evaluate DRiFt on two benchmark datasets, SNLI and MNLI,
and show that it outperforms a baseline model on the challenge datasets. They
also evaluate the method on a synthetic da

In [26]:
input['input_ids'].shape

torch.Size([1, 9596])

In [28]:
full_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Full text:", full_text)


Full text: user

Summarize the following text in no more than ten sentences, using a neutral 
and objective tone. Make sure the summary is clear, concise, and avoids personal 
opinions or commentary

Statistical natural language inference (NLI) models are susceptible to learning dataset bias: superﬁcial cues that happen to associate with the label on a particular dataset, but
are not useful in general, e.g., negation words indicate contradiction. As exposed by several recent challenge datasets, these models perform poorly when such association is
absent, e.g., predicting that “I love dogs.” contradicts “I don’t love cats.”. Our goal is to design learning algorithms that guard against known dataset bias. We formalize the
concept of dataset bias under the framework of distribution shift and present a simple debiasing algorithm based on residual ﬁtting, which we call DRiFt. We ﬁrst learn a biased
model that only uses features that are known to relate to dataset bias. Then, we train a debi

In [21]:
type(output_token_ids), output_token_ids.shape

(torch.Tensor, torch.Size([327]))

In [15]:
response_only

'Here is a summary of the text in ten sentences:\n\nThe paper discusses the problem of dataset bias in natural language inference (NLI) models, which can lead to poor performance on challenging datasets. Dataset bias occurs when the model is trained on a dataset that is not representative of the real-world data distribution. The authors propose a debiasing algorithm called DRiFt, which learns a debiased model by first learning a biased model on a training dataset and then removing the biased features from the debiased model. The debiased model is then used to predict the test data. The authors analyze the behavior of DRiFt using the cross-entropy loss function and show that it adjusts the gradient on each example depending on how well it is predicted by the biased model. The authors also evaluate DRiFt on two benchmark datasets, SNLI and MNLI, and show that it outperforms a baseline model on the challenge datasets. They also evaluate the method on a synthetic dataset bias and find that

In [17]:
print(raw_output)

Here is a summary of the text in ten sentences:

The paper discusses the problem of dataset bias in natural language inference (NLI) models, which can lead to poor performance on challenging datasets. Dataset bias occurs when the model is trained on a dataset that is not representative of the real-world data distribution. The authors propose a debiasing algorithm called DRiFt, which learns a debiased model by first learning a biased model on a training dataset and then removing the biased features from the debiased model. The debiased model is then used to predict the test data. The authors analyze the behavior of DRiFt using the cross-entropy loss function and show that it adjusts the gradient on each example depending on how well it is predicted by the biased model. The authors also evaluate DRiFt on two benchmark datasets, SNLI and MNLI, and show that it outperforms a baseline model on the challenge datasets. They also evaluate the method on a synthetic dataset bias and find that it