In [1]:
import sys
import torch
import fitz
import re
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM
# from sentence_transformers import SentenceTransformer

# Local Paths
# MODEL_PATH ="/Users/sir/Downloads/HuggingFace/sentence_transformer/intfloat_e5-large-v2"
# LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.1-8B-Instruct"
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct"

# use mps if available, else cuda, else cpu
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: mps


In [2]:
# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
print(f"Loading Generator: {LLM_PATH}")

# This line will now work correctly
generator_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH, 
    device_map=DEVICE, # Automatically map to your M3 GPU
    dtype=torch.bfloat16, # Use bfloat16 for M3
    trust_remote_code=True
)
generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

Loading Generator: /Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct


### Summarization Example

In [3]:
text = """We survey 146 papers analyzing “bias” in
NLP systems, finding that their motivations
are often vague, inconsistent, and lacking
in normative reasoning, despite the fact that
analyzing “bias” is an inherently normative
process. We further find that these papers’
proposed quantitative techniques for measuring
or mitigating “bias” are poorly matched to
their motivations and do not engage with the
relevant literature outside of NLP. Based on
these findings, we describe the beginnings of a
path forward by proposing three recommendations
that should guide work analyzing “bias”
in NLP systems. These recommendations rest
on a greater recognition of the relationships
between language and social hierarchies,
encouraging researchers and practitioners
to articulate their conceptualizations of
“bias”—i.e., what kinds of system behaviors
are harmful, in what ways, to whom, and why,
as well as the normative reasoning underlying
these statements—and to center work around
the lived experiences of members of communities
affected by NLP systems, while interrogating
and reimagining the power relations
between technologists and such communities."""

In [4]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Clearly summarize the following text in one concise paragraph:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=700,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.06,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


--- GENERATED SUMMARY ---
Here is a concise summary of the text:  Researchers in the field of Natural
Language Processing (NLP) have found that many papers on bias in NLP systems are
vague, inconsistent, and lack a clear understanding of what constitutes bias. To
improve the field, the authors propose three recommendations: (1) researchers
should articulate their conceptualizations of bias and its impact on
communities, (2) work should center on the lived experiences of those affected
by NLP systems, and (3) researchers should interrogate and reimagine the power
dynamics between technologists and the communities they serve.


In [5]:
# --- Your Original Text (This is what E5 would find) ---
text = """
A UPS MD-11 plane crashed shortly after take-off near the Louisville, Kentucky, airport, according to the Federal Aviation Administration. UPS Flight 2976 crashed 
just after 5 p.m. local time and was headed to Daniel K. Inouye International Airport in Honolulu, according to a statement from the FAA, which is investigating the crash 
along with the National Transportation Safety Board. The NTSB will lead the investigation, the FAA said Tuesday.
Three crewmembers were on the plane, according to a statement from UPS that said in part, “At this time, we have not confirmed any injuries/casualties.”
Louisville Metro Police Department and other agencies are responding to the crash, LMPD said in an X post. Injuries have been reported, police said.
A massive plume of black smoke is rising not far from the tarmac at Louisville Muhammad Ali International Airport, videos from CNN affiliate WAVE show.
Louisville Muhammad Ali International Airport is the worldwide air hub for UPS. The company’s Worldport is more than 5 million square feet where more 
than 12,000 UPS employees process more than two million packages a day, according to the company.
A shelter-in-place has been issued for all locations within 5 miles of the airport, police added.
“LMPD and multiple other agencies are responding to reports of a plan crash near Fern Valley and Grade Lane,” the post said. “Grade lane will be 
closed indefinitely between Stooges and Crittenden.” The McDonnell Douglas MD-11F is a freight transport aircraft manufactured originally by McDonnell 
Douglas and later by Boeing. The aircraft is primarily flown by FedEx Express, Lufthansa Cargo and UPS Airlines for cargo.
The plane also served as a popular wide-bodied passenger airplane after it was first flown in 1990. The aircraft involved in Tuesday’s crash was built in 1991.
As fuel costs increased for the three engine jets many of them were converted to freighters. The plane can take off weighing in at a maximum 633,000 pounds and 
carrying more than 38,000 gallons of fuel, according to Boeing, which bought McDonnell Douglass.
"""

# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Clearly summarize the following text in one concise paragraph:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=700,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.07,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


--- GENERATED SUMMARY ---
Here is a concise summary of the text:  A UPS MD-11 plane crashed shortly after
take-off near Louisville, Kentucky, airport, killing three crew members and
injuring others. The plane, which was carrying over 2 million packages, was
headed to Honolulu International Airport when it crashed, and authorities are
investigating the cause. The plane was manufactured by Boeing and was originally
designed for passenger flights, but was converted to a freighter after fuel
costs increased.


In [6]:
# The question you want to ask
question = "What is the capital of United States?"

# Build the prompt using the Llama 3.1 template
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""


# Tokenize the prompt for the GENERATOR
# We must set pad_token_id to eos_token_id for Llama 3
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate text using the GENERATOR model
print("\nGenerating summary with Llama-3.1-Instruct...")
with torch.no_grad():
    # We must also pass the eos_token_id to stop generation
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,        
        do_sample=True,
        temperature=0.7,         # A good temperature for creative summary
        top_p=0.9,
        # Llama 3.1 uses <|eot_id|> as its end token
        eos_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



Generating summary with Llama-3.1-Instruct...

--- GENERATED SUMMARY ---
The capital of the United States is Washington, D.C. (short for District of
Columbia).


In [7]:
# --- PDF Path ---
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/A_Critical_Survey_of_Bias_in_NLP.pdf" 

# --- 1. PDF EXTRACTION FUNCTION ---
def get_text_from_pdf(pdf_path):
    """
    Extracts text from a local PDF file, starting after the Abstract/Metadata, 
    stopping before "References", and cleaning up citation noise and URLs.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF '{pdf_path}': {e}", file=sys.stderr)
        return None
        
    print(f"Reading full text from '{pdf_path}'...")
    text = "".join(page.get_text() for page in doc)
    doc.close()
    
    if not text:
        return ""

    # --- FIND THE START (Skip Abstract wording) ---
    start_pos = 0
    abstract_pattern = r'\babstract\b'
    abstract_match = re.search(abstract_pattern, text, re.IGNORECASE)
    
    if abstract_match:
        start_pos = abstract_match.end()
    else:
        print("Warning: 'Abstract' marker not found. Starting extraction from beginning.")
        
    core_text = text[start_pos:]
    
    # --- FIND THE END (Stop before "References") ---
    end_pos = len(core_text)
    references_pattern = r'\breferences\b'
    end_match = re.search(references_pattern, core_text, re.IGNORECASE)

    if end_match:
        end_pos = end_match.start()
        print(f"Extraction stop found immediately before: '{end_match.group(0)}'")
    else:
        print("Warning: 'References' section not found. Extracting until EOF.")

    final_text = core_text[:end_pos]
    
    
    # --- CLEANUP (Remove Citations, Links, and Fix Hyphenation) ---
    # 1. Patterns for Citation and Link Removal (Your existing good patterns)
    pattern_et_al = r'\s*\([^()]*et al\.[^()]*\)' 
    pattern_raw_url = r'https?:\/\/[^\s\)]+'
    pattern_markdown_link = r'\[https?:[^\]]*\]\([^\)]*\)' 
    combined_pattern_noise = f'({pattern_et_al})|({pattern_raw_url})|({pattern_markdown_link})'

    cleaned_text = re.sub(combined_pattern_noise, '', final_text)

    # 2. Target Hyphenation Artifacts (e.g., 'popu-larity' -> 'popularity')
    # Finds a letter, followed by a hyphen, followed optionally by spaces, followed by a letter.
    # Replaces it with the two letters joined (e.g., a-b becomes ab).
    pattern_broken_word = r'([a-zA-Z])-\s*([a-zA-Z])'
    cleaned_text = re.sub(pattern_broken_word, r'\1\2', cleaned_text)

    # 3. Final Whitespace Cleanup
    # This removes excess newlines/spaces (and any remaining hyphenation artifacts that had a space)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    print(f"Successfully extracted and cleaned {len(cleaned_text)} characters.")
    return cleaned_text

In [8]:
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/DataAugmentationApproachesforNLP.pdf" 
text = get_text_from_pdf(PDF_FILE_PATH)

# Open the PDF document
doc = fitz.open(PDF_FILE_PATH)

## Accessing Title and Author

# 1. Get the document's metadata dictionary
metadata = doc.metadata

# 2. Extract the 'title' and 'author' keys
title = metadata.get('title', 'N/A')
author = metadata.get('author', 'N/A')

# 3. Print the results
print(f"Document Title: {title}")
print(f"Document Author: {author}")

print(f"PDF File Path: {PDF_FILE_PATH}")

# Access text or other content from a page (0-based index)
page = doc.load_page(0)
text = page.get_text()
print("\n",text)

# Don't forget to close the document
doc.close()

doc.metadata


Reading full text from '/Users/sir/Downloads/Data/PDF/test/DataAugmentationApproachesforNLP.pdf'...
Extraction stop found immediately before: 'References'
Successfully extracted and cleaned 36325 characters.
Document Title: 
Document Author: 
PDF File Path: /Users/sir/Downloads/Data/PDF/test/DataAugmentationApproachesforNLP.pdf

 A Survey of Data Augmentation Approaches for NLP
Steven Y. Feng∗, 1 Varun Gangal∗, 1 Jason Wei†, 2 Sarath Chandar,3
Soroush Vosoughi,4 Teruko Mitamura,1 Eduard Hovy1
1Carnegie Mellon University, 2Google Research
3Mila - Quebec AI Institute, 4Dartmouth College
{syfeng,vgangal,teruko,hovy}@cs.cmu.edu
jasonwei@google.com
sarath.chandar@mila.quebec
soroush@dartmouth.edu
Abstract
Data augmentation has recently seen increased
interest in NLP due to more work in low-
resource domains, new tasks, and the popu-
larity of large-scale neural networks that re-
quire large amounts of training data.
De-
spite this recent upsurge, this area is still rel-
atively underexplore

{'format': 'PDF 1.5',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'LaTeX with hyperref',
 'producer': 'pdfTeX-1.40.21',
 'creationDate': 'D:20211203020921Z',
 'modDate': 'D:20211203020921Z',
 'trapped': '',
 'encryption': None}

In [9]:
text = textwrap.fill(text, width=120)
print(text)

A Survey of Data Augmentation Approaches for NLP Steven Y. Feng∗, 1 Varun Gangal∗, 1 Jason Wei†, 2 Sarath Chandar,3
Soroush Vosoughi,4 Teruko Mitamura,1 Eduard Hovy1 1Carnegie Mellon University, 2Google Research 3Mila - Quebec AI
Institute, 4Dartmouth College {syfeng,vgangal,teruko,hovy}@cs.cmu.edu jasonwei@google.com sarath.chandar@mila.quebec
soroush@dartmouth.edu Abstract Data augmentation has recently seen increased interest in NLP due to more work in low-
resource domains, new tasks, and the popu- larity of large-scale neural networks that re- quire large amounts of
training data. De- spite this recent upsurge, this area is still rel- atively underexplored, perhaps due to the chal-
lenges posed by the discrete nature of language data. In this paper, we present a comprehen- sive and unifying survey of
data augmenta- tion for NLP by summarizing the literature in a structured manner. We ﬁrst introduce and motivate data
augmentation for NLP, and then discuss major methodologically rep

In [10]:
# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
print(f"Loading Generator: {LLM_PATH}")

# This line will now work correctly
generator_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH, 
    device_map=DEVICE, # Automatically map to your M3 GPU
    dtype=torch.float32, # Use float32 for M3
    trust_remote_code=True
)
generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

Loading Generator: /Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct


In [11]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Summarize the following text:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# --- Tokenizer Configuration ---
# Ensure pad_token_id is set before tokenizing or generating
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

# Define the terminators for Llama 3.1
terminators = [
    generator_tokenizer.eos_token_id,
    generator_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# --- Generate text ---
print("\nGenerating summary with Llama-3.1-Instruct...")
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,        
        do_sample=True,
        # Increase temperature slightly for stability on bfloat16/MPS
        temperature=0.7,             # Standard, moderate randomness
        top_p=0.9,                  
        # Use the list of terminators for Llama 3.1
        eos_token_id=terminators,    # Use the list of terminators
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


Generating summary with Llama-3.1-Instruct...

--- GENERATED SUMMARY ---
Here's a summary of the text:  The paper discusses the concept of data
augmentation (DA) in Natural Language Processing (NLP), which refers to
techniques used to increase the diversity of training data without collecting
new data. Despite its growing interest, DA is still underexplored in the NLP
community, particularly in low-resource domains and tasks where large amounts of
training data are scarce.  The authors review the literature on DA, categorizing
it into three approaches: rule-based, example interpolation-based, and model-
based. They also discuss popular NLP applications and tasks that benefit from
DA, including:  * Low-resource languages * Mitigating bias * Fxing class
imbalance * Few-shot learning * Adversarial examples  The authors also highlight
the challenges associated with DA, including the discrete nature of language
data and the need to maintain invariance during training.  The paper aims to
cl

## New PDF

In [12]:
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/A_Critical_Survey_of_Bias_in_NLP.pdf" 
text = get_text_from_pdf(PDF_FILE_PATH)

Reading full text from '/Users/sir/Downloads/Data/PDF/test/A_Critical_Survey_of_Bias_in_NLP.pdf'...
Extraction stop found immediately before: 'References'
Successfully extracted and cleaned 39150 characters.


In [13]:
text = textwrap.fill(text, width=120)
print(text)

We survey 146 papers analyzing “bias” in NLP systems, ﬁnding that their motivations are often vague, inconsistent, and
lacking in normative reasoning, despite the fact that analyzing “bias” is an inherently normative process. We further
ﬁnd that these papers’ proposed quantitative techniques for measuring or mitigating “bias” are poorly matched to their
motivations and do not engage with the relevant literature outside of NLP. Based on these ﬁndings, we describe the
beginnings of a path forward by proposing three recommendations that should guide work analyzing “bias” in NLP systems.
These recommendations rest on a greater recognition of the relationships between language and social hierarchies,
encouraging researchers and practitioners to articulate their conceptualizations of “bias”—i.e., what kinds of system
behaviors are harmful, in what ways, to whom, and why, as well as the normative reasoning underlying these
statements—and to center work around the lived experiences of members 

In [14]:
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/A_Critical_Survey_of_Bias_in_NLP.pdf" 
doc = fitz.open(PDF_FILE_PATH)

doc.metadata['title'] = 'Language (Technology) is Power: A Critical Survey of “Bias” in NLP'
doc.metadata['author'] = 'Su Lin Blodgett, Solon Baracas, Hal Daumé III, Hanna Wallach'
doc.metadata['subject'] = 'A Critical Survey of Bias in NLP Systems'

## Accessing Title and Author

# 1. Get the document's metadata dictionary
metadata = doc.metadata

# 2. Extract the 'title' and 'author' keys
title = metadata.get('title', 'N/A')
author = metadata.get('author', 'N/A')

# 3. Print the results
print(f"Document Title: {title}")
print(f"Document Author: {author}")

print(f"PDF File Path: {PDF_FILE_PATH}")

# Access text or other content from a page (0-based index)
page = doc.load_page(0)
text = page.get_text()
print("\n",text)

# Don't forget to close the document
doc.close()

doc.metadata

Document Title: Language (Technology) is Power: A Critical Survey of “Bias” in NLP
Document Author: Su Lin Blodgett, Solon Baracas, Hal Daumé III, Hanna Wallach
PDF File Path: /Users/sir/Downloads/Data/PDF/test/A_Critical_Survey_of_Bias_in_NLP.pdf

 Language (Technology) is Power: A Critical Survey of “Bias” in NLP
Su Lin Blodgett
College of Information and Computer Sciences
University of Massachusetts Amherst
blodgett@cs.umass.edu
Solon Barocas
Microsoft Research
Cornell University
solon@microsoft.com
Hal Daumé III
Microsoft Research
University of Maryland
me@hal3.name
Hanna Wallach
Microsoft Research
wallach@microsoft.com
Abstract
We survey 146 papers analyzing “bias” in
NLP systems, ﬁnding that their motivations
are often vague, inconsistent, and lacking
in normative reasoning, despite the fact that
analyzing “bias” is an inherently normative
process.
We further ﬁnd that these papers’
proposed quantitative techniques for measur-
ing or mitigating “bias” are poorly matched to
their m

{'format': 'PDF 1.5',
 'title': 'Language (Technology) is Power: A Critical Survey of “Bias” in NLP',
 'author': 'Su Lin Blodgett, Solon Baracas, Hal Daumé III, Hanna Wallach',
 'subject': 'A Critical Survey of Bias in NLP Systems',
 'keywords': '',
 'creator': 'LaTeX with hyperref package',
 'producer': 'pdfTeX-1.40.17',
 'creationDate': 'D:20200601004146Z',
 'modDate': 'D:20200601004146Z',
 'trapped': '',
 'encryption': None}

In [15]:
# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Summarize the following text:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# --- Tokenizer Configuration ---
# Ensure pad_token_id is set before tokenizing or generating
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

# Define the terminators for Llama 3.1
terminators = [
    generator_tokenizer.eos_token_id,
    generator_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# --- Generate text ---
print("\nGenerating summary with Llama-3.1-Instruct...")
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,        
        do_sample=True,
        # Increase temperature slightly for stability on bfloat16/MPS
        temperature=0.7,             # Standard, moderate randomness
        top_p=0.9,                  
        # Use the list of terminators for Llama 3.1
        eos_token_id=terminators,    # Use the list of terminators
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)


Generating summary with Llama-3.1-Instruct...

--- GENERATED SUMMARY ---
The article "Language (Technology) is Power: A Critical Survey of "Bias" in NLP"
by Solon Barocas, Hal Daumé III, Hanna Wallach, and Solon Blodgett, explores the
concept of "bias" in natural language processing (NLP) systems. The authors find
that the field of NLP research on bias is often vague, inconsistent, and lacking
in normative reasoning, despite the fact that analyzing bias is a normative
process.  The authors conclude that papers on bias in NLP systems often fail to
articulate the motivations behind their findings and do not engage with relevant
literature outside of NLP. They propose three recommendations to improve the
field:  1. **Recognize the relationships between language and social
hierarchies**: The authors suggest that researchers and practitioners should
articulate their conceptualizations of "bias" in NLP systems, including what
kinds of system behaviors are harmful, in what ways, to whom, and

In [16]:
PDF_FILE_PATH = "/Users/sir/Downloads/Data/PDF/test/DeepBoost.pdf" 
doc = fitz.open(PDF_FILE_PATH)

## Accessing Title and Author

# 1. Get the document's metadata dictionary
metadata = doc.metadata

# 2. Extract the 'title' and 'author' keys
title = metadata.get('title', 'N/A')
author = metadata.get('author', 'N/A')

# 3. Print the results
print(f"Document Title: {title}")
print(f"Document Author: {author}")

print(f"PDF File Path: {PDF_FILE_PATH}")

# Access text or other content from a page (0-based index)
page = doc.load_page(0)
text = page.get_text()
print("\n",text)

# Don't forget to close the document
doc.close()

doc.metadata

Document Title: Deep Boosting
Document Author: Corinna Cortes, Mehryar Mohri, Umar Syed
PDF File Path: /Users/sir/Downloads/Data/PDF/test/DeepBoost.pdf

 Deep Boosting
Corinna Cortes
CORINNA@GOOGLE.COM
Google Research, 111 8th Avenue, New York, NY 10011
Mehryar Mohri
MOHRI@CIMS.NYU.EDU
Courant Institute and Google Research, 251 Mercer Street, New York, NY 10012
Umar Syed
USYED@GOOGLE.COM
Google Research, 111 8th Avenue, New York, NY 10011
Abstract
We present a new ensemble learning algorithm,
DeepBoost, which can use as base classiﬁers a
hypothesis set containing deep decision trees, or
members of other rich or complex families, and
succeed in achieving high accuracy without over-
ﬁtting the data. The key to the success of the al-
gorithm is a capacity-conscious criterion for the
selection of the hypotheses. We give new data-
dependent learning bounds for convex ensembles
expressed in terms of the Rademacher complexi-
ties of the sub-families composing the base clas-
siﬁer set, and the

{'format': 'PDF 1.4',
 'title': 'Deep Boosting',
 'author': 'Corinna Cortes, Mehryar Mohri, Umar Syed',
 'subject': '',
 'keywords': 'ensemble methods, learning theory, boosting',
 'creator': 'LaTeX with hyperref package',
 'producer': 'Mac OS X 10.9.2 Quartz PDFContext',
 'creationDate': "D:20140513025941Z00'00'",
 'modDate': "D:20140513025941Z00'00'",
 'trapped': '',
 'encryption': None}

In [17]:
text = get_text_from_pdf(PDF_FILE_PATH)
print("\n--- PDF TEXT ---")
text = textwrap.fill(text, width=120)
print(text)

Reading full text from '/Users/sir/Downloads/Data/PDF/test/DeepBoost.pdf'...
Extraction stop found immediately before: 'References'
Successfully extracted and cleaned 35873 characters.

--- PDF TEXT ---
We present a new ensemble learning algorithm, DeepBoost, which can use as base classiﬁers a hypothesis set containing
deep decision trees, or members of other rich or complex families, and succeed in achieving high accuracy without over-
ﬁtting the data. The key to the success of the algorithm is a capacityconscious criterion for the selection of the
hypotheses. We give new datadependent learning bounds for convex ensembles expressed in terms of the Rademacher
complexities of the subfamilies composing the base classiﬁer set, and the mixture weight assigned to each subfamily. Our
algorithm directly beneﬁts from these guarantees since it seeks to minimize the corresponding learning bound. We give a
full description of our algorithm, including the details of its derivation, and report the 

In [18]:
# Local LLM_PATH
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct"

# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
print(f"Loading Generator: {LLM_PATH}")

# This line will now work correctly
generator_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH, 
    device_map=DEVICE, # Automatically map to your M3 GPU
    dtype=torch.float32, # Use float32 for M3
    trust_remote_code=True
)
generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

# prompt construction
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Summarize the following text:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# --- Tokenizer Configuration ---
# Ensure pad_token_id is set before tokenizing or generating
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

# Define the terminators for Llama 3.1
terminators = [
    generator_tokenizer.eos_token_id,
    generator_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# --- Generate text ---
print("\nGenerating summary with Llama-3.1-Instruct...")
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,        
        do_sample=True,
        # Increase temperature slightly for stability on bfloat16/MPS
        temperature=0.7,             # Standard, moderate randomness
        top_p=0.9,                  
        # Use the list of terminators for Llama 3.1
        eos_token_id=terminators,    # Use the list of terminators
        pad_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)

Loading Generator: /Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.2-1B-Instruct

Generating summary with Llama-3.1-Instruct...

--- GENERATED SUMMARY ---
The text you provided is a research paper that presents a theoretical analysis
of a new ensemble learning algorithm called DeepBoost, which is designed to use
a hypothesis set of increasing complexity, including very deep or complex
hypotheses, to improve the accuracy of machine learning models. The paper also
reports the results of experiments with DeepBoost on several datasets, including
UCI datasets and the MNIST dataset.  Here's a summary of the key points:  **Key
contributions:**  1. **Theoretical analysis:** The paper provides a theoretical
analysis of the DeepBoost algorithm, which is based on a hypothesis set of
increasing complexity, including very deep or complex hypotheses. 2. **Learning
bounds:** The paper derives new learning bounds for the DeepBoost algorithm,
which are based on the Rademacher complexity of the hypoth