In [1]:
import textwrap
from pathlib import Path

import fitz  # PyMuPDF
import torch
from sentence_transformers import InputExample, SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:  # older LangChain releases expose the splitter here
    from langchain.text_splitter import RecursiveCharacterTextSplitter

# Local checkpoint locations
# MODEL_PATH ="/Users/sir/Downloads/HuggingFace/sentence_transformer/intfloat_e5-large-v2"
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.1-8B-Instruct"

# Compute device, in order of preference: Apple MPS, then CUDA, then CPU.
DEVICE = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {DEVICE}")

Using device: mps


In [None]:
# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
# Hub identifier of the checkpoint, kept for reference only: the weights are
# read from the local directory LLM_PATH, not downloaded.
generator_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
print(f"Loading Generator: {LLM_PATH}")

# Load the causal LM in bfloat16 and place all of its weights on DEVICE.
# `trust_remote_code` is deliberately not enabled: the Llama architecture ships
# with transformers, so there is no need to let a checkpoint directory run its
# own Python code.
generator_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH,
    device_map=DEVICE,
    dtype=torch.bfloat16,
)
generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading Generator: /Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.1-8B-Instruct


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
# The generator may already be in memory from an earlier cell. Loading the
# checkpoint a second time only costs time and memory, so it is loaded here
# only when it is missing.
if "generator_model" not in globals() or "generator_tokenizer" not in globals():
    print(f"Loading Local LLM: {LLM_PATH}")
    generator_model = AutoModelForCausalLM.from_pretrained(
        LLM_PATH,
        device_map=DEVICE,
        dtype=torch.bfloat16,
    )
    generator_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

# --- MODEL 2: THE "RETRIEVER" (E5 for searching) ---
# The retriever is a sentence-embedding model with its own checkpoint; it must
# not be loaded from the Llama directory.
MODEL_PATH = "/Users/sir/Downloads/HuggingFace/sentence_transformer/intfloat_e5-large-v2"
print(f"Loading Retriever: {MODEL_PATH}")

# SentenceTransformer documents `device` as a string such as "mps" or "cuda".
retriever_model = SentenceTransformer(MODEL_PATH, device=str(DEVICE))
print("\n--- Models Loaded Successfully ---")

### Summarization Example

In [None]:
# Sample passage that the next cell inserts into its summarization prompt.
text = """We survey 146 papers analyzing “bias” in
NLP systems, finding that their motivations
are often vague, inconsistent, and lacking
in normative reasoning, despite the fact that
analyzing “bias” is an inherently normative
process. We further find that these papers’
proposed quantitative techniques for measuring
or mitigating “bias” are poorly matched to
their motivations and do not engage with the
relevant literature outside of NLP. Based on
these findings, we describe the beginnings of a
path forward by proposing three recommendations
that should guide work analyzing “bias”
in NLP systems. These recommendations rest
on a greater recognition of the relationships
between language and social hierarchies,
encouraging researchers and practitioners
to articulate their conceptualizations of
“bias”—i.e., what kinds of system behaviors
are harmful, in what ways, to whom, and why,
as well as the normative reasoning underlying
these statements—and to center work around
the lived experiences of members of communities
affected by NLP systems, while interrogating
and reimagining the power relations
between technologists and such communities."""

In [None]:
# Summarization request in the Llama 3.1 chat layout: a blank line separates
# each role header from the message that follows it.
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Clearly summarize the following text in one concise paragraph:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# The prompt already begins with <|begin_of_text|>, so the tokenizer must not
# prepend a second BOS token (add_special_tokens=False).
# NOTE: `input` shadows the Python builtin of the same name; the variable name
# is left unchanged so nothing else in the notebook has to change.
input = generator_tokenizer(
    prompt, return_tensors="pt", add_special_tokens=False
).to(DEVICE)

# Generate with the models loaded above (generator_model / generator_tokenizer);
# gradients are not needed for inference.
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=500,         # Upper bound on generated tokens
        do_sample=True,             # Sampling enabled...
        temperature=0.01,           # ...but at a very low temperature
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id,
    )

In [None]:
# Full decoded sequence: the prompt followed by the model's reply.
response_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

# generate() returns the prompt tokens followed by the newly generated ones, so
# the reply is everything after the prompt length. This prompt has no
# "### Response:" marker to split on, hence the token-level slice.
prompt_length = input["input_ids"].shape[-1]
answer_only = generator_tokenizer.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
).strip()
print(response_text)

In [None]:
# Print the value currently bound to answer_only.
print(answer_only)

In [None]:
# Question in the "### Instruction / ### Response" layout; the decoding cell
# that follows splits the output on the "### Response:" marker.
prompt = "### Instruction:\nWhat is the capital of South Africa?\n\n### Response:"

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate with the models loaded above (generator_model / generator_tokenizer).
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=500,         # Upper bound on generated tokens
        do_sample=True,             # Sampling enabled...
        temperature=0.01,           # ...but at a very low temperature
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id,
    )

In [None]:
response_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Keep what follows the first "### Response:" marker, up to the next "###".
# partition() gives an empty answer instead of raising IndexError when the
# marker is missing from the decoded text.
answer_only = response_text.partition("### Response:")[2].split("###")[0].strip()
print(response_text)

In [None]:
# Print only the text extracted after the "### Response:" marker.
print(answer_only)

In [None]:
# Question in the "### Instruction / ### Response" layout; the decoding cell
# that follows splits the output on the "### Response:" marker.
prompt = "### Instruction:\nWhy did UPS plane with 3 crew members crashed near the Louisville airport?\n\n### Response:"

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate with the models loaded above (generator_model / generator_tokenizer).
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=500,         # Upper bound on generated tokens
        do_sample=True,             # Sampling enabled...
        temperature=0.01,           # ...but at a very low temperature
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=generator_tokenizer.eos_token_id,
    )

In [None]:
response_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Keep what follows the first "### Response:" marker, up to the next "###".
# partition() gives an empty answer instead of raising IndexError when the
# marker is missing from the decoded text.
answer_only = response_text.partition("### Response:")[2].split("###")[0].strip()
print(answer_only)

In [None]:
# Display the raw decoded string (prompt included) as the cell output.
response_text

In [None]:
# News article text that the summarization prompt further down interpolates.
text = """A UPS MD-11 plane crashed shortly after take-off near the Louisville, Kentucky, airport, according to the Federal Aviation Administration. UPS Flight 2976 crashed 
just after 5 p.m. local time and was headed to Daniel K. Inouye International Airport in Honolulu, according to a statement from the FAA, which is investigating the crash 
along with the National Transportation Safety Board. The NTSB will lead the investigation, the FAA said Tuesday.
Three crewmembers were on the plane, according to a statement from UPS that said in part, “At this time, we have not confirmed any injuries/casualties.”
Louisville Metro Police Department and other agencies are responding to the crash, LMPD said in an X post. Injuries have been reported, police said.
A massive plume of black smoke is rising not far from the tarmac at Louisville Muhammad Ali International Airport, videos from CNN affiliate WAVE show.
Louisville Muhammad Ali International Airport is the worldwide air hub for UPS. The company’s Worldport is more than 5 million square feet where more 
than 12,000 UPS employees process more than two million packages a day, according to the company.
A shelter-in-place has been issued for all locations within 5 miles of the airport, police added.
“LMPD and multiple other agencies are responding to reports of a plan crash near Fern Valley and Grade Lane,” the post said. “Grade lane will be 
closed indefinitely between Stooges and Crittenden.” The McDonnell Douglas MD-11F is a freight transport aircraft manufactured originally by McDonnell 
Douglas and later by Boeing. The aircraft is primarily flown by FedEx Express, Lufthansa Cargo and UPS Airlines for cargo.
The plane also served as a popular wide-bodied passenger airplane after it was first flown in 1990. The aircraft involved in Tuesday’s crash was built in 1991.
As fuel costs increased for the three engine jets many of them were converted to freighters. The plane can take off weighing in at a maximum 633,000 pounds and 
carrying more than 38,000 gallons of fuel, according to Boeing, which bought McDonnell Douglass.
"""


In [None]:
# Number of characters in `text`, shown as the cell output.
len(text)

In [None]:
# Summarization request in the "### Instruction / ### Response" layout; the
# decoding cell that follows splits the output on the "### Response:" marker.
prompt = f"### Instruction:\nClearly summarize the following text in one concise paragraph within 500 words:\n{text}\n### Response:"
input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate with the models loaded above (generator_model / generator_tokenizer).
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,         # Upper bound on generated tokens
        do_sample=True,             # Sampling enabled...
        temperature=0.06,           # ...but at a very low temperature
        top_p=0.9,                  # Nucleus sampling
        eos_token_id=generator_tokenizer.eos_token_id,
        # Passed explicitly, like the other generation cells in this notebook.
        pad_token_id=generator_tokenizer.eos_token_id,
    )

In [None]:
# textwrap is already imported in the notebook's first cell.

# 1. Decode the full sequence (prompt followed by the completion)
raw_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

# 2. Keep only the text between "### Response:" and the next "###" marker.
#    partition() yields an empty answer rather than raising IndexError when
#    the marker is missing.
answer_only = raw_text.partition("### Response:")[2].split("###")[0].strip()

# 3. Wrap the answer to 80 characters per line and print it
formatted_text = textwrap.fill(answer_only, width=80)
print(formatted_text)

In [None]:
# 1. Decode the full sequence (prompt followed by the completion) with the
#    tokenizer that belongs to the generator model
raw_text = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

# 2. Wrap the whole decoded text to 80 characters per line
formatted_text = textwrap.fill(raw_text, width=80)

# 3. Print the wrapped text
print(formatted_text)

## Test

In [None]:
# --- Your Original Text (This is what E5 would find) ---
# Same article as earlier in the notebook, here starting with a newline; the
# rest of this cell encodes it and inserts it into the summarization prompt.
text = """
A UPS MD-11 plane crashed shortly after take-off near the Louisville, Kentucky, airport, according to the Federal Aviation Administration. UPS Flight 2976 crashed 
just after 5 p.m. local time and was headed to Daniel K. Inouye International Airport in Honolulu, according to a statement from the FAA, which is investigating the crash 
along with the National Transportation Safety Board. The NTSB will lead the investigation, the FAA said Tuesday.
Three crewmembers were on the plane, according to a statement from UPS that said in part, “At this time, we have not confirmed any injuries/casualties.”
Louisville Metro Police Department and other agencies are responding to the crash, LMPD said in an X post. Injuries have been reported, police said.
A massive plume of black smoke is rising not far from the tarmac at Louisville Muhammad Ali International Airport, videos from CNN affiliate WAVE show.
Louisville Muhammad Ali International Airport is the worldwide air hub for UPS. The company’s Worldport is more than 5 million square feet where more 
than 12,000 UPS employees process more than two million packages a day, according to the company.
A shelter-in-place has been issued for all locations within 5 miles of the airport, police added.
“LMPD and multiple other agencies are responding to reports of a plan crash near Fern Valley and Grade Lane,” the post said. “Grade lane will be 
closed indefinitely between Stooges and Crittenden.” The McDonnell Douglas MD-11F is a freight transport aircraft manufactured originally by McDonnell 
Douglas and later by Boeing. The aircraft is primarily flown by FedEx Express, Lufthansa Cargo and UPS Airlines for cargo.
The plane also served as a popular wide-bodied passenger airplane after it was first flown in 1990. The aircraft involved in Tuesday’s crash was built in 1991.
As fuel costs increased for the three engine jets many of them were converted to freighters. The plane can take off weighing in at a maximum 633,000 pounds and 
carrying more than 38,000 gallons of fuel, according to Boeing, which bought McDonnell Douglass.
"""

# --- STEP 1: USE THE "RETRIEVER" (E5 Model) ---
# (Simulated) In a real RAG app, E5 would search a database and *find*
# this text chunk. Here the retriever only turns the text into a vector.
# E5 checkpoints expect a role prefix on every input: "passage: " for documents
# to be indexed and "query: " for search queries. Without it the embedding is
# computed on input the model was not trained to see.
vector = retriever_model.encode([f"passage: {text}"])
print(f"\nE5 Model (Retriever) created a vector of shape: {vector.shape}")


# --- STEP 2: USE THE "GENERATOR" (Llama-3.1-Instruct Model) ---
# Build a prompt and ask the GENERATOR to summarize the text that the
# RETRIEVER step above worked on.

# Llama 3.1 chat layout: a blank line separates each role header from the
# message that follows it.
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Clearly summarize the following text in one concise paragraph:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

# The prompt already begins with <|begin_of_text|>, so the tokenizer must not
# prepend a second BOS token (add_special_tokens=False).
input = generator_tokenizer(
    prompt, return_tensors="pt", add_special_tokens=False
).to(DEVICE)

print("\nGenerating summary with Llama-3.1-Instruct...")
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Stop at the tokenizer's end-of-turn token.
        eos_token_id=generator_tokenizer.eos_token_id,
        # Setting pad_token_id on the tokenizer does not reach generate(), so
        # pass it explicitly.
        pad_token_id=generator_tokenizer.pad_token_id,
    )

# 1. Decode only the generated part: generate() returns the prompt tokens
#    followed by the new tokens.
prompt_length = input["input_ids"].shape[-1]
output_token_ids = outputs[0][prompt_length:]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Trim surrounding whitespace
response_only = raw_output.strip()

# 3. Wrap to 80 characters per line and print
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)

In [None]:
# The question you want to ask
question = "What is the capital of United States?"

# Llama 3.1 chat layout: a blank line separates each role header from the
# message that follows it.
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

# The prompt already begins with <|begin_of_text|>, so the tokenizer must not
# prepend a second BOS token (add_special_tokens=False).
input = generator_tokenizer(
    prompt, return_tensors="pt", add_special_tokens=False
).to(DEVICE)

# This cell answers a question, so the progress and result labels say "answer".
print("\nGenerating answer with Llama-3.1-Instruct...")
with torch.no_grad():
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Stop at the tokenizer's end-of-turn token.
        eos_token_id=generator_tokenizer.eos_token_id,
        # Setting pad_token_id on the tokenizer does not reach generate(), so
        # pass it explicitly.
        pad_token_id=generator_tokenizer.pad_token_id,
    )

# 1. Decode only the generated part: generate() returns the prompt tokens
#    followed by the new tokens.
prompt_length = input["input_ids"].shape[-1]
output_token_ids = outputs[0][prompt_length:]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Trim surrounding whitespace
response_only = raw_output.strip()

# 3. Wrap to 80 characters per line and print
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED ANSWER ---")
print(formatted_text)

In [None]:
# Display the decoded generation as the cell output, before strip() and wrapping.
raw_output

### Function to extract text from PDFs

In [None]:
# NOTE(review): extract_text_from_pdfs is defined in a later cell, and PDF_PATH
# is not assigned anywhere in the visible notebook. Confirm both exist before
# this cell runs on a fresh kernel.
pdf = extract_text_from_pdfs(PDF_PATH)

In [None]:
# Take the "text" entry of the first extracted PDF. This rebinds `text`, the
# same name the summarization cells above interpolate into their prompts.
text = (pdf[0]['text'])

In [None]:
# Print the current value of `text` in full.
print(text)

In [None]:
def extract_text_from_pdfs(pdf_folder: str) -> list[dict]:
    """Extract the text of every ``*.pdf`` file directly inside ``pdf_folder``.

    Args:
        pdf_folder: Directory to scan; sub-directories are not searched.

    Returns:
        One ``{"source": <file path as str>, "text": <page texts joined by
        newlines>}`` dict per PDF, ordered by file path. An empty list is
        returned when the folder contains no PDF.
    """
    texts = []
    # glob() yields files in arbitrary, filesystem-dependent order; sorting
    # keeps positional access such as ``result[0]`` stable between runs.
    for pdf_file in sorted(Path(pdf_folder).glob("*.pdf")):
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_file) as doc:
            text = "\n".join(page.get_text() for page in doc)
        texts.append({"source": str(pdf_file), "text": text})
    return texts


def chunk_extracted_text(extracted_data, chunk_size=None, chunk_overlap=None):
    """Split each extracted document into smaller, overlapping text chunks.

    Args:
        extracted_data: Iterable of dicts with "source" and "text" keys.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_size``. When None, the notebook-level CHUNK_SIZE constant
            is used.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``. When None, the notebook-level CHUNK_OVERLAP
            constant is used.

    Returns:
        A flat list of ``{"source": ..., "text": ...}`` dicts, one per chunk,
        in input order. Each chunk keeps the "source" of its document.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE if chunk_size is None else chunk_size,
        chunk_overlap=CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    all_chunks = [
        {"source": item["source"], "text": document.page_content}
        for item in extracted_data
        for document in text_splitter.create_documents([item["text"]])
    ]

    print(f"-> Successfully created {len(all_chunks)} text chunks.")
    return all_chunks





def create_positive_pairs(chunks: list[str]) -> "list[InputExample]":
    """Pair every chunk with the chunk that follows it.

    Args:
        chunks: Chunk texts in document order.

    Returns:
        ``len(chunks) - 1`` InputExample objects, each holding two consecutive
        chunks in ``texts`` with ``label=1.0``. Empty when fewer than two
        chunks are given.
    """
    # Built-in generics are used in the annotations (as in
    # extract_text_from_pdfs), so typing.List is not required.
    return [
        InputExample(texts=[current, following], label=1.0)
        for current, following in zip(chunks, chunks[1:])
    ]

In [None]:
# Extract the text of the PDFs under PDF_PATH.
# NOTE(review): PDF_PATH is not assigned anywhere in the visible notebook —
# confirm where it is defined.
pdf = extract_text_from_pdfs(PDF_PATH)

In [None]:
# Split the extracted PDF texts into chunk dicts ("source" / "text").
chunked_text = chunk_extracted_text(pdf)

In [None]:
# Number of chunks produced, shown as the cell output.
len(chunked_text)

In [None]:
# Inspect the text of the second chunk (index 1).
chunked_text[1]['text']

In [None]:
# create_positive_pairs expects a list of chunk strings, so pass the text of
# each chunk rather than the per-PDF dicts returned by the extraction step.
pairs = create_positive_pairs([chunk["text"] for chunk in chunked_text])

In [None]:
# View the output: one line per pair, showing its two texts and its label.
for pair in pairs:
    print(f"Texts: {pair.texts}, Label: {pair.label}")