In [1]:
import fitz  # PyMuPDF for PDF extraction
from pathlib import Path
import os
import shutil
import random
from tqdm import tqdm
# Imports for LLM
from transformers import AutoTokenizer, AutoModelForCausalLM
# Imports for Text Chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Imports for Contrastive Learning / Fine-Tuning
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
# typing imports
from typing import List

# Local Paths
MODEL_PATH ="/Users/sir/Downloads/HuggingFace/sentence_transformer/intfloat_e5-large-v2"
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.1-8B-Instruct"
MODEL_OUTPUT = "/Users/sir/Desktop/Project/SentenceTransformer/FineTune/intfloat_e5-large-v2-FineTuned"
PDF_PATH = "/Users/sir/Downloads/Data/PDF/test/"
TRAINING_METHOD = 'CONTIGUOUS_CHUNKS'

# Fine-Tuning Hyperparameters
BATCH_SIZE = 16 
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_SEQ_LENGTH = 512

# Chunking Parameters
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")


Using device: mps


In [2]:
# Load Local tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)
model = AutoModelForCausalLM.from_pretrained(LLM_PATH).to(DEVICE)

# Check tokenizer's max length
print("Tokenizer max length:", tokenizer.model_max_length)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Tokenizer max length: 131072


In [3]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [4]:
# question = "What is the capital of France?"
prompt = "### Instruction:\nWhat is the capital of France?\n\n### Response:"
# prompt = f"{question}\nAnswer:"

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

In [5]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

### Instruction:
What is the capital of France?

### Response: 
The capital of France is Paris. 

### Explanation:
This is a simple question that requires a basic knowledge of geography. The correct answer is Paris, which is widely known as the capital of France. The question does not require any complex reasoning or analysis, but rather a recall of a basic fact. 

### Analysis:
This question is a good example of a question that requires recall of a basic fact. It does not require any critical thinking or analysis, but rather a simple recall of a widely known piece of information. This type of question is often used in multiple-choice tests or quizzes to assess a person's knowledge of basic facts. 

### Implications:
This question has implications for how we assess knowledge and understanding. It suggests that recall of basic facts is an important aspect of knowledge, and that it can be assessed through simple questions. It also highlights the importance of having a broad base of knowl

In [6]:
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer_only = response_text.split("### Response:")[1].split("###")[0].strip()
print(answer_only)

The capital of France is Paris.


### Summarization Example

In [7]:
text = """We survey 146 papers analyzing “bias” in
NLP systems, finding that their motivations
are often vague, inconsistent, and lacking
in normative reasoning, despite the fact that
analyzing “bias” is an inherently normative
process. We further find that these papers’
proposed quantitative techniques for measuring
or mitigating “bias” are poorly matched to
their motivations and do not engage with the
relevant literature outside of NLP. Based on
these findings, we describe the beginnings of a
path forward by proposing three recommendations
that should guide work analyzing “bias”
in NLP systems. These recommendations rest
on a greater recognition of the relationships
between language and social hierarchies,
encouraging researchers and practitioners
to articulate their conceptualizations of
“bias”—i.e., what kinds of system behaviors
are harmful, in what ways, to whom, and why,
as well as the normative reasoning underlying
these statements—and to center work around
the lived experiences of members of communities
affected by NLP systems, while interrogating
and reimagining the power relations
between technologists and such communities."""

In [66]:
# question
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Clearly summarize the following text in one concise paragraph:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

In [68]:
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# answer_only = response_text.split("### Response:")[1].split("###")[0].strip()
print(response_text)

user
Clearly summarize the following text in one concise paragraph:

A UPS MD-11 plane crashed shortly after take-off near the Louisville, Kentucky, airport, according to the Federal Aviation Administration. UPS Flight 2976 crashed 
just after 5 p.m. local time and was headed to Daniel K. Inouye International Airport in Honolulu, according to a statement from the FAA, which is investigating the crash 
along with the National Transportation Safety Board. The NTSB will lead the investigation, the FAA said Tuesday.
Three crewmembers were on the plane, according to a statement from UPS that said in part, “At this time, we have not confirmed any injuries/casualties.”
Louisville Metro Police Department and other agencies are responding to the crash, LMPD said in an X post. Injuries have been reported, police said.
A massive plume of black smoke is rising not far from the tarmac at Louisville Muhammad Ali International Airport, videos from CNN affiliate WAVE show.
Louisville Muhammad Ali Inte

In [10]:
print(answer_only)

The authors of this text conducted a survey of 146 papers that analyzed "bias" in NLP systems. They found that the motivations behind these papers were often unclear, inconsistent, and lacking in normative reasoning. The authors argue that analyzing "bias" is inherently a normative process, meaning it involves making value judgments about what is right or wrong. They also found that the proposed methods for measuring and mitigating "bias" in these papers were not well-suited to their goals and did not engage with relevant research outside of NLP. The authors propose three recommendations to guide future research on "bias" in NLP systems, including recognizing the relationship between language and social hierarchies, articulating clear conceptualizations of "bias", and centering research around the experiences of communities affected by NLP systems.


In [None]:
# question
prompt = "### Instruction:\nWhat is the capital of South Africa?\n\n### Response:"
# prompt = f"{question}\nAnswer:"

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

In [12]:
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer_only = response_text.split("### Response:")[1].split("###")[0].strip()
print(response_text)

### Instruction:
What is the capital of South Africa?

### Response: 
The capital of South Africa is Pretoria. However, the country has three capitals: Pretoria (administrative capital), Cape Town (legislative capital), and Bloemfontein (judicial capital). This is a unique arrangement, and it's worth noting that the three cities are not necessarily in close proximity to each other. 

### Explanation:
The question asks for the capital of South Africa, which is a straightforward question. However, the response provides additional information about the country's unique capital arrangement, which is not directly related to the question but provides context and additional knowledge. This type of response is suitable for a conversation or a discussion, but it may not be the best fit for a formal or written response where brevity and directness are valued. 

### Alternative Response:
The capital of South Africa is Pretoria. 

### Explanation:
This response is more concise and directly answers

In [13]:
print(answer_only)

The capital of South Africa is Pretoria. However, the country has three capitals: Pretoria (administrative capital), Cape Town (legislative capital), and Bloemfontein (judicial capital). This is a unique arrangement, and it's worth noting that the three cities are not necessarily in close proximity to each other.


In [16]:
# question
prompt = "### Instruction:\nWhy did UPS plane with 3 crew members crashed near the Louisville airport?\n\n### Response:"
# prompt = f"{question}\nAnswer:"

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

In [17]:
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer_only = response_text.split("### Response:")[1].split("###")[0].strip()
print(answer_only)

I don't have information about a UPS plane crashing near the Louisville airport. However, I can tell you that UPS Flight 1354, a Boeing 745 cargo plane, crashed on August 14, 2013, near Birmingham-Shuttlesworth International Airport in Alabama, killing the two pilots on board. The National Transportation Safety Board (NTSB) investigation found that the probable cause of the accident was the pilots' failure to follow standard operating procedures and the company's inadequate safety management system. The investigation also found that the pilots were not adequately trained to handle the aircraft's systems and that the company's safety culture was inadequate. The NTSB made several recommendations to UPS and the Federal Aviation Administration (FAA) to improve safety procedures and training. If you are looking for information about a different incident, please provide more details or context. I'll do my best to help.


In [18]:
response_text

"### Instruction:\nWhy did UPS plane with 3 crew members crashed near the Louisville airport?\n\n### Response: \nI don't have information about a UPS plane crashing near the Louisville airport. However, I can tell you that UPS Flight 1354, a Boeing 745 cargo plane, crashed on August 14, 2013, near Birmingham-Shuttlesworth International Airport in Alabama, killing the two pilots on board. The National Transportation Safety Board (NTSB) investigation found that the probable cause of the accident was the pilots' failure to follow standard operating procedures and the company's inadequate safety management system. The investigation also found that the pilots were not adequately trained to handle the aircraft's systems and that the company's safety culture was inadequate. The NTSB made several recommendations to UPS and the Federal Aviation Administration (FAA) to improve safety procedures and training. If you are looking for information about a different incident, please provide more detai

In [22]:
text = """A UPS MD-11 plane crashed shortly after take-off near the Louisville, Kentucky, airport, according to the Federal Aviation Administration. UPS Flight 2976 crashed 
just after 5 p.m. local time and was headed to Daniel K. Inouye International Airport in Honolulu, according to a statement from the FAA, which is investigating the crash 
along with the National Transportation Safety Board. The NTSB will lead the investigation, the FAA said Tuesday.
Three crewmembers were on the plane, according to a statement from UPS that said in part, “At this time, we have not confirmed any injuries/casualties.”
Louisville Metro Police Department and other agencies are responding to the crash, LMPD said in an X post. Injuries have been reported, police said.
A massive plume of black smoke is rising not far from the tarmac at Louisville Muhammad Ali International Airport, videos from CNN affiliate WAVE show.
Louisville Muhammad Ali International Airport is the worldwide air hub for UPS. The company’s Worldport is more than 5 million square feet where more 
than 12,000 UPS employees process more than two million packages a day, according to the company.
A shelter-in-place has been issued for all locations within 5 miles of the airport, police added.
“LMPD and multiple other agencies are responding to reports of a plan crash near Fern Valley and Grade Lane,” the post said. “Grade lane will be 
closed indefinitely between Stooges and Crittenden.” The McDonnell Douglas MD-11F is a freight transport aircraft manufactured originally by McDonnell 
Douglas and later by Boeing. The aircraft is primarily flown by FedEx Express, Lufthansa Cargo and UPS Airlines for cargo.
The plane also served as a popular wide-bodied passenger airplane after it was first flown in 1990. The aircraft involved in Tuesday’s crash was built in 1991.
As fuel costs increased for the three engine jets many of them were converted to freighters. The plane can take off weighing in at a maximum 633,000 pounds and 
carrying more than 38,000 gallons of fuel, according to Boeing, which bought McDonnell Douglass.
"""


In [26]:
len(text)

2089

In [69]:
# question
prompt = f"### Instruction:\nClearly summarize the following text in one concise paragraph within 500 words:\n{text}\n### Response:"
input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=750,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.06,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        eos_token_id=tokenizer.eos_token_id
    )

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [64]:
import textwrap

# 1. Decode the raw text
raw_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 2. Wrap the text to 80 characters per line
answer_only = raw_text.split("### Response:")[1].split("###")[0].strip()
formatted_text = textwrap.fill(answer_only, width=80)

# 3. Print the perfectly formatted output
print(formatted_text)

A UPS MD-11 plane crashed shortly after takeoff from Louisville Muhammad Ali
International Airport, killing three crew members. The plane, which was headed
to Honolulu, crashed around 5 p.m. local time, and a massive plume of black
smoke was seen rising from the scene. The National Transportation Safety Board
is leading the investigation, which is being assisted by the Federal Aviation
Administration. The Louisville Metro Police Department has issued a shelter-in-
place order for areas within 5 miles of the airport, and injuries have been
reported. The plane, a McDonnell Douglas MD-11F, was built in 1991 and was
converted from a passenger plane to a freighter due to increasing fuel costs.
The aircraft was a popular wide-bodied passenger plane in the 1990s but is now
primarily used for cargo transport by companies such as UPS, FedEx, and
Lufthansa Cargo. The crash is a significant incident for the airport, which is
the worldwide air hub for UPS and processes over 2 million packages per 

In [65]:
# 1. Decode the raw text
raw_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 2. Wrap the text to 80 characters per line
formatted_text = textwrap.fill(raw_text, width=80)

# 3. Print the perfectly formatted output
print(formatted_text)

### Instruction: Clearly summarize the following text in one concise paragraph
within 500 words: A UPS MD-11 plane crashed shortly after take-off near the
Louisville, Kentucky, airport, according to the Federal Aviation Administration.
UPS Flight 2976 crashed  just after 5 p.m. local time and was headed to Daniel
K. Inouye International Airport in Honolulu, according to a statement from the
FAA, which is investigating the crash  along with the National Transportation
Safety Board. The NTSB will lead the investigation, the FAA said Tuesday. Three
crewmembers were on the plane, according to a statement from UPS that said in
part, “At this time, we have not confirmed any injuries/casualties.” Louisville
Metro Police Department and other agencies are responding to the crash, LMPD
said in an X post. Injuries have been reported, police said. A massive plume of
black smoke is rising not far from the tarmac at Louisville Muhammad Ali
International Airport, videos from CNN affiliate WAVE show. 

## Test

In [None]:
import torch
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# --- SETUP ---
# Use your Mac M3's GPU (Metal Performance Shaders)
DEVICE = "mps" 

# --- MODEL 1: THE "GENERATOR" (Llama 3.1 for summarizing) ---
# This is the model you specified
generator_model_id = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.1-8B-Instruct"
print(f"Loading Generator: {generator_model_id}")

# Load the model that can READ and WRITE
# Using bfloat16 for better performance on M-series chips
generator_model = AutoModelForCausalLM.from_pretrained(
    generator_model_id, 
    device_map=DEVICE, # Automatically map to your M3 GPU
    torch_dtype=torch.bfloat16, # Use bfloat16 for M3
    trust_remote_code=True
)
generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_id)

# --- MODEL 2: THE "RETRIEVER" (E5 for searching) ---
# This is your E5 model.
retriever_model_id ="/Users/sir/Downloads/HuggingFace/sentence_transformer/intfloat_e5-large-v2"
print(f"Loading Retriever: {retriever_model_id}")

# Load the model that can SEARCH
retriever_model = SentenceTransformer(retriever_model_id, device=DEVICE)
print("\n--- Models Loaded Successfully ---")


# --- Your Original Text (This is what E5 would find) ---
text = """
A UPS MD-11 plane crashed shortly after take-off near the Louisville, Kentucky, airport, according to the Federal Aviation Administration. UPS Flight 2976 crashed 
just after 5 p.m. local time and was headed to Daniel K. Inouye International Airport in Honolulu, according to a statement from the FAA, which is investigating the crash 
along with the National Transportation Safety Board. The NTSB will lead the investigation, the FAA said Tuesday.
Three crewmembers were on the plane, according to a statement from UPS that said in part, “At this time, we have not confirmed any injuries/casualties.”
Louisville Metro Police Department and other agencies are responding to the crash, LMPD said in an X post. Injuries have been reported, police said.
A massive plume of black smoke is rising not far from the tarmac at Louisville Muhammad Ali International Airport, videos from CNN affiliate WAVE show.
Louisville Muhammad Ali International Airport is the worldwide air hub for UPS. The company’s Worldport is more than 5 million square feet where more 
than 12,000 UPS employees process more than two million packages a day, according to the company.
A shelter-in-place has been issued for all locations within 5 miles of the airport, police added.
“LMPD and multiple other agencies are responding to reports of a plan crash near Fern Valley and Grade Lane,” the post said. “Grade lane will be 
closed indefinitely between Stooges and Crittenden.” The McDonnell Douglas MD-11F is a freight transport aircraft manufactured originally by McDonnell 
Douglas and later by Boeing. The aircraft is primarily flown by FedEx Express, Lufthansa Cargo and UPS Airlines for cargo.
The plane also served as a popular wide-bodied passenger airplane after it was first flown in 1990. The aircraft involved in Tuesday’s crash was built in 1991.
As fuel costs increased for the three engine jets many of them were converted to freighters. The plane can take off weighing in at a maximum 633,000 pounds and 
carrying more than 38,000 gallons of fuel, according to Boeing, which bought McDonnell Douglass.
"""

# --- STEP 1: USE THE "RETRIEVER" (E5 Model) ---
# (Simulated) In a real RAG app, E5 would search a database and *find* 
# this text chunk.
# E5's job is to make a vector:
vector = retriever_model.encode([text])
print(f"\nE5 Model (Retriever) created a vector of shape: {vector.shape}")


# --- STEP 2: USE THE "GENERATOR" (Llama-3.1-Instruct Model) ---
# Now, we build a prompt and ask the GENERATOR to summarize the text
# that the RETRIEVER found.

# CRITICAL: Use the Llama 3.1 prompt format
prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Clearly summarize the following text in one concise paragraph:

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Tokenize the prompt for the GENERATOR
# We must set pad_token_id to eos_token_id for Llama 3
if generator_tokenizer.pad_token_id is None:
    generator_tokenizer.pad_token_id = generator_tokenizer.eos_token_id

input = generator_tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate text using the GENERATOR model
print("\nGenerating summary with Llama-3.1-Instruct...")
with torch.no_grad():
    # We must also pass the eos_token_id to stop generation
    outputs = generator_model.generate(
        **input,
        max_new_tokens=750,        
        do_sample=True,
        temperature=0.7,         # A good temperature for creative summary
        top_p=0.9,
        # Llama 3.1 uses <|eot_id|> as its end token
        eos_token_id=generator_tokenizer.eos_token_id
    )

# 1. Decode the raw text
# We only want the generated part, not the input prompt
output_token_ids = outputs[0][len(input['input_ids'][0]):]
raw_output = generator_tokenizer.decode(output_token_ids, skip_special_tokens=True)

# 2. Clean the output (it might have extra spaces)
response_only = raw_output.strip()

# 3. Wrap and print the final, correct summary
formatted_text = textwrap.fill(response_only, width=80)
print("\n--- GENERATED SUMMARY ---")
print(formatted_text)

### Function to extract text from PDFs

In [None]:
pdf = extract_text_from_pdfs(PDF_PATH)

In [None]:
text = (pdf[0]['text'])

In [None]:
print(text)

In [None]:
def extract_text_from_pdfs(pdf_folder: str) -> list[dict]:
    texts = []
    for pdf_file in Path(pdf_folder).glob("*.pdf"):
        doc = fitz.open(pdf_file)
        text = "\n".join(page.get_text() for page in doc)
        texts.append({"source": str(pdf_file), "text": text})
    return texts


def chunk_extracted_text(extracted_data):
    """Splits large texts into smaller, overlapping chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", " ", ""]
    )

    all_chunks = []
    for item in extracted_data:
        source_text = item["text"]
        chunks = text_splitter.create_documents([source_text])
        
        for chunk in chunks:
             all_chunks.append({
                 "source": item["source"],
                 "text": chunk.page_content
             })
             
    print(f"-> Successfully created {len(all_chunks)} text chunks.")
    return all_chunks





def create_positive_pairs(chunks: List[str]) -> List[InputExample]:
    return [
        InputExample(texts=[chunks[i], chunks[i+1]], label=1.0)
        for i in range(len(chunks) - 1)
    ]

In [None]:
pdf = extract_text_from_pdfs(PDF_PATH)

In [None]:
chunked_text = chunk_extracted_text(pdf)

In [None]:
len(chunked_text)

In [None]:
chunked_text[1]['text']

In [None]:
pairs = create_positive_pairs(pdf)

In [None]:
# View the output
for pair in pairs:
    print(f"Texts: {pair.texts}, Label: {pair.label}")