In [1]:
import fitz  # PyMuPDF for PDF extraction
from pathlib import Path
import os
import shutil
import random
from tqdm import tqdm
# Imports for LLM
from transformers import AutoTokenizer, AutoModelForCausalLM
# Imports for Text Chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Imports for Contrastive Learning / Fine-Tuning
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
# typing imports
from typing import List

# Local Paths
MODEL_PATH ="/Users/sir/Downloads/HuggingFace/sentence_transformer/intfloat_e5-large-v2"
LLM_PATH = "/Users/sir/Downloads/HuggingFace/LLM/meta-Llama-3.1-8B-Instruct"
MODEL_OUTPUT = "/Users/sir/Desktop/Project/SentenceTransformer/FineTune/intfloat_e5-large-v2-FineTuned"
PDF_PATH = "/Users/sir/Downloads/Data/PDF/test/"
TRAINING_METHOD = 'CONTIGUOUS_CHUNKS'

# Fine-Tuning Hyperparameters
BATCH_SIZE = 16 
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_SEQ_LENGTH = 512

# Chunking Parameters
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")


Using device: mps


In [29]:
# Load Local tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)
model = AutoModelForCausalLM.from_pretrained(LLM_PATH).to(DEVICE)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [33]:
# Check tokenizer's max length
print("Tokenizer max length:", tokenizer.model_max_length)

Tokenizer max length: 131072


In [34]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [46]:
# question = "What is the capital of France?"
prompt = "### Instruction:\nWhat is the capital of France?\n\n### Response:"
# prompt = f"{question}\nAnswer:"

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

In [None]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

### Instruction:
What is the capital of France?

### Response: 
The capital of France is Paris. 

### Explanation:
This is a simple question that requires a basic knowledge of geography. The correct answer is Paris, which is widely known as the capital of France. The question does not require any complex reasoning or analysis, but rather a recall of a basic fact. 

### Analysis:
This question is a good example of a question that requires recall of a basic fact. It does not require any critical thinking or analysis, but rather a simple recall of a widely known piece of information. This type of question is often used in multiple-choice tests or quizzes to assess a person's knowledge of basic facts. 

### Implications:
This question has implications for how we assess knowledge and understanding. It suggests that recall of basic facts is an important aspect of knowledge, and that it can be assessed through simple questions. It also highlights the importance of having a broad base of knowl

In [49]:
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer_only = response_text.split("### Response:")[1].split("###")[0].strip()
print(answer_only)

The capital of France is Paris.


In [None]:
# use prompt to ask question
prompt = "### Instruction:\nWhat is the capital of France?\n\n### Response:"

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

In [None]:
text = """We survey 146 papers analyzing “bias” in
NLP systems, finding that their motivations
are often vague, inconsistent, and lacking
in normative reasoning, despite the fact that
analyzing “bias” is an inherently normative
process. We further find that these papers’
proposed quantitative techniques for measuring
or mitigating “bias” are poorly matched to
their motivations and do not engage with the
relevant literature outside of NLP. Based on
these findings, we describe the beginnings of a
path forward by proposing three recommendations
that should guide work analyzing “bias”
in NLP systems. These recommendations rest
on a greater recognition of the relationships
between language and social hierarchies,
encouraging researchers and practitioners
to articulate their conceptualizations of
“bias”—i.e., what kinds of system behaviors
are harmful, in what ways, to whom, and why,
as well as the normative reasoning underlying
these statements—and to center work around
the lived experiences of members of communities
affected by NLP systems, while interrogating
and reimagining the power relations
between technologists and such communities."""

In [None]:
# question = "What is the capital of France?"
prompt = "### Instruction:\nSummarize the following text:\n\n### Response:"
# prompt = f"{question}\nAnswer:"

input = tokenizer(prompt, return_tensors="pt").to(DEVICE)


# Generate text
with torch.no_grad():
    outputs = model.generate(
        **input,
        max_new_tokens=500,         # Number of tokens to generate
        do_sample=True,             # Enable sampling for more natural output
        temperature=0.01,            # Controls randomness
        top_p=0.9,                  # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id
    )

### Function to extract text from PDFs

In [8]:
pdf = extract_text_from_pdfs(PDF_PATH)

In [11]:
text = (pdf[0]['text'])

In [28]:
print(text)

Language (Technology) is Power: A Critical Survey of “Bias” in NLP
Su Lin Blodgett
College of Information and Computer Sciences
University of Massachusetts Amherst
blodgett@cs.umass.edu
Solon Barocas
Microsoft Research
Cornell University
solon@microsoft.com
Hal Daumé III
Microsoft Research
University of Maryland
me@hal3.name
Hanna Wallach
Microsoft Research
wallach@microsoft.com
Abstract
We survey 146 papers analyzing “bias” in
NLP systems, ﬁnding that their motivations
are often vague, inconsistent, and lacking
in normative reasoning, despite the fact that
analyzing “bias” is an inherently normative
process.
We further ﬁnd that these papers’
proposed quantitative techniques for measur-
ing or mitigating “bias” are poorly matched to
their motivations and do not engage with the
relevant literature outside of NLP. Based on
these ﬁndings, we describe the beginnings of a
path forward by proposing three recommenda-
tions that should guide work analyzing “bias”
in NLP systems. These recommen

In [4]:
def extract_text_from_pdfs(pdf_folder: str) -> list[dict]:
    texts = []
    for pdf_file in Path(pdf_folder).glob("*.pdf"):
        doc = fitz.open(pdf_file)
        text = "\n".join(page.get_text() for page in doc)
        texts.append({"source": str(pdf_file), "text": text})
    return texts


def chunk_extracted_text(extracted_data):
    """Splits large texts into smaller, overlapping chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", " ", ""]
    )

    all_chunks = []
    for item in extracted_data:
        source_text = item["text"]
        chunks = text_splitter.create_documents([source_text])
        
        for chunk in chunks:
             all_chunks.append({
                 "source": item["source"],
                 "text": chunk.page_content
             })
             
    print(f"-> Successfully created {len(all_chunks)} text chunks.")
    return all_chunks





def create_positive_pairs(chunks: List[str]) -> List[InputExample]:
    return [
        InputExample(texts=[chunks[i], chunks[i+1]], label=1.0)
        for i in range(len(chunks) - 1)
    ]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [45]:
pdf = extract_text_from_pdfs(PDF_PATH)

In [46]:
chunked_text = chunk_extracted_text(pdf)

-> Successfully created 242 text chunks.


In [49]:
len(chunked_text)

242

In [50]:
chunked_text[1]['text']

'NLP systems, ﬁnding that their motivations\nare often vague, inconsistent, and lacking\nin normative reasoning, despite the fact that\nanalyzing “bias” is an inherently normative\nprocess.\nWe further ﬁnd that these papers’\nproposed quantitative techniques for measur-\ning or mitigating “bias” are poorly matched to\ntheir motivations and do not engage with the\nrelevant literature outside of NLP. Based on\nthese ﬁndings, we describe the beginnings of a\npath forward by proposing three recommenda-'

In [31]:
pairs = create_positive_pairs(pdf)

In [32]:
# View the output
for pair in pairs:
    print(f"Texts: {pair.texts}, Label: {pair.label}")

Texts: [{'source': '/Users/sir/Downloads/Data/PDF/Combining-image--voice--and-the-patient-s-questionna_2010_Artificial-Intelli.pdf', 'text': 'Combining image, voice, and the patient’s questionnaire data to categorize\nlaryngeal disorders\nAntanas Verikas a,b,*, Adas Gelzinis a, Marija Bacauskiene a, Magnus Ha˚ llander b, Virgilijus Uloza c,\nMarius Kaseta c\na Department of Electrical & Control Equipment, Kaunas University of Technology, Studentu 50, LT-51368, Kaunas, Lithuania\nb Intelligent Systems Laboratory, Halmstad University, Box 823, S 301 18 Halmstad, Sweden\nc Department of Otolaryngology, Kaunas University of Medicine, Eiveniu 2, LT-50009 Kaunas, Lithuania\n1. Introduction\nIn clinical practice, the diagnostic procedure of laryngeal\ndiseases is based on evaluation of patient’s complaints, history,\nand data of instrumental as well as histological examination.\nDuring the last years a variety of techniques for examination of the\nlarynx and objective measurements of voice qu