In [2]:
import torch 
import torch.nn as nn 
import math

In [3]:
class InputEmbedding(nn.Module):
    def __init__(self,d_model:int,vocab_size:int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size,d_model)
    
    def forward(self,x):
        self.embedding(x) * math.sqrt(self.d_model)

In [4]:
class LayerNormalization(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features)) # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
         # Keep the dimension for broadcasting
        mean = x.mean(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim = -1, keepdim = True) # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias


In [9]:
%pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate>=0.26.0
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.2.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import torch
import pdfplumber
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    with pdfplumber.open(pdf_path) as pdf:
        text = "".join([page.extract_text() for page in pdf.pages])
    return text

def chunk_text(text, max_chunk_size=512):
    """Splits text into smaller chunks fitting the model's token limit."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    
    for word in words:
        if current_length + len(word) + 1 > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += len(word) + 1
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def prepare_dataset(chunks):
    """Creates a dataset from text chunks."""
    return Dataset.from_dict({"text": chunks})

def fine_tune_model(pdf_path, model_name="gpt2", output_dir="fine_tuned_model"):
    """Fine-tunes a model on text extracted from a PDF."""
    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    
    print("Chunking text...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Set pad_token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    chunks = chunk_text(text, max_chunk_size=tokenizer.model_max_length)
    
    print("Preparing dataset...")
    dataset = prepare_dataset(chunks)
    
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt"
        )
    
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )
    
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # Explicitly set to CPU
    device = torch.device("cpu")
    print("Using CPU device")
    model.to(device)
    
    print("Setting up training arguments...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        save_steps=100,
        save_total_limit=2,
        logging_dir=f"{output_dir}/logs",
        logging_steps=10,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=100,
        # Explicitly set no_cuda to True to prevent GPU usage
        no_cuda=True,
        # Remove all mixed precision settings
        fp16=None,
        fp16_opt_level=None,
        fp16_backend=None,
        fp16_full_eval=None,
        bf16=None,
        tf32=None,
        push_to_hub=False,
        max_grad_norm=0.5,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        dataloader_pin_memory=False
    )
    
    print("Initializing trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
    )
    
    print("Fine-tuning the model...")
    trainer.train()
    
    print("Saving the model...")
    trainer.save_model(output_dir)
    print(f"Model saved to {output_dir}")

if __name__ == "__main__":
    pdf_path = "/Users/ripeshghimire/coding/ResearchPaper/Andhra_Pradesh.pdf"
    model_name = "gpt2"
    output_dir = "fine_tuned_model"
    
    if not os.path.exists(pdf_path):
        print(f"Error: File {pdf_path} does not exist.")
    else:
        fine_tune_model(pdf_path, model_name=model_name, output_dir=output_dir)

Extracting text from PDF...
Chunking text...
Preparing dataset...


Map:   0%|          | 0/768 [00:00<?, ? examples/s]

Loading model...
Using CPU device
Setting up training arguments...
Initializing trainer...
Fine-tuning the model...


  trainer = Trainer(


  0%|          | 0/144 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.