In [37]:
import dataclasses
import os
import re

import fitz
import mysql.connector
from datasets import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

In [52]:
def get_pdf_paths():
    """Fetch every PDF file path recorded in the ``pdf_files`` table.

    Connection settings are read from the environment variables
    ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD`` and ``MYSQL_DATABASE``.
    When a variable is unset, the value this notebook previously hard-coded is
    used instead, so existing setups keep working unchanged.

    The paths are also printed, one per line, for a quick visual check.

    Returns:
        list: The ``file_path`` value of each row, in the order the query
        returned them.
    """
    # SECURITY NOTE: the fallbacks below are the credentials that used to be
    # embedded directly in this cell. Prefer exporting the environment
    # variables and removing the literal password from the notebook.
    conn = mysql.connector.connect(
        host=os.environ.get("MYSQL_HOST", "localhost"),
        user=os.environ.get("MYSQL_USER", "root"),
        password=os.environ.get("MYSQL_PASSWORD", "RootUser@123"),
        database=os.environ.get("MYSQL_DATABASE", "pdf_database"),
    )
    # try/finally guarantees the cursor and connection are released even when
    # the query raises (e.g. the table does not exist).
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT file_path FROM pdf_files")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Each fetched row is a sequence whose first element is the file path.
    pdf_paths = [row[0] for row in rows]

    print("PDF paths obtained from the database:")
    for path in pdf_paths:
        print(path)
    return pdf_paths


In [55]:
def read_pdfs(pdf_paths):
    """Extract and concatenate the plain text of every page of every PDF.

    Args:
        pdf_paths: Sized iterable of paths that ``fitz.open`` can open.

    Returns:
        str: The text of all pages, joined with no separator, in the order
        the files and their pages were visited. An empty input yields ``""``.
    """
    page_texts = []
    print(f"Reading {len(pdf_paths)} PDF(s)...")
    for pdf_path in pdf_paths:
        print(f"Reading PDF file: {pdf_path}")
        # The context manager closes the document (and its file handle) even
        # if text extraction fails part-way through.
        with fitz.open(pdf_path) as doc:
            page_texts.extend(page.get_text("text") for page in doc)
    print("Finished reading PDFs.")
    # Join once at the end instead of repeated `+=`, which re-copies the
    # growing string on every page.
    return "".join(page_texts)


In [56]:
def clean_text(all_text):
    """Remove boilerplate markers from extracted text and trim both ends.

    Every whole-word occurrence of ``page <digits>``, ``footnote`` or
    ``Header`` is deleted. Matching is case-sensitive, and whitespace that
    surrounded a removed marker is left in place.

    Args:
        all_text: The raw text to clean.

    Returns:
        str: The text with the markers removed and leading/trailing
        whitespace stripped.
    """
    print("Cleaning the extracted text...")
    boilerplate = re.compile(r"\b(page \d+|footnote|Header)\b")
    cleaned = boilerplate.sub("", all_text).strip()
    print("Text cleaning complete.")
    return cleaned


In [41]:
def prepare_dataset(cleaned_text):
    """Wrap the cleaned corpus in a single-row ``Dataset``.

    Args:
        cleaned_text: The text to store.

    Returns:
        Dataset: A dataset with one ``text`` column holding exactly one
        entry, ``cleaned_text``.
    """
    columns = {"text": [cleaned_text]}
    return Dataset.from_dict(columns)

In [42]:
def tokenize_dataset(dataset, tokenizer):
    """Tokenize the ``text`` column of ``dataset`` in batches.

    Args:
        dataset: Object exposing ``map(function, batched=...)``; each batch
            handed to the function must provide a ``"text"`` entry.
        tokenizer: Callable applied to each batch's texts with
            ``return_tensors="pt"``, ``padding=True`` and ``truncation=True``.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    tokenizer_options = {"return_tensors": "pt", "padding": True, "truncation": True}

    def _encode_batch(batch):
        # Forward only the raw texts; the options are fixed above.
        return tokenizer(batch["text"], **tokenizer_options)

    return dataset.map(_encode_batch, batched=True)

In [43]:
def add_labels(examples):
    """Add a ``labels`` entry that mirrors ``input_ids``.

    The mapping is modified in place and ``labels`` refers to the very same
    object as ``input_ids`` (no copy is made).

    Args:
        examples: Mutable mapping containing an ``"input_ids"`` entry.

    Returns:
        The same ``examples`` mapping, now with a ``"labels"`` entry.
    """
    token_ids = examples["input_ids"]
    examples["labels"] = token_ids
    return examples


In [44]:
def load_model_and_tokenizer():
    """Load the pretrained ``gpt2`` language model and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` where the tokenizer's ``pad_token`` has
        been set to its ``eos_token``.
    """
    checkpoint = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(checkpoint)
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    # Reuse the end-of-sequence token for padding (presumably because this
    # tokenizer does not define a pad token of its own -- confirm).
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


In [50]:
def train_model(tokenized_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``tokenized_dataset`` for one epoch.

    The same dataset is used for training and for the per-epoch evaluation.
    Checkpoints go to ``./gpt2_finetuned`` and logs to ``./logs``.

    Args:
        tokenized_dataset: Tokenized dataset to train and evaluate on.
        model: The model to train.
        tokenizer: Tokenizer matching ``model``. Used to build a causal-LM
            data collator when the dataset carries no ``labels`` column.

    Returns:
        Trainer: The trainer after ``train()`` has completed, so callers can
        evaluate or save through it. Earlier versions returned ``None``;
        callers that ignore the return value are unaffected.
    """
    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; pick whichever keyword this installation declares so
    # the call works on both sides of the rename.
    declared_fields = {field.name for field in dataclasses.fields(TrainingArguments)}
    if "eval_strategy" in declared_fields:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./gpt2_finetuned",          # Output directory
        num_train_epochs=1,                     # Number of training epochs
        per_device_train_batch_size=2,          # Batch size per device during training
        per_device_eval_batch_size=2,           # Batch size for evaluation
        warmup_steps=500,                       # Number of warmup steps for learning rate scheduler
        weight_decay=0.01,                      # Strength of weight decay
        logging_dir="./logs",                   # Directory for storing logs
        logging_steps=10,
        **{eval_strategy_key: "epoch"},         # Evaluate at the end of every epoch
    )

    # Without labels the model cannot return a loss. If the caller has not
    # added a `labels` column (e.g. via add_labels), let the collator derive
    # labels from `input_ids`. When labels are present, keep the Trainer's
    # default collation exactly as before.
    column_names = getattr(tokenized_dataset, "column_names", None) or []
    data_collator = None
    if "labels" not in column_names:
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,                            # The model to be trained
        args=training_args,                     # Training arguments
        train_dataset=tokenized_dataset,        # Training dataset
        eval_dataset=tokenized_dataset,         # Evaluation dataset
        data_collator=data_collator,            # None -> Trainer's default collator
    )

    trainer.train()
    return trainer


In [None]:
# After the model has been fine-tuned, save it along with the tokenizer.
# Both are written to the same "./gpt2_finetuned" directory.
# NOTE(review): this cell reads the module-level names `model` and `tokenizer`;
# no cell visible here assigns them at top level, so confirm they are created
# (e.g. via load_model_and_tokenizer()) before this runs on a fresh kernel.
model.save_pretrained("./gpt2_finetuned")
tokenizer.save_pretrained("./gpt2_finetuned")