## Import Libraries

In [5]:
import os
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, get_scheduler
import torch

In [6]:
# Load the XSum dataset
from datasets import load_dataset

ds = load_dataset("EdinburghNLP/xsum")


# Split the dataset into training and evaluation sets
train_dataset = ds['train']
eval_dataset = ds['test']


In [7]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the pre-trained PEGASUS model and tokenizer
model_name = "google/pegasus-large"  # You can also try other PEGASUS variants like "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

pytorch_model.bin:  95%|#########4| 2.16G/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"device : {device}")

device : cuda


In [9]:
def preprocess_data(examples):
    # Tokenize the inputs (documents)
    model_inputs = tokenizer(examples['document'], max_length=1024, truncation=True)

    # Setup the tokenizer for the labels (summaries)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Preprocess the datasets
train_dataset = train_dataset.map(preprocess_data, batched=True)
eval_dataset = eval_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]



Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [17]:
# Define training arguments with evaluation every few steps
training_args = TrainingArguments(
    output_dir="./model_checkpoints",
    evaluation_strategy="steps",  # Change to steps
    eval_steps=1000,  # Evaluate every 1000 steps
    save_strategy="steps",
    save_steps=1000,  # Save every 1000 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
# Set up early stopping
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)  # Stop after 2 evaluations without improvement

# Define a learning rate scheduler
def get_lr_scheduler(optimizer, num_warmup_steps, num_training_steps):
    return get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )


In [None]:
# Create Trainer with callbacks
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping_callback],
)

# Train the model
trainer.train()


In [None]:
# Save the model after training
model.save_pretrained("./pegasus_finetuned")
tokenizer.save_pretrained("./pegasus_finetuned")

# Last

In [18]:
import fitz  # PyMuPDF for PDF handling
import re
import string
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from datasets import load_dataset

# Download required NLTK resources
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
def load_newspaper_dataset(train_data,eval_data):
    # dataset = load_dataset("cnn_dailymail", "3.0.0")  # Load the dataset
    # Shuffle the data and split it into 90% train and 10% eval
    # train_data = dataset['train'].shuffle(seed=42).select(range(27000))  # Select 27,000 samples for training
    # eval_data = dataset['train'].shuffle(seed=42).select(range(27000, 30000))  # Select 3,000 samples for evaluation
    return train_data, eval_data

In [None]:
# Fine-tune a summarization model using the dataset
def train_summarization_model(train_data, eval_data, save_model_path):
    model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')
    tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')

    def tokenize_data(examples):
        inputs = examples['article']
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

        # Tokenize labels
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples['highlights'], max_length=150, truncation=True, padding="max_length")

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_train_data = train_data.map(tokenize_data, batched=True)
    tokenized_eval_data = eval_data.map(tokenize_data, batched=True)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        num_train_epochs=1,
        per_device_train_batch_size=2,  # Adjust batch size to fit your GPU memory
        per_device_eval_batch_size=2,
        save_steps=500,
        save_total_limit=2,
        remove_unused_columns=False,
        eval_steps=500,  # Evaluate every 500 steps
        push_to_hub=False,
        fp16=True,  # Use 16-bit precision to speed up training
        logging_dir="./logs",  # Directory for storing logs
        logging_steps=100,  # Log every 100 steps
    )

    # Create Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_data,
        eval_dataset=tokenized_eval_data,  # Include eval_dataset
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer at the specified path
    model.save_pretrained(save_model_path)
    tokenizer.save_pretrained(save_model_path)

    return model, tokenizer

# Summarize text using the fine-tuned model
# def summarize_with_model(model, tokenizer, text, max_length=150, min_length=30):
#     inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
#     summary_ids = model.generate(inputs["input_ids"], max_length=max_length, min_length=min_length, num_beams=4, early_stopping=True)
#     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#     return summary

# Example usage
pdf_path = "output_cleaned.pdf"  # Path to the cleaned PDF
save_model_path = "/content/drive/MyDrive/Project-I-Model/Facebook-Bert"  # Path where the model will be saved

# Step 1: Extract and clean text from the PDF
extracted_text = extract_and_clean_pdf_text(pdf_path)
print("Extracted Text from PDF:\n", extracted_text[:500])  # Print first 500 characters for sanity check

# Step 2: Load the small newspaper dataset (CNN/DailyMail)
train_data, eval_data = load_newspaper_dataset(train_dataset,val_dataset)

# Step 3: Fine-tune the summarization model and save it at the specified path
model, tokenizer = train_summarization_model(train_data, eval_data, save_model_path)

# # Step 4: Summarize the extracted PDF text using the fine-tuned model
# summary = summarize_with_model(model, tokenizer, extracted_text)
# print("Summary of PDF Text:\n", summary)


In [None]:
summary = summarize_with_model(model, tokenizer, extracted_text)
print("Summary of PDF Text:\n", summary)

# Test

In [20]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

In [21]:
# Load the model and tokenizer from the local path
local_model_path = "Facebook-Bert-20241016T112414Z-001/Facebook-Bert"  # e.g., "./pegasus_finetuned" or wherever your model is saved
tokenizer = BartTokenizer.from_pretrained(local_model_path)
model = BartForConditionalGeneration.from_pretrained(local_model_path)


In [22]:
import fitz  # PyMuPDF

# Path to your PDF file
pdf_path = r"C:\Users\praye\Downloads\bayesian_optimization.pdf"

# Initialize an empty string to hold the extracted text
extracted_text = ""

# Open the PDF file
with fitz.open(pdf_path) as pdf:
    for page in pdf:
        extracted_text += page.get_text() + "\n"  # Append text from each page

print("Extracted Text:", extracted_text)

Extracted Text: A Tutorial on Bayesian Optimization
Peter I. Frazier
July 10, 2018
Abstract
Bayesian optimization is an approach to optimizing objective functions that take a long time (min-
utes or hours) to evaluate. It is best-suited for optimization over continuous domains of less than 20
dimensions, and tolerates stochastic noise in function evaluations. It builds a surrogate for the objective
and quantiﬁes the uncertainty in that surrogate using a Bayesian machine learning technique, Gaussian
process regression, and then uses an acquisition function deﬁned from this surrogate to decide where to
sample. In this tutorial, we describe how Bayesian optimization works, including Gaussian process re-
gression and three common acquisition functions: expected improvement, entropy search, and knowledge
gradient. We then discuss more advanced techniques, including running multiple function evaluations
in parallel, multi-ﬁdelity and multi-information source optimization, expensive-to-evalua

In [None]:
extracted_text = ""

In [26]:
# Tokenize the input
inputs = tokenizer(extracted_text, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary
summary_ids = model.generate(inputs['input_ids'], max_length=350, min_length=280, length_penalty=2.0, num_beams=4, early_stopping=True)

# Decode the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:", summary)


Summary: Bayesian optimization is an approach to optimizing objective functions that take a long time to evaluate.
It is best-suited for optimization over continuous domains of less than 20-dimensions.
BayesOpt has been used extensively for engineering systems since the 1960s.
The ability to optimize expensive black box functions makes it extremely popular for testing hyperparameters in machine learning.
We conclude with a discussion of Bayesian optimization software and future research directions in the ﬁeld.com tutorial.
Our focus is on �nding a global rather than local optimum, our focus on the BayesOpt is on the black box.com guide to Bayes Optimization.com/Bayes Optimism.com's "Bayesoptimism" and our guide to the guide to more advanced techniques, which we will use in the guide.com-to-planning.com launch of the guide, which will be available on July 10, 2018.com.com: Back to Back to the page you came from the back of the Back of the Web.com page: Share your memories. Back to back 

In [12]:
import nltk

In [13]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\praye\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [14]:
tokens = nltk.word_tokenize(summary)
pos_tags = nltk.pos_tag(tokens)

# Display POS tags
print("POS Tags:", pos_tags)

# Optionally, reconstruct the summary with sentences (if needed)
structured_summary = ' '.join(tokens)
print("Final Summary with POS tagging:", structured_summary)

POS Tags: [('Bayesian', 'JJ'), ('optimization', 'NN'), ('is', 'VBZ'), ('an', 'DT'), ('approach', 'NN'), ('to', 'TO'), ('optimizing', 'VBG'), ('objective', 'JJ'), ('functions', 'NNS'), ('that', 'WDT'), ('take', 'VBP'), ('a', 'DT'), ('long', 'JJ'), ('time', 'NN'), ('to', 'TO'), ('evaluate', 'VB'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('best-suited', 'JJ'), ('for', 'IN'), ('optimization', 'NN'), ('over', 'IN'), ('continuous', 'JJ'), ('domains', 'NNS'), ('of', 'IN'), ('less', 'JJR'), ('than', 'IN'), ('20-dimensions', 'NNS'), ('.', '.'), ('BayesOpt', 'NNP'), ('has', 'VBZ'), ('been', 'VBN'), ('used', 'VBN'), ('extensively', 'RB'), ('for', 'IN'), ('engineering', 'NN'), ('systems', 'NNS'), ('since', 'IN'), ('the', 'DT'), ('1960s', 'NNS'), ('.', '.'), ('The', 'DT'), ('ability', 'NN'), ('to', 'TO'), ('optimize', 'VB'), ('expensive', 'JJ'), ('black', 'JJ'), ('box', 'NN'), ('functions', 'NNS'), ('makes', 'VBZ'), ('it', 'PRP'), ('extremely', 'RB'), ('popular', 'JJ'), ('for', 'IN'), ('testing',