In [12]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer

# Custom Dataset class
class SummarizationDataset(Dataset):
    """Map-style dataset that turns CSV rows into seq2seq training examples.

    Each row of the CSV must provide an ``Article`` column (model input) and a
    ``Summary`` column (training target).

    Args:
        dataset_path: Path of the CSV file, read by :meth:`load_dataset`.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and must return ``input_ids`` and ``attention_mask``.
        max_length: Fixed token length the article is padded/truncated to.
        max_target_length: Fixed token length the summary is padded/truncated
            to. Defaults to ``max_length`` when omitted.
    """

    # Label value used for padded target positions so they are excluded from
    # the loss (the ignore index of the cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, dataset_path, tokenizer, max_length=512, max_target_length=None):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_length if max_target_length is None else max_target_length

    def __len__(self):
        """Return the number of rows loaded from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one training example.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (the
            encoded article) and ``labels`` (the encoded summary, with padded
            positions replaced by ``IGNORE_INDEX``).
        """
        row = self.data.iloc[idx]
        article = str(row['Article'])
        summary = str(row['Summary'])

        # Encode the article on its own. Passing the summary as a second
        # positional argument would encode it as a text pair, i.e. append the
        # target to the encoder input and leak the answer to the model.
        encoding = self.tokenizer(
            article,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Encode the summary separately to obtain the target token ids.
        target = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Padded target positions must not contribute to the loss.
        target_is_padding = target["attention_mask"].flatten() == 0
        labels = target["input_ids"].flatten().masked_fill(target_is_padding, self.IGNORE_INDEX)

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": labels,
        }

    def load_dataset(self, dataset_path):
        """Read the CSV at ``dataset_path`` (latin-1 encoded) into a DataFrame."""
        return pd.read_csv(dataset_path, encoding='latin-1', low_memory=False)


# Replace 'your_dataset.csv' with the actual path to your CSV file
dataset_path = 'Eng_train.csv'

# Seed PyTorch so the shuffling order and dropout masks are repeatable
torch.manual_seed(42)

# Initialize BART model and tokenizer
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Create dataset and dataloader
dataset = SummarizationDataset(dataset_path, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Training parameters
epochs = 3
learning_rate = 3e-5

# Set up optimizer and scheduler.
# StepLR(step_size=1, gamma=0.9) multiplies the learning rate by 0.9 on every
# scheduler.step() call, so it is stepped once per epoch (see loop below);
# stepping it per batch would drive the learning rate to ~0 within a few
# hundred batches.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# from_pretrained() returns the model in evaluation mode; switch to training
# mode so dropout is active while fine-tuning.
model.train()

for epoch in range(epochs):
    epoch_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)  # Use labels directly

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Decay the learning rate once per completed epoch.
    scheduler.step()

    # Report the mean loss over the epoch rather than the last batch only;
    # max(..., 1) keeps an empty dataloader from raising.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss}")

# Save the trained model
model.save_pretrained("your_summarization_model")
tokenizer.save_pretrained("your_summarization_model")


KeyboardInterrupt: 

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Checkpoint shared by the tokenizer and the model
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Passage that will be summarized
input_text = """
    Adani is also among the most controversial of India’s billionaires for his association with the Bharatiya Janata Party (BJP). His close relationship with the party is not coincidental: Adani frequently refers to his business strategy as motivated by “nation building,” which the Adani Group describes on its website as “helping build world-class infrastructure capabilities to help accelerate the growth of India.” Mundra Port and its associated Adani Special Economic Zone, the central components of Adani’s business empire, were attained and developed in cooperation with the Gujarat state government. The BJP led the Gujarat state government during key moments of the Adani Group’s growth, and the relationship resulted in the symbiotic rise of both the BJP and the Adani Group.
    """

# Turn the passage into a tensor of token ids, cut off at 1024 tokens
inputs = tokenizer.encode(
    input_text,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Beam-search settings for decoding the summary
generation_options = dict(
    num_beams=4,
    length_penalty=2.0,
    min_length=50,
    max_length=100,
    early_stopping=True,
)
summary_ids = model.generate(inputs, **generation_options)

# Decode the first (only) generated sequence back to plain text
best_sequence = summary_ids[0]
summary = tokenizer.decode(best_sequence, skip_special_tokens=True)

# Show the source passage followed by the generated summary
print("Original Text:\n", input_text)
print("\nGenerated Summary:\n", summary)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Original Text:
 
    Adani is also among the most controversial of India’s billionaires for his association with the Bharatiya Janata Party (BJP). His close relationship with the party is not coincidental: Adani frequently refers to his business strategy as motivated by “nation building,” which the Adani Group describes on its website as “helping build world-class infrastructure capabilities to help accelerate the growth of India.” Mundra Port and its associated Adani Special Economic Zone, the central components of Adani’s business empire, were attained and developed in cooperation with the Gujarat state government. The BJP led the Gujarat state government during key moments of the Adani Group’s growth, and the relationship resulted in the symbiotic rise of both the BJP and the Adani Group.
    

Generated Summary:
 His close relationship with the party is not coincidental: Adani frequently refers to his business strategy as motivated by “nation building,” which the Adani Group descri

In [11]:
import pandas as pd
import torch
from transformers import DataCollatorForSeq2Seq
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import Trainer, TrainingArguments

# NOTE: Trainer's PyTorch backend requires `accelerate>=0.20.1` to be installed
# in the kernel environment (the recorded run of this cell stopped with that
# ImportError). Install it once in the environment, then restart the kernel.


class PegasusSummaryDataset(torch.utils.data.Dataset):
    """Indexable (article, summary) pairs, tokenized lazily per example.

    Trainer needs a dataset with ``__len__`` and integer ``__getitem__`` that
    yields one feature dict per example; the tokenizer's batch output is not
    such an object. Tokenizing per item also avoids padding the whole corpus
    to its longest row up front.

    Args:
        articles: List of input texts.
        summaries: List of target texts, aligned with ``articles``.
        tokenizer: Tokenizer used for both inputs and targets.
        max_target_length: Token limit applied to each target summary.
    """

    def __init__(self, articles, summaries, tokenizer, max_target_length=256):
        if len(articles) != len(summaries):
            raise ValueError("articles and summaries must have the same length")
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        # No padding here: the data collator pads each batch dynamically.
        source = self.tokenizer(self.articles[idx], truncation=True)
        target = self.tokenizer(
            text_target=self.summaries[idx],
            max_length=self.max_target_length,
            truncation=True,
        )
        return {
            "input_ids": source["input_ids"],
            "attention_mask": source["attention_mask"],
            "labels": target["input_ids"],
        }


# Load your CSV file into a Pandas DataFrame
# Replace 'Eng_train.csv' with your actual CSV file
df = pd.read_csv('Eng_train.csv', encoding='ISO-8859-1', dtype=str)

# dtype=str still leaves missing cells as NaN, which the tokenizer cannot
# encode; keep only rows that have both an article and a summary.
df_complete = df.dropna(subset=['Article', 'Summary'])

# Extract input text and target summaries
train_data = df_complete['Article'].tolist()
train_labels = df_complete['Summary'].tolist()

# Load pre-trained PEGASUS model and tokenizer
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Pair every article with its summary so each example carries `labels`
train_dataset = PegasusSummaryDataset(train_data, train_labels, tokenizer)

# Pads inputs per batch and pads labels with -100 so padding is ignored by the loss
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./pegasus_fine_tuned",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`