# Text summarizer
This notebook is my playground for experimenting with fine-tuning a pretrained summarization model on a WikiHow dataset.

### All imports

In [5]:
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)

### Preparing Dataset

In [7]:
# Check if GPU is available: 0 when CUDA is present, otherwise -1
device = 0 if torch.cuda.is_available() else -1

# Load the dataset
file_path = "../Dataset/wikihowSep.csv"
df = pd.read_csv(file_path)

# Ensure the dataset has the correct columns
if "text" not in df.columns or "headline" not in df.columns:
    raise ValueError("The dataset must contain 'text' and 'headline' columns")

# Drop rows with a missing article or summary. The preprocessing step casts
# every value with str(), so a missing value would otherwise be tokenized as
# the literal string "nan" and used as training data. Resetting the index
# keeps it a plain 0..n-1 range after the rows are removed.
df = df.dropna(subset=["text", "headline"]).reset_index(drop=True)

# Transform the dataset into a Hugging Face dataset
dataset = Dataset.from_pandas(df[["text", "headline"]])

# Display the first two rows to verify the data
print(df.head(2))

# Print out row 1
print(df.iloc[1])

# Load the tokenizer and model
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# Define the preprocess function
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: A batch mapping column names to lists of values. The
            "text" and "headline" columns are read.

    Returns:
        The tokenizer output for the "text" column (truncated/padded to 512
        tokens) with an added "labels" entry holding the token ids of the
        "headline" column (truncated/padded to 150 tokens). Padding
        positions in "labels" are replaced by -100 so that the loss skips
        them.
    """
    # Ensure inputs are strings
    sources = [str(text) for text in examples["text"]]
    targets = [str(headline) for headline in examples["headline"]]

    model_inputs = tokenizer(
        sources, max_length=512, truncation=True, padding="max_length"
    )

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=150,
        truncation=True,
        padding="max_length",
    )

    # Mask padding via the attention mask rather than by comparing against
    # the pad token id, so a real token that shares that id is never masked.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_inputs


# Collator handed to the trainer to assemble batches for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Run the preprocessing over the whole dataset, one batch of rows at a time
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

                                            overview  \
0   So you're a new or aspiring artist and your c...   
1   If you want to be well-read, then, in the wor...   

                           headline  \
0            \nSell yourself first.   
1  \nRead the classics before 1600.   

                                                text          sectionLabel  \
0   Before doing anything else, stop and sum up y...                 Steps   
1   Reading the classics is the very first thing ...  Reading the Classics   

                         title  
0  How to Sell Fine Art Online  
1          How to Be Well Read  
overview         If you want to be well-read, then, in the wor...
headline                         \nRead the classics before 1600.
text             Reading the classics is the very first thing ...
sectionLabel                                 Reading the Classics
title                                         How to Be Well Read
Name: 1, dtype: object


Map:   0%|          | 0/1585695 [00:00<?, ? examples/s]



In [37]:
# Hold out a small evaluation split. Evaluating on the training set itself
# gives no signal about generalization, and running the full dataset through
# evaluation every `eval_steps` would dwarf the 100 training steps.
eval_size = min(1000, max(1, len(tokenized_dataset) // 10))
split_dataset = tokenized_dataset.train_test_split(test_size=eval_size, seed=42)

training_args = Seq2SeqTrainingArguments(
    output_dir="base",
    evaluation_strategy="steps",
    eval_steps=20,  # Evaluate every 20 steps
    logging_strategy="steps",
    logging_steps=20,  # Log every 20 steps
    save_strategy="steps",
    save_steps=100,  # Save the model every 100 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    save_total_limit=5,
    max_steps=100,  # Run training for 100 steps
    weight_decay=0.01,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    # No `compute_metrics` callback is passed to the trainer, so evaluation
    # only reports the loss. Selecting the best checkpoint on "rouge1" would
    # fail because that metric is never produced; use the loss instead.
    metric_for_best_model="loss",
    greater_is_better=False,  # Lower loss is better
    predict_with_generate=True,
    optim="adafactor",
    bf16=True,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# NOTE(review): `DataLoaderConfiguration` is not imported in the visible
# cells, so this line raises NameError as written, and `dataloader_config`
# is not used by any visible cell afterwards. Presumably a leftover from an
# accelerate workaround - confirm, then either import the class or delete
# this line.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [38]:
# Fine-tune the model. The run length and the evaluation/logging/saving
# cadence come from the training arguments the trainer was created with.
trainer.train()



Step,Training Loss,Validation Loss
