In [6]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.1.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
    --------------------------------------- 0.3/11.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.6 MB 1.0 MB/s eta 0:00:11
   -- ------------------------------------- 0.8/11.6 MB 1.0 MB/s eta 0:00:11
   --- ------------------------------------ 1.0/11.6 MB 1.0 MB/s eta 0:00:11
   --- ------------------------------------ 1.0/11.6 MB 1.0 MB/s eta 0:00:11
   ---- --------

In [1]:
import pandas as pd

# Load the BookSum book-level alignment splits from a local checkout.
# Point ALIGNMENTS_DIR at the folder that holds the aligned JSON Lines files.
ALIGNMENTS_DIR = 'booksum/alignments/book-level-summary-alignments'


def _read_split(split):
    """Read one aligned split (JSON Lines, one record per line) into a DataFrame.

    Args:
        split: Split suffix used in the file name: 'train', 'val' or 'test'.

    Returns:
        The DataFrame produced by ``pd.read_json(..., lines=True)``.
    """
    return pd.read_json(f'{ALIGNMENTS_DIR}/book_summaries_aligned_{split}.jsonl', lines=True)


train_data = _read_split('train')
val_data = _read_split('val')
test_data = _read_split('test')


In [5]:
!pip install datasets

Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.11.0-cp310-cp310-win_amd64.whl.metadata (8.0 kB)
Collecting huggingface-hub>=0.23.0 (from datasets)
  Using cached huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Using cached aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Using cached

In [4]:
from datasets import Dataset, DatasetDict

# Wrap every pandas split as a Dataset, keyed by the split name used downstream.
_frames_by_split = (
    ('train', train_data),
    ('validation', val_data),
    ('test', test_data),
)
booksum_data = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in _frames_by_split}
)

# Keep the individual split handles available under their original names.
train_dataset = booksum_data['train']
val_dataset = booksum_data['validation']
test_dataset = booksum_data['test']

print(booksum_data)


DatasetDict({
    train: Dataset({
        features: ['bid', 'source', 'title', 'summary_path', 'book_path'],
        num_rows: 314
    })
    validation: Dataset({
        features: ['bid', 'source', 'title', 'summary_path', 'book_path'],
        num_rows: 45
    })
    test: Dataset({
        features: ['bid', 'source', 'title', 'summary_path', 'book_path'],
        num_rows: 46
    })
})


In [10]:
!pip install transformers



In [9]:
from transformers import AutoTokenizer

# Checkpoint to load; swap in 't5-large' to try T5 instead of BART.
model_name = 'facebook/bart-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token limits applied when encoding source text and summaries.
max_input_length, max_output_length = 1024, 150

import os

def preprocess(batch):
    """Tokenize one BookSum record into model inputs and labels.

    Args:
        batch: A single example (mapping) whose 'book_path' and
            'summary_path' entries name UTF-8 text files on disk.

    Returns:
        The tokenizer output for the book text (truncated/padded to
        ``max_input_length``) with an added 'labels' entry holding the summary
        token ids (truncated/padded to ``max_output_length``). Padding
        positions in 'labels' are set to -100. If either file is missing, a
        message is printed and an empty dict is returned.
    """
    book_path = batch['book_path']
    summary_path = batch['summary_path']

    # Guard against missing files. Returning an empty dict adds no tokenized
    # fields for this example; it does not remove the example from the
    # dataset, so rows with missing files should be filtered out beforehand.
    if not (os.path.exists(book_path) and os.path.exists(summary_path)):
        print(f"Skipping missing files: {book_path} or {summary_path}")
        return {}

    # Read the book text and its reference summary.
    with open(book_path, 'r', encoding='utf-8') as book_file:
        book_text = book_file.read()
    with open(summary_path, 'r', encoding='utf-8') as summary_file:
        summary_text = summary_file.read()

    inputs = tokenizer(book_text, max_length=max_input_length, truncation=True, padding="max_length")
    targets = tokenizer(summary_text, max_length=max_output_length, truncation=True, padding="max_length")

    # The summaries are padded to a fixed length; mark the padding positions
    # with -100 (the label value the loss ignores) so the model is not trained
    # to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        -100 if token_id == pad_id else token_id
        for token_id in targets['input_ids']
    ]
    return inputs




# Keep only the examples whose book and summary files are both present on
# disk, then tokenize the remaining examples one at a time.
def _files_present(example):
    """Return True when both text files referenced by the example exist."""
    return all(os.path.exists(example[key]) for key in ('book_path', 'summary_path'))


valid_data = booksum_data.filter(_files_present)
tokenized_data = valid_data.map(preprocess, batched=False)



Filter:   0%|          | 0/314 [00:00<?, ? examples/s]

Filter:   0%|          | 0/45 [00:00<?, ? examples/s]

Filter:   0%|          | 0/46 [00:00<?, ? examples/s]

In [None]:
!pip install torch

In [10]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# `predict_with_generate` is an option of Seq2SeqTrainingArguments; the plain
# TrainingArguments class does not accept it. Use the seq2seq argument and
# trainer classes so the option is valid and honoured during evaluation.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints in output_dir
    num_train_epochs=3,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


ImportError: 
AutoModelForSeq2SeqLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [None]:
sample_text = "Your sample chapter or paragraph text here."
inputs = tokenizer(sample_text, return_tensors="pt", max_length=max_input_length, truncation=True)

# Training may have moved the model to an accelerator; place the input tensors
# on the same device as the model before generating.
inputs = inputs.to(model.device)

# Beam search over the encoded text; the attention mask is passed explicitly
# alongside the input ids.
summary_ids = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=max_output_length,
    num_beams=4,
    early_stopping=True,
)

print("Generated Summary:", tokenizer.decode(summary_ids[0], skip_special_tokens=True))
