In [2]:
import pandas as pd

filename = "/content/drive/MyDrive/Usable_Dataset/train.csv"
filename2 = "/content/drive/MyDrive/Usable_Dataset/test.csv"

df = pd.read_csv(filename,index_col=0)
df.rename(columns = {'document':'source', 'summary':'target'}, inplace = True)

df2 = pd.read_csv(filename2,index_col=0)
df2.rename(columns = {'document':'source', 'summary':'target'}, inplace = True)
print(len(df),len(df2))

6120 85


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length = 512)
    decodings = tokenizer(labels, truncation=True, padding=True, max_length = 256)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=1,           
      per_device_train_batch_size=2,   
      per_device_eval_batch_size=2,    
      save_steps=500,                  
      save_total_limit=5,              
      evaluation_strategy='steps',     
      eval_steps=100,                  
      warmup_steps=500,                
      weight_decay=0.01,               
      logging_dir='./logs',            
      logging_steps=100,
    )

    trainer = Trainer(
      model=model,                     
      args=training_args,              
      train_dataset=train_dataset,     
      eval_dataset=val_dataset,        
      tokenizer=tokenizer
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=1,           
      per_device_train_batch_size=2,   
      save_steps=500,                  
      save_total_limit=5,              
      #warmup_steps=500,               
      weight_decay=0.01,               
      logging_dir='./logs',            
      logging_steps=500,
      fp16=True
    )

    trainer = Trainer(
      model=model,                     
      args=training_args,              
      train_dataset=train_dataset,     
      tokenizer=tokenizer
    )

  return trainer

In [4]:
train_texts, train_labels = (list(df['source'])), (list(df['target']))
test_texts, test_labels = (list(df2['source'])), (list(df2['target']))

model_name = 'nsi319/legal-pegasus'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
test_dataset, _, _, tokenizer = prepare_data(model_name, test_texts, test_labels)

trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, test_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

  trainer = Trainer(


In [5]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
100,3.0107,2.698269
200,2.6958,2.570392
300,2.6185,2.495053
400,2.6245,2.442623
500,2.4394,2.393202
600,2.5022,2.357375
700,2.4799,2.328946
800,2.3809,2.313165
900,2.322,2.291429
1000,2.3632,2.275033




TrainOutput(global_step=3060, training_loss=2.368292620291118, metrics={'train_runtime': 4923.4741, 'train_samples_per_second': 1.243, 'train_steps_per_second': 0.622, 'total_flos': 8841761107476480.0, 'train_loss': 2.368292620291118, 'epoch': 1.0})

In [6]:
import os
if not os.path.exists('./ouput_model/'):
    os.makedirs('./ouput_model/')
trainer.model.save_pretrained("./ouput_model/")


In [15]:
tokenizer.save_pretrained('./ouput_model/')

('./ouput_model/tokenizer_config.json',
 './ouput_model/special_tokens_map.json',
 './ouput_model/spiece.model',
 './ouput_model/added_tokens.json',
 './ouput_model/tokenizer.json')

In [16]:
!zip -r output_model.zip ./ouput_model/

updating: ouput_model/ (stored 0%)
updating: ouput_model/generation_config.json (deflated 45%)
updating: ouput_model/config.json (deflated 61%)
updating: ouput_model/model.safetensors (deflated 7%)
  adding: ouput_model/tokenizer_config.json (deflated 94%)
  adding: ouput_model/tokenizer.json (deflated 78%)
  adding: ouput_model/special_tokens_map.json (deflated 82%)
  adding: ouput_model/spiece.model (deflated 50%)
