In [1]:
from transformers import RobertaTokenizerFast, EncoderDecoderModel
from transformers import get_linear_schedule_with_warmup
from transformers.tokenization_utils import BatchEncoding
import pandas as pd
import torch
from rouge_score import rouge_scorer
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import numpy as np
import matplotlib.pyplot as plt
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"


In [2]:
!nvidia-smi

Tue Mar 19 03:19:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:27:00.0 Off |                    0 |
| N/A   30C    P0              37W / 250W |  39385MiB / 40960MiB |     14%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCIE-40GB          On  | 00000000:A3:00.0 Off |  

In [3]:
model_name = "mrm8488/camembert2camembert_shared-finetuned-french-summarization"
model = EncoderDecoderModel.from_pretrained(model_name)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()

The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']


EncoderDecoderModel(
  (encoder): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [4]:
# print the number of parameters

num_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {num_params}")

Number of parameters: 139612933


In [5]:
def tokenize_text(text: pd.Series) -> BatchEncoding:
    tokens = tokenizer(
        text.tolist(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=514,
    )

    return tokens

class SummaryDataset(Dataset):
    def __init__(self, encodings, summaries):
        self.encodings = encodings
        self.summaries = summaries
    
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.summaries['input_ids'][idx].clone().detach()
        return item

    def __len__(self):
        return len(self.summaries['input_ids'])

In [6]:
train_df = pd.read_csv('data/train.csv')
validation_df = pd.read_csv('data/validation.csv')
# longest title is 967 words

df = train_df

input_encodings = tokenize_text(df['text'])
summary_encodings = tokenize_text(df['titles'])

dataset = SummaryDataset(input_encodings, summary_encodings)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:
# def find_lr(model, dataloader, device, init_value=1e-8, final_value=10., beta=0.98):
#     num = len(dataloader) - 1
#     mult = (final_value / init_value) ** (1/num)
#     lr = init_value
#     optimizer = AdamW(model.parameters(), lr=lr)
#     model.train()
#     avg_loss = 0.
#     best_loss = 0.
#     batch_num = 0
#     losses = []
#     log_lrs = []
#     for batch in dataloader:
#         batch_num += 1
#         optimizer.param_groups[0]['lr'] = lr
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         optimizer.zero_grad()
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss

#         # Compute the smoothed loss
#         avg_loss = beta * avg_loss + (1-beta) * loss.item()
#         smoothed_loss = avg_loss / (1 - beta**batch_num)

#         # Stop if the loss is exploding
#         if batch_num > 1 and smoothed_loss > 4 * best_loss:
#             break

#         # Record the best loss
#         if smoothed_loss < best_loss or batch_num == 1:
#             best_loss = smoothed_loss

#         # Store the values
#         losses.append(smoothed_loss)
#         log_lrs.append(np.log10(lr))

#         # Do the SGD step
#         loss.backward()
#         optimizer.step()

#         # Update the lr for the next step
#         lr *= mult

#     plt.plot(log_lrs, losses)
#     plt.xlabel('Log10 Learning rate')
#     plt.ylabel('Loss')
#     plt.show()

# # Assuming model and dataloader are defined and initialized
# find_lr(model, dataloader, device)


In [None]:
def camembert2_summary(encodings, batch_size: int = 8) -> list:
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    summaries = []
    # tokenizer.src_lang = "fr_XX"
    model.eval()

    for i in tqdm(range(0, input_ids.size(0), batch_size)):
        batch_input_ids = input_ids[i:i+batch_size].to(device)
        batch_attention_mask = attention_mask[i:i+batch_size].to(device)

        summary_tokens = model.generate(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            max_length=150,
            num_beams=4,
            early_stopping=True
        )

        batch_summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_tokens]
        summaries.extend(batch_summaries)

    return summaries

In [None]:
def score_summaries(predicted_summary: pd.Series, reference_summary: pd.Series):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    for i in tqdm(range(len(predicted_summary))):
        score = scorer.score(predicted_summary[i], reference_summary[i])[
            'rougeL'][2]
        scores.append(score)
    avg_score = sum(scores) / len(scores)

    return avg_score

In [None]:
def validation_test(val_df, verbose=True, n=100):
    input_encodings = tokenize_text(val_df['text'][:n])
    summaries = camembert2_summary(input_encodings)
    reference_summaries = val_df['titles'][:n]
    rouge_score = score_summaries(summaries, reference_summaries)
    if verbose:
        print(f"Rouge score: {rouge_score}")
    return rouge_score

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=7e-5)
validation_test(validation_df)


# Then inside your training loop, after optimizer.step(), add:


model.train()
starting_epoch = 0
num_epochs = 6
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * num_epochs)

for epoch in range(starting_epoch, starting_epoch + num_epochs):
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()


    print(f"Epoch {epoch+1}: Loss {epoch_loss / len(dataloader)}")
    # save every epoch
    checkpoint_path = f"./checkpoints/model_checkpoint_epoch_{epoch+1}_7e-5_linear_decay_1024tkn.pt"
    torch.save(model.state_dict(), checkpoint_path)

    # validation step
    model.eval()
    validation_test(validation_df)
    model.train()

  0%|          | 0/7 [00:00<?, ?it/s]../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [844,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [844,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [844,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [844,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [844,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [844,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def generate_test_summary(test_df):
    input_encodings = tokenize_text(test_df['text'])
    summaries = camembert2_summary(input_encodings)
    submission_df = pd.DataFrame([[i, summary] for i, summary in enumerate(summaries)],
                                  columns=['ID', 'titles'])
    submission_df.to_csv('submission.csv', index=False)

In [None]:
test_df = pd.read_csv('data/test.csv')
generate_test_summary(test_df)

In [None]:
validation_test(validation_df, n=1500)

In [None]:
# load a model
# model.load_state_dict(torch.load("checkpoints/model_checkpoint_epoch_1.pt"))

In [None]:
input_encodings = tokenize_text(validation_df['text'][:8])

In [None]:
summaries = camembert2_summary(input_encodings)
# summaries = camembert2_summary(test_tensor['input_ids'], test_tensor['attention_mask'])


In [None]:
summaries

In [None]:
validation_df['titles'][:8].tolist()

In [None]:
score_summaries(summaries, validation_df['titles'])

In [None]:
validation_df['text'][5]

In [None]:
summaries[5]

In [None]:
validation_df['titles'][0]