In [1]:
from transformers import RobertaTokenizerFast, EncoderDecoderModel
from transformers.tokenization_utils import BatchEncoding
import pandas as pd
import torch
from rouge_score import rouge_scorer
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

In [None]:
!nvidia-smi

In [2]:
model_name = "mrm8488/camembert2camembert_shared-finetuned-french-summarization"
model = EncoderDecoderModel.from_pretrained(model_name)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()

The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']


EncoderDecoderModel(
  (encoder): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [3]:
# print the number of parameters

num_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {num_params}")

Number of parameters: 139612933


In [4]:
def tokenize_text(text: pd.Series) -> BatchEncoding:
    tokens = tokenizer(
        text.tolist(),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    return tokens

class SummaryDataset(Dataset):
    def __init__(self, encodings, summaries):
        self.encodings = encodings
        self.summaries = summaries
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.summaries['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.summaries)

In [39]:
train_df = pd.read_csv('data/train.csv', dtype={'text': str, 'titles': str})
# validation_df = pd.read_csv('data/validation.csv', dtype={'text': str, 'titles': str})
# longest title is 967 words

df = train_df

input_encodings = tokenize_text(df['text'])
summary_encodings = tokenize_text(df['titles'])

dataset = SummaryDataset(input_encodings, summary_encodings)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [49]:
batch = next(iter(dataloader))
batch.__len__()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.summaries['input_ids'][idx])


3

In [38]:
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
num_epochs = 50

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}: Loss {epoch_loss / len(dataloader)}")

    # Save a checkpoint after each epoch
    checkpoint_path = f"./model_checkpoint_epoch_{epoch+1}.pt"
    torch.save(model.state_dict(), checkpoint_path)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.summaries['input_ids'][idx])
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
100%|██████████| 1/1 [00:00<00:00,  8.64it/s]


Epoch 1: Loss 1.2248425483703613


100%|██████████| 1/1 [00:00<00:00,  9.24it/s]


Epoch 2: Loss 0.6722902059555054


100%|██████████| 1/1 [00:00<00:00,  8.86it/s]


Epoch 3: Loss 0.6096243262290955


100%|██████████| 1/1 [00:00<00:00,  9.25it/s]


Epoch 4: Loss 0.5643374919891357


100%|██████████| 1/1 [00:00<00:00,  9.23it/s]


Epoch 5: Loss 0.549957275390625


100%|██████████| 1/1 [00:00<00:00,  9.24it/s]


Epoch 6: Loss 0.4149073660373688


100%|██████████| 1/1 [00:00<00:00,  9.25it/s]


Epoch 7: Loss 0.36458301544189453


100%|██████████| 1/1 [00:00<00:00,  9.24it/s]


Epoch 8: Loss 0.29214081168174744


100%|██████████| 1/1 [00:00<00:00,  9.20it/s]


Epoch 9: Loss 0.25216665863990784


100%|██████████| 1/1 [00:00<00:00,  9.21it/s]


Epoch 10: Loss 0.21353426575660706


100%|██████████| 1/1 [00:00<00:00,  9.17it/s]


Epoch 11: Loss 0.17668142914772034


100%|██████████| 1/1 [00:00<00:00,  9.18it/s]


Epoch 12: Loss 0.1552847921848297


KeyboardInterrupt: 

In [16]:
def camembert2_summary(encodings, batch_size: int = 8) -> list:
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    summaries = []
    tokenizer.src_lang = "fr_XX"
    model.eval()

    for i in tqdm(range(0, input_ids.size(0), batch_size)):
        batch_input_ids = input_ids[i:i+batch_size].to(device)
        batch_attention_mask = attention_mask[i:i+batch_size].to(device)

        summary_tokens = model.generate(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            max_length=150,
            num_beams=4,
            early_stopping=True
        )

        batch_summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_tokens]
        summaries.extend(batch_summaries)

    return summaries

In [8]:
def score_summaries(predicted_summary: pd.Series, reference_summary: pd.Series):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    for i in tqdm(range(len(predicted_summary))):
        score = scorer.score(predicted_summary[i], reference_summary[i])[
            'rougeL'][2]
        scores.append(score)
    avg_score = sum(scores) / len(scores)

    return avg_score

In [9]:
input_encodings = tokenize_text(validation_df['text'][:100])

In [10]:
summaries = camembert2_summary(input_encodings)
# summaries = camembert2_summary(test_tensor['input_ids'], test_tensor['attention_mask'])


100%|██████████| 13/13 [00:16<00:00,  1.24s/it]


In [29]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = []
for i in range(100):
    score = scorer.score(summaries[i], validation_df['titles'][i])['rougeL'][2]
    scores.append(score)
    

In [35]:
# get the worst index and the best index
worst_index = scores.index(min(scores))
summaries[worst_index]

"L'écrivain de 83 ans publie jeudi 2 janvier « Consentement » aux éditions Grasset dans lequel elle accuse le célèbre écrivain d'avoir entretenu une relation avec elle quand elle était adolescente."

In [11]:
validation_df['text'][0]

"Sur les réseaux sociaux, les images sont impressionnantes. Dimanche matin à Venise, l'équipage du MSC Opéra a perdu le contrôle du paquebot, à son arrivée dans le port de la cité des Doges. Le navire, qui peut contenir plus de 2.600 passagers, est venu heurter le quai auquel il voulait s'arrimer. Le paquebot a raclé le quai sur plusieurs mètres, suscitant la panique des personnes à terre, avant de percuter un autre bateau touristique, le Michelangelo, stoppant ainsi sa course. Des témoins ont filmé la scène. Les vidéos montrent des touristes courant pour tenter de fuir le paquebot, qui ne semble pas vouloir s'arrêter. Quatre personnes ont été blessées dans cet accident : deux légèrement, tandis que les deux autres ont été transportées à l'hôpital pour des examens. L'incident s'est produit à San Basilio-Zaterre, dans le canal de la Giudecca, où de nombreux navires de croisière s'arrêtent pour permettre à leurs passagers de visiter Venise.Selon le quotidien italien Corriere della Serra,

In [12]:
summaries[0]

"Quatre personnes ont été blessées dans cet accident survenu dimanche matin à Venise, avant de percutér un quai lors de son arrivée dans le port de Venise. Quatre autres ont été transportées à l'hôpital pour des examens médicaux."

In [13]:
validation_df['titles'][0]

'Le bateau de croisière, long de 275 m, a percuté un quai lors de son arrivée dans le port de Venise, dimanche 2 juin. Quatre personnes ont été blessées.'

In [14]:
score_summaries(summaries, validation_df['titles'])

100%|██████████| 100/100 [00:00<00:00, 937.90it/s]


0.2021833120235293