In [None]:
!pip3 install torch
!pip3 install torchvision
!pip3 install sentencepiece
!pip3 install transformers
!pip3 install datasets
!pip install rouge-score
#!pip3 install wandb

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import datetime
import os
import time
import sys

import numpy as np
import random
import pandas as pd
import nltk
import numpy as np
import json
import wandb
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
# Fix PyTorch's global RNG seed so runs are repeatable.
torch.manual_seed(42)

from datasets.dataset_dict import DatasetDict
from datasets import load_metric, load_dataset 
from transformers import AutoTokenizer,  AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


In [None]:
# Hyper-parameters consumed by Seq2SeqTrainingArguments further down:
#   lr         -> learning_rate
#   weight_dec -> weight_decay
#   epochs     -> num_train_epochs
lr, weight_dec, epochs = 3e-4, 0.01, 12

# Passed as the first argument of Seq2SeqTrainingArguments.
# NOTE(review): left empty here - presumably meant to be filled in with the
# directory that should receive checkpoints; confirm before a real run.
save_path = ''

In [None]:
# nltk.sent_tokenize (used in compute_metrics) needs the Punkt sentence model.
# Recent NLTK releases look it up under "punkt_tab" instead of the pickled
# "punkt" package, so fetch both to work with either generation of NLTK.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# Which C2T CSV pair to train on. Going by the alternatives that used to be
# kept here as commented-out copies, the same folder also offers the
# 'small' and 'small_nli' variants - switch by editing data_variant only.
data_variant = 'big'
data_dir = 'data/c2t-big'

# Both files are tab-separated.
train_data = load_dataset('csv', data_files=f'{data_dir}/train_c2t_{data_variant}.csv', sep='\t')
val_data = load_dataset('csv', data_files=f'{data_dir}/val_c2t_{data_variant}.csv', sep='\t')

# Each load above exposes its rows under the 'train' key, which is why the
# validation rows are also read from val_data['train'].
dataset = DatasetDict({
    'train': train_data['train'],
    'valid': val_data['train']})

# Keep this as the last expression so the notebook displays the splits.
dataset

In [None]:
# ROUGE scorer; compute_metrics calls metric.compute(...) on decoded text.
# NOTE(review): load_metric is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm the installed version.
metric = load_metric("rouge")

In [None]:
# Pretrained checkpoint used for both the tokenizer and the model.
model_checkpoint = 't5-base'

# First CUDA device when available, otherwise CPU.
# NOTE(review): not referenced by the other visible cells - the Trainer
# presumably handles device placement itself; confirm before removing.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The model itself is loaded in a later cell, right before the training
# arguments are built; loading it here as well only duplicated that work,
# since nothing in between uses it.

In [None]:
# Task prefix that preprocess_function prepends to every input document.
prefix = "C2T: "

In [None]:
# Token budgets: longer inputs / targets are truncated to these lengths.
max_input_length = 1024
max_target_length = 512

def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Args:
        examples: batch mapping with a "Data" column (source texts) and a
            "Summaries" column (target texts), each a list of strings.

    Returns:
        The tokenizer output for the prefixed sources, with the target token
        ids added under "labels".

    No padding is applied here on purpose. Padding the targets to a fixed
    length would store pad-token ids in "labels", and those positions would
    then count towards the loss. Sequences are left at their natural
    (truncated) length so that the data collator can pad each batch and mask
    label padding.
    """
    inputs = [prefix + doc for doc in examples["Data"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets in the tokenizer's target mode.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Summaries"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Run preprocess_function over the 'train' and 'valid' splits, a batch of
# examples at a time (batched=True).
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Load the pretrained seq2seq model that is handed to the data collator and
# the Seq2SeqTrainer below.
model =  AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 8
# Checkpoint name without any leading "namespace/" part.
model_name = model_checkpoint.split("/")[-1]

# Training / evaluation configuration for the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    output_dir=save_path,
    evaluation_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_dec,
    save_total_limit=1,
    num_train_epochs=epochs,
    predict_with_generate=True,
    # Without an explicit limit, evaluation generates at the model config's
    # default max_length, so ROUGE and gen_len would be measured on clipped
    # outputs. Allow generations as long as the tokenized targets.
    generation_max_length=max_target_length,
)

In [None]:
# Collator that assembles batches for the Seq2SeqTrainer (passed to it as
# data_collator below).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def compute_metrics(eval_pred):
    """Score generated sequences against the references with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid F-measure as a percentage,
        plus "gen_len" (mean count of non-pad tokens per prediction), all
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. It may show up in the predictions as well as in the labels,
    # so both are mapped back to the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of every ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens excluded).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire model, training arguments, tokenized splits, collator and metric
# callback into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the configuration assembled above.
trainer.train()

In [None]:
# Save the trained model under 't5base_C2T_big' (a relative path).
trainer.save_model('t5base_C2T_big')