**Installations**

In [1]:
! pip install datasets transformers rouge-score nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 15.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 56.4 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.5 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 58.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████

**Libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from transformers import AutoTokenizer, AutoModel
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
from datasets import load_metric
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk

In [3]:
# Fetch the Punkt tokenizer data; nltk.sent_tokenize is used in compute_metrics below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Dataset Loading**

In [4]:
# Mount Google Drive into the Colab filesystem so the dataset, result and model
# paths under /content/drive used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Every location lives under one Drive project folder; derive each path from it.
PROJECT_ROOT = "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project"
PREPROCESSED_DIR = f"{PROJECT_ROOT}/Dataset/preprocessed_data"

INPUT_PATH1 = f"{PREPROCESSED_DIR}/divided_dataset"  # train / validation CSVs
INPUT_PATH2 = f"{PREPROCESSED_DIR}/whole_dataset"    # test CSV
RESULT_PATH = f"{PROJECT_ROOT}/Results"              # prediction CSV output folder
MODEL_PATH = f"{PROJECT_ROOT}/Models_pickled_file"   # saved fine-tuned models

In [6]:
# Both labeled splits come from the "divided" folder; its test.csv serves as
# the validation set during fine-tuning.
train, val = (
    pd.read_csv(os.path.join(INPUT_PATH1, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

In [7]:
# The held-out test split is read from the "whole" dataset folder; its preview
# further down shows only `id` and text columns (no `Summary`).
test = pd.read_csv(os.path.join(INPUT_PATH2,"test.csv"))

**Combining Heading and Article**

In [8]:
# Build the model input by joining headline and body with a single space so the
# last word of the heading does not fuse with the first word of the article.
# The two source columns are then dropped without `inplace=True`, so the whole
# step is one expression that rebinds `train`.
train = (
    train
    .assign(Source=train['Heading'] + " " + train['Article'])
    .drop(columns=['Article', 'Heading'])
)
train.head()

Unnamed: 0,Summary,id,Source
0,"pakistan termed the indian action as ""unilater...",1,"un urges for maximum restraint, invokes simla ..."
1,"""the agreement will be finalised between khybe...",2,"china, pak to finalise deal to develop sez und..."
2,the top health research institute said that an...,3,"covaxin effectively neutralises both alpha, de..."
3,a 25-year-old man in the us has caught coronav...,5,man gets coronavirus twice with more severe sy...
4,reports say that afghanistan president ashraf ...,6,afghanistan president ghani flees to tajikista...


In [9]:
# Build the model input by joining headline and body with a single space so the
# last word of the heading does not fuse with the first word of the article.
# The two source columns are then dropped without `inplace=True`, so the whole
# step is one expression that rebinds `val`.
val = (
    val
    .assign(Source=val['Heading'] + " " + val['Article'])
    .drop(columns=['Article', 'Heading'])
)
val.head()

Unnamed: 0,Summary,id,Source
0,the name of all member countries except india ...,0,india opposes china's belt and road initiative...
1,the decision to shelve detailed advice from th...,4,"top white house officials buried cdc report, r..."
2,as india holds the council presidency this mon...,11,us and china clash at un over south china sea ...
3,the food and drug administration ruled that tr...,13,"us allows extra covid vaccine doses for some, ..."
4,pakistan's information minister fawad chaudhry...,30,pak minister claims threatening email was sent...


In [10]:
# Build the model input by joining headline and body with a single space so the
# last word of the heading does not fuse with the first word of the article.
# The two source columns are then dropped without `inplace=True`, so the whole
# step is one expression that rebinds `test`.
test = (
    test
    .assign(Source=test['Heading'] + " " + test['Article'])
    .drop(columns=['Article', 'Heading'])
)
test.head()

Unnamed: 0,id,Source
0,0,explainer: how worrying is the variant first s...
1,1,pakistan parliament to elect new prime ministe...
2,2,indian-origin pathologist accused of botching ...
3,3,china begins world's biggest census drive to c...
4,4,"indonesia prison fire kills 41 drug inmates, i..."


**Converting to pyarrow datasets**

In [11]:
# Convert the pandas frame to a Hugging Face dataset through the public
# `Dataset.from_pandas` constructor. The intermediate `pyarrow.dataset` object
# that used to be built in this cell was never read, so it is no longer created.
train_dataset = Dataset.from_pandas(train)

In [12]:
# Display the training dataset's feature names and row count.
train_dataset

Dataset({
    features: ['Summary', 'id', 'Source'],
    num_rows: 9046
})

In [13]:
# Convert the pandas frame to a Hugging Face dataset through the public
# `Dataset.from_pandas` constructor. The intermediate `pyarrow.dataset` object
# that used to be built in this cell was never read, so it is no longer created.
val_dataset = Dataset.from_pandas(val)

In [14]:
# Display the validation dataset's feature names and row count.
val_dataset

Dataset({
    features: ['Summary', 'id', 'Source'],
    num_rows: 1006
})

In [15]:
# Convert the pandas frame to a Hugging Face dataset through the public
# `Dataset.from_pandas` constructor. The intermediate `pyarrow.dataset` object
# that used to be built in this cell was never read, so it is no longer created.
test_dataset = Dataset.from_pandas(test)

In [16]:
# Display the test dataset's feature names and row count.
test_dataset

Dataset({
    features: ['id', 'Source'],
    num_rows: 2513
})

**Hyperparameters**

In [17]:
# Name of the pretrained checkpoint to fine-tune; also used as the save sub-folder name.
model_checkpoint = "t5-small"
# Token budget for the tokenized source text; longer inputs are truncated to it.
max_input_length = 1520
# Token budget for the tokenized target summaries; longer targets are truncated to it.
max_target_length = 56
# Per-device batch size, used for both training and evaluation.
batch_size = 1
# Number of fine-tuning epochs.
NUM_EPOCHS = 5

**Load metric**

In [18]:
# ROUGE scorer; compute_metrics calls `metric.compute` on decoded predictions and references.
metric = load_metric("rouge")

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

**Preprocess**

In [19]:
# Tokenizer matching the chosen checkpoint; shared by preprocessing, the data
# collator, metric decoding and summary generation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [20]:
# The T5 checkpoints listed here get a "summarize: " task prefix prepended to
# every source text; any other checkpoint is fed the text unchanged.
T5_CHECKPOINTS = ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
prefix = "summarize: " if model_checkpoint in T5_CHECKPOINTS else ""

In [21]:
def preprocess_function_test(examples):
    """Tokenize a batch of examples that have no reference summary.

    Every entry of ``examples["Source"]`` gets the module-level ``prefix``
    prepended and is tokenized with truncation at ``max_input_length``.
    No ``labels`` entry is added to the result.

    Args:
        examples: Batch mapping containing a ``"Source"`` sequence of strings.

    Returns:
        The tokenizer output for the prefixed source texts.
    """
    prefixed_sources = []
    for text in examples["Source"]:
        prefixed_sources.append(prefix + text)

    return tokenizer(prefixed_sources, max_length=max_input_length, truncation=True)

In [22]:
def preprocess_function(examples):
    """Tokenize a batch of labeled examples for seq2seq fine-tuning.

    Every entry of ``examples["Source"]`` gets the module-level ``prefix``
    prepended and is tokenized with truncation at ``max_input_length``.
    The reference summaries in ``examples["Summary"]`` are tokenized with
    truncation at ``max_target_length`` and stored as ``labels``.

    Args:
        examples: Batch mapping with ``"Source"`` and ``"Summary"`` sequences
            of strings.

    Returns:
        The tokenizer output for the sources with an added ``"labels"`` entry
        holding the target token ids.
    """
    inputs = [prefix + doc for doc in examples["Source"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes the summaries as decoder targets; it supersedes
    # the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["Summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
# Tokenize the labeled splits batch-wise; preprocess_function adds a "labels" entry.
tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset_val = val_dataset.map(preprocess_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

In [24]:
# Tokenize the test split batch-wise with the label-free preprocessing function.
tokenized_dataset_test = test_dataset.map(preprocess_function_test, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

**Fine-tuning the model**

In [25]:
# Load the pretrained seq2seq model for the selected checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [26]:
# Training configuration for the seq2seq trainer; "results" is its output directory.
args = Seq2SeqTrainingArguments(
    "results",
    evaluation_strategy="epoch",   # score the validation set once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,            # cap the number of checkpoints kept on disk
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,    # metrics are computed on generated token ids
    # Let evaluation generate up to the same token budget the targets were
    # truncated to; otherwise generation falls back to the model config's
    # default length and ROUGE is measured on cut-off summaries.
    generation_max_length=max_target_length,
    # fp16=True,
)

In [27]:
# Batch collator for seq2seq training, built from the shared tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [28]:
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure scaled to 0-100, plus
        ``"gen_len"`` (mean number of non-padding prediction tokens); every
        value is rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a decodable token id, so
    # swap it for the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding positions are not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [29]:
# Wire the model, training arguments, tokenized splits, collator, tokenizer and
# metric hook into the seq2seq trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; with evaluation_strategy="epoch" the validation split is
# evaluated after every epoch.
trainer.train()

In [31]:
def generate_summary(test_samples, model):
    """Generate summaries for a batch of source texts.

    The sources get the same module-level ``prefix`` that
    ``preprocess_function`` prepends during training, and generation may run
    up to ``max_target_length`` tokens, the budget the training targets were
    truncated to.

    Args:
        test_samples: Mapping whose ``"Source"`` entry holds the input text(s),
            either a single string or a sequence of strings. All texts are
            processed as one batch, so pass a slice small enough to fit in
            memory.
        model: Seq2seq model exposing ``generate`` and ``device``.

    Returns:
        Tuple ``(outputs, output_str)``: the generated token ids and the
        decoded strings with special tokens removed.
    """
    sources = test_samples["Source"]
    if isinstance(sources, str):
        # A lone string is still accepted, as it was when passed straight to the tokenizer.
        sources = [sources]
    sources = [prefix + doc for doc in sources]

    inputs = tokenizer(
        sources,
        # Pad only to the longest text in the batch; the attention mask hides
        # the padding, so padding everything to max_input_length is wasted work.
        padding="longest",
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_target_length,
    )
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

In [32]:
# Delete the local /content/results directory. Presumably this is the trainer's
# "results" output directory resolved against a /content working directory — TODO confirm.
!rm -r "/content/results"

In [33]:
# Persist the trainer's model under MODEL_PATH/<checkpoint name> on Drive; the
# log below shows the config, weights and tokenizer files written there.
trainer.save_model(os.path.join(MODEL_PATH,model_checkpoint))

Saving model checkpoint to /content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Models_pickled_file/t5-small
Configuration saved in /content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Models_pickled_file/t5-small/config.json
Model weights saved in /content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Models_pickled_file/t5-small/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Models_pickled_file/t5-small/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Models_pickled_file/t5-small/special_tokens_map.json


In [34]:
# model.from_pretrained(os.path.join(MODEL_PATH,model_checkpoint))

In [35]:
# summaries_after_tuning = generate_summary(test_dataset, model)[1]

In [36]:
# df = pd.DataFrame(zip(summaries_after_tuning,test_dataset['id']),
#                   columns=["Summary","id"])

In [37]:
# df.head()

**Saving the predictions**

In [38]:
# df.to_csv(os.path.join(RESULT_PATH,model_checkpoint+".csv"),index=False)