In [1]:
# Downloading the .json files from the Github Profile
!wget https://raw.githubusercontent.com/google-research-datasets/Disfl-QA/main/dev.json
!wget https://raw.githubusercontent.com/google-research-datasets/Disfl-QA/main/test.json
!wget https://raw.githubusercontent.com/google-research-datasets/Disfl-QA/main/train.json

--2024-09-05 03:50:38--  https://raw.githubusercontent.com/google-research-datasets/Disfl-QA/main/dev.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 201742 (197K) [text/plain]
Saving to: ‘dev.json’


2024-09-05 03:50:38 (8.22 MB/s) - ‘dev.json’ saved [201742/201742]

--2024-09-05 03:50:38--  https://raw.githubusercontent.com/google-research-datasets/Disfl-QA/main/test.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 771364 (753K) [text/plain]
Saving to: ‘test.json’


2024-09-05 03:50:39 (22.1 MB/s) - ‘test.json’ saved [7

In [2]:
# Installing required libraries
!pip install datasets evaluate transformers rouge-score nltk bert-score

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31

In [3]:
# Save the model on HuggingFace in my ID
# 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 - Copy the token
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#T5 MODEL

In [4]:
# Using the T5 model checkpoints
model_checkpoint = "t5-small"

In [5]:
# Import necessary libraries and load evaluation metric
from evaluate import load
from datasets import load_dataset
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
import json
import nltk
from nltk.corpus import wordnet
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import concatenate_datasets

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
metric = load("rouge")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [6]:
#Reading .json files and transposing them to make the IDs into separate columns
train_df = pd.read_json('train.json').T
test_df = pd.read_json('test.json').T
validation_df = pd.read_json('dev.json').T

In [7]:
# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)

# Combine into a DatasetDict
raw_datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset
})

# Rename the column
raw_datasets = raw_datasets.rename_column("__index_level_0__", "input_ids")

# Creating Synthetic Dataset

In [8]:
# Replacing words with synonyms
def synonym_replacement(sentence):
    words = nltk.word_tokenize(sentence)
    new_sentence = []

    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = random.choice(synonyms).lemmas()[0].name()
            if synonym != word:
                new_sentence.append(synonym)
            else:
                new_sentence.append(word)
        else:
            new_sentence.append(word)

    return ' '.join(new_sentence)

In [9]:
# Getting augmented data
def augment_data(text, methods):
    augmented_texts = [text]
    for method in methods:
        augmented_texts.append(method(text))
    return augmented_texts

def augment_dataset(dataset, methods):
    augmented_data = []
    for example in dataset:
        original_text = example['disfluent']
        augmented_examples = augment_data(original_text, methods)
        for aug_text in augmented_examples:
            augmented_data.append({
                'input_ids': example['input_ids'],
                'original': example['original'],
                'disfluent': aug_text
            })
    return Dataset.from_pandas(pd.DataFrame(augmented_data))

# Apply augmentation to the dataset
augmentation_methods = [synonym_replacement]
augmented_dataset = augment_dataset(raw_datasets['train'], augmentation_methods)

In [10]:
# Concatenate the original and augmented datasets
combined_train_dataset = concatenate_datasets([raw_datasets['train'], augmented_dataset])

# Update the raw_datasets dictionary with the combined dataset
raw_datasets['train'] = combined_train_dataset

In [11]:
# Display the dataset and its columns
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['original', 'disfluent', 'input_ids'],
        num_rows: 21546
    })
    test: Dataset({
        features: ['original', 'disfluent', 'input_ids'],
        num_rows: 3643
    })
    validation: Dataset({
        features: ['original', 'disfluent', 'input_ids'],
        num_rows: 1000
    })
})

In [12]:
# Initialize the tokenizer from the pre-trained model checkpoint
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [13]:
# Define a prefix to prepend to each input text
prefix = "Question Rewrite: "

In [14]:
# Define preprocessing function to tokenize inputs and labels for the model
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    # Picking the disfluent values
    inputs = [prefix + doc for doc in examples["disfluent"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["original"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
# Apply the preprocessing function to the entire dataset in a batched manner
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/21546 [00:00<?, ? examples/s]

Map:   0%|          | 0/3643 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# Fine Tuning

In [16]:
# Load the pre-trained sequence-to-sequence model
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
# Selecting hyperparameters to finetune
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-QA-Rewrite-3e-4",
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)



In [18]:
# Initialize the data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE
    rouge_result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    rouge_result = {key: value * 100 for key, value in rouge_result.items()}

    # Compute BERTScore
    P, R, F1 = score(decoded_preds, decoded_labels, lang="en", rescale_with_baseline=True)
    bertscore_result = {
        "bert_score_precision": P.mean().item(),
        "bert_score_recall": R.mean().item(),
        "bert_score_f1": F1.mean().item()
    }

    # Compute METEOR
    meteor_scores = [meteor_score([nltk.word_tokenize(ref)], nltk.word_tokenize(pred))
                     for pred, ref in zip(decoded_preds, decoded_labels)]
    meteor_result = {"meteor": sum(meteor_scores) / len(meteor_scores)}

    # Combine all results
    final_result = {**rouge_result, **bertscore_result, **meteor_result}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    final_result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in final_result.items()}

In [20]:
# Set up the Seq2SeqTrainer with model, arguments, datasets, and metrics
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [21]:
# Start training the model
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bert Score Precision,Bert Score Recall,Bert Score F1,Meteor,Gen Len
1,0.4671,0.222415,95.7922,92.1999,94.8154,94.8137,0.9319,0.9255,0.9286,0.9492,13.741
2,0.3238,0.227479,96.0405,92.762,95.1483,95.161,0.9401,0.9306,0.9353,0.9522,13.715
3,0.2547,0.240514,95.9386,92.6352,95.0835,95.1054,0.9402,0.9312,0.9356,0.9518,13.715


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainOutput(global_step=8082, training_loss=0.3880724955301301, metrics={'train_runtime': 1060.528, 'train_samples_per_second': 60.949, 'train_steps_per_second': 7.621, 'total_flos': 654816413614080.0, 'train_loss': 0.3880724955301301, 'epoch': 3.0})

In [22]:
# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])

print(f"Evaluation results: {eval_results}")



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation results: {'eval_loss': 0.24051402509212494, 'eval_rouge1': 95.9386, 'eval_rouge2': 92.6352, 'eval_rougeL': 95.0835, 'eval_rougeLsum': 95.1054, 'eval_bert_score_precision': 0.9402, 'eval_bert_score_recall': 0.9312, 'eval_bert_score_f1': 0.9356, 'eval_meteor': 0.9518, 'eval_gen_len': 13.715, 'eval_runtime': 41.1119, 'eval_samples_per_second': 24.324, 'eval_steps_per_second': 3.04, 'epoch': 3.0}


# Testing the model

In [23]:
# Checking out the model performance with random sentences
test_inputs = [
    "Uh, I need, like, help with my, um, homework.",
    "Could you, uh, let me know if, um, there's a, uh, library nearby?",
    "The, um, weather is, like, really nice today, uh, isn't it?"
]

for input_text in test_inputs:
    inputs = tokenizer(input_text, return_tensors="pt").to(trainer.model.device)
    outputs = trainer.model.generate(**inputs, max_new_tokens=50)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input text: {input_text}")
    print(f"Generated output: {decoded_output}\n")


Input text: Uh, I need, like, help with my, um, homework.
Generated output: Uh, I need, like, help with my homework.

Input text: Could you, uh, let me know if, um, there's a, uh, library nearby?
Generated output: Could there's a library nearby?

Input text: The, um, weather is, like, really nice today, uh, isn't it?
Generated output: The weather is, like, really nice today, isn't it?



In [24]:
# Choosing 5 examples from the test dataset
num_examples = 5
validation_samples = raw_datasets['test'].select(range(num_examples))

for i, example in enumerate(validation_samples):
    input_text = example['disfluent']

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt").to(trainer.model.device)

    # Generate the model's output
    outputs = trainer.model.generate(**inputs, max_new_tokens=50)

    # Decode the output to readable text
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Print input and generated output
    print(f"Example {i+1}:")
    print(f"Input text: {input_text}")
    print(f"Generated output: {decoded_output}")
    print("=" * 50)


Example 1:
Input text: In what country is Norse found no wait Normandy not Norse?
Generated output: In what country is Normandy found?
Example 2:
Input text: From which countries no tell me when were the Normans in Normandy?
Generated output: When were the Normans in Normandy?
Example 3:
Input text: From which Norse leader I mean countries did the Norse originate?
Generated output: From which countries did the Norse originate?
Example 4:
Input text: When I mean Who was the Norse leader?
Generated output: Who was the Norse leader?
Example 5:
Input text: When no what century did the Normans first gain their separate identity?
Generated output: What century did the Normans first gain their separate identity?


In [25]:
# Choosing 5 examples from the validation dataset
num_examples = 5
validation_samples = raw_datasets['validation'].select(range(num_examples))

for i, example in enumerate(validation_samples):
    input_text = example['disfluent']

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt").to(trainer.model.device)

    # Generate the model's output
    outputs = trainer.model.generate(**inputs, max_new_tokens=50)

    # Decode the output to readable text
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Print input and generated output
    print(f"Example {i+1}:")
    print(f"Input text: {input_text}")
    print(f"Generated output: {decoded_output}")
    print("=" * 50)

Example 1:
Input text: Who did no What did the government want Thoreau to do?
Generated output: What did the government want Thoreau to do?
Example 2:
Input text: What makes the Bank of America Tower or wait the Wells Fargo Center stand out?
Generated output: What does the Wells Fargo Center stand out?
Example 3:
Input text: What was the Colonia Agrippina's original empire, sorry, name?
Generated output: What was the Colonia Agrippina's original name?
Example 4:
Input text: Extended authorization limitations, no sorry networking benefits helped those that could not connect to what platform?
Generated output: Extended networking benefits helped those that could not connect to what platform?
Example 5:
Input text: What is the no make that who is the emphasis on when there is a private finance initiative?
Generated output: Who is the emphasis on when there is a private finance initiative?
