# Problem: Not enough training data 😟

## Solution: 💡
### 1. Train a model to make questions ❓
### 2. Use the question-generating model to create questions on new texts 📜❓❓
### 3. Use a trained QA model to answer those questions, creating new training data. 📊
### 4. Train a new and improved QA model using the pseudo-labels. 💪
### 5. If unsatisfied with the model, go back to step 2 🔄
### 6. ??? 🤷‍♂️
### 7. Profit 💰

#### Nearly everything in this notebook is from here: https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/run_summarization.py

🤗 💖

This is just a proof of concept using a small model. There are probably better hyperparameters and bigger, better models. If you have questions, comments, or feedback, you know where to leave them! 😊

# Seq2Seq for Question Generation

Just as a seq2seq model can be trained to write a summary of a text, it can be trained to write a question about it. We already have the data for this: the `context` column is the model input and the `question` column is the target.

In [None]:
%%capture
# Install transformers straight from the GitHub repository instead of a PyPI release.
!pip install git+https://github.com/huggingface/transformers.git
# Remove fsspec, then install `datasets` from the local wheel directory only
# (--no-index disables the package index lookup).
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq
# wandb is used below for experiment logging; rouge_score backs the "rouge" metric.
!pip install -U wandb
!pip install rouge_score
# !pip install deepspeed

In [None]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric
import torch

import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Every QA source is reduced to the two columns used for question generation:
# the passage (`context`) and the question written about it (`question`).
_QG_COLUMNS = ["context", "question"]

chaii_df = pd.read_csv(
    "../input/chaii-hindi-and-tamil-question-answering/train.csv",
    usecols=_QG_COLUMNS,
)
mlqa_df = pd.read_csv(
    "../input/mlqa-hindi-processed/mlqa_hindi.csv",
    usecols=_QG_COLUMNS,
)
xquad_df = pd.read_csv(
    "../input/mlqa-hindi-processed/xquad.csv",
    usecols=_QG_COLUMNS,
)

In [None]:
# Only chaii rows are used for validation: 20% of chaii is held out, and the
# remaining chaii rows are combined with all MLQA and XQuAD rows for training.
#
# random_state is fixed so the split is identical on every run. Without it the
# contents of train.csv / val.csv change each time the notebook is executed,
# which makes evaluation numbers from different runs incomparable.
train_df, val_df = train_test_split(chaii_df, test_size=0.2, random_state=0)
train_df = pd.concat([train_df, mlqa_df, xquad_df], axis=0, ignore_index=True)

In [None]:
# Write both splits to disk without the pandas index column. The file names
# match CFG.train_file and CFG.validation_file.
for split_frame, csv_name in ((train_df, "train.csv"), (val_df, "val.csv")):
    split_frame.to_csv(csv_name, index=False)

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Fetch the API key stored under the Kaggle secret named "wandb" and log in.
wandb_api_key = UserSecretsClient().get_secret("wandb")
wandb.login(key=wandb_api_key)

# Name of the W&B project, exposed through the environment.
os.environ.update(WANDB_PROJECT="chaii-qa")

In [None]:
class CFG:
    """Static settings for the question-generation fine-tuning run."""

    # --- run identity -------------------------------------------------------
    run_name = "mt5-base-q-gen-chaii-mlqa-xquad-hindi"
    seed = 0
    output_dir = "output"

    # --- model and data -----------------------------------------------------
    model_name_or_path = "google/mt5-base"
    train_file = "train.csv"
    validation_file = "val.csv"
    text_column = "context"        # column fed to the model as input
    summary_column = "question"    # column the model learns to generate
    source_prefix = "question"     # text placed in front of every context

    # --- tokenisation -------------------------------------------------------
    max_source_length = 1024       # max_length used when tokenising contexts
    max_target_length = 128        # max_length for training targets
    val_max_target_length = 128    # max_length for validation targets
    pad_to_max_length = False      # False: no padding at tokenisation time
    ignore_pad_token_for_loss = True

    # --- generation ---------------------------------------------------------
    num_beams = 4

    # --- optimisation -------------------------------------------------------
    per_device_train_batch_size = 1
    per_device_eval_batch_size = 1
    gradient_accumulation_steps = 2
    num_train_epochs = 5
    learning_rate = 3e-4
    weight_decay = 0.01
    warmup_ratio = 0.1

    # --- evaluation, logging, checkpoints -----------------------------------
    evaluation_strategy = "steps"
    eval_steps = 75
    logging_steps = 20
    save_total_limit = 2

In [None]:
set_seed(CFG.seed)

# Collect the configured split files. The loader name passed to load_dataset
# is the file extension of the last configured file ("csv" here).
data_files = {}
for split_name, split_path in (
    ("train", CFG.train_file),
    ("validation", CFG.validation_file),
):
    if split_path is None:
        continue
    data_files[split_name] = split_path
    extension = split_path.split(".")[-1]

raw_datasets = load_dataset(extension, data_files=data_files)

# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

In [None]:
# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(CFG.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(
    CFG.model_name_or_path,
    config=config,
)

# Make the embedding matrix size agree with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

if model.config.decoder_start_token_id is None:
    raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

# Task prefix that preprocess_function prepends to every context.
# The ": " separator is added here so training inputs read "question: <context>",
# which is the format the inference helper builds; previously the prefix was
# glued directly onto the context ("question<context>"), so the model was
# trained and queried with different prompts.
prefix = f"{CFG.source_prefix}: " if CFG.source_prefix else ""

In [None]:
# Tokenisation settings shared by the train and validation passes.

# All raw columns are removed when the datasets are mapped, so record their
# names up front.
column_names = raw_datasets["train"].column_names

# Columns holding the model input and the generation target.
text_column, summary_column = CFG.text_column, CFG.summary_column

# preprocess_function reads this global when it runs. It starts at the training
# value and is reassigned before the validation split is tokenised.
max_target_length = CFG.max_target_length

# Pad to the fixed maximum at tokenisation time only when configured to.
if CFG.pad_to_max_length:
    padding = "max_length"
else:
    padding = False


def preprocess_function(examples):
    """Tokenize a batch of contexts and questions for seq2seq training.

    Args:
        examples: Batch mapping column names to lists of values; the
            ``text_column`` and ``summary_column`` entries are read.

    Returns:
        The tokenizer output for the prefixed contexts, with the tokenized
        question ids added under ``"labels"``.
    """
    sources = examples[text_column]
    targets = examples[summary_column]

    # Inputs: the module-level prefix followed by the context, truncated to
    # CFG.max_source_length.
    prefixed_sources = [prefix + source for source in sources]
    model_inputs = tokenizer(
        prefixed_sources,
        max_length=CFG.max_source_length,
        padding=padding,
        truncation=True,
    )

    # Targets are tokenized in target mode; max_target_length is the
    # module-level value at the time of the call.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            targets,
            max_length=max_target_length,
            padding=padding,
            truncation=True,
        )

    # With fixed-length padding, pad positions in the labels become -100 so
    # they are ignored by the loss.
    if CFG.ignore_pad_token_for_loss and padding == "max_length":
        target_encoding["input_ids"] = [
            [(-100 if token_id == tokenizer.pad_token_id else token_id) for token_id in sequence]
            for sequence in target_encoding["input_ids"]
        ]

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
def _tokenize_split(split_name, progress_label):
    """Run preprocess_function over one raw split, dropping the raw columns."""
    return raw_datasets[split_name].map(
        preprocess_function,
        batched=True,
        remove_columns=column_names,
        desc=progress_label,
    )


# Training split: tokenised with the training target length set earlier.
train_dataset = _tokenize_split("train", "Running tokenizer on train dataset")

# Validation split: switch the module-level target length first, because
# preprocess_function reads it when it runs.
max_target_length = CFG.val_max_target_length
eval_dataset = _tokenize_split("validation", "Running tokenizer on validation dataset")

In [None]:
# Data collator. Labels are padded with -100 when pad positions should be
# ignored by the loss, otherwise with the tokenizer's pad id.
if CFG.ignore_pad_token_for_loss:
    label_pad_token_id = -100
else:
    label_pad_token_id = tokenizer.pad_token_id

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
)

# ROUGE, used by compute_metrics to score generated questions.
metric = load_metric("rouge")

def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Each string is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists with the cleaned strings.
    """

    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(pred) for pred in preds]
    cleaned_labels = [_one_sentence_per_line(label) for label in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Decode generated ids and references and score them with ROUGE.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair; ``predictions`` may be
            a tuple, in which case its first element is used.

    Returns:
        Dict with one F-measure (scaled to 0-100) per ROUGE key returned by
        the metric, plus ``gen_len``, the mean number of non-pad tokens per
        prediction. All values are rounded to four decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id and cannot be decoded. It is the label pad
    # value used by the collator and may also appear in predictions when
    # batches of different lengths are padded together, so map it back to the
    # pad token in both arrays before decoding. This also keeps -100 padding
    # from being counted as generated tokens in gen_len below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {name: round(score, 4) for name, score in result.items()}

In [None]:
# Training configuration, taken field by field from CFG.
#
# `seed` is passed explicitly: the Trainer seeds itself from
# `training_args.seed` (documented default 42), so without this argument
# CFG.seed would not control the training run.
#
# NOTE(review): `predict_with_generate` is left at its default, so the Trainer
# below is built with compute_metrics=None and periodic evaluation reports
# loss only. Enabling it would run generation over the validation set every
# CFG.eval_steps steps -- confirm the time budget before turning it on.
training_args = Seq2SeqTrainingArguments(
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    num_train_epochs=CFG.num_train_epochs,
    output_dir=CFG.output_dir,
    evaluation_strategy=CFG.evaluation_strategy,
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    warmup_ratio=CFG.warmup_ratio,
    logging_steps=CFG.logging_steps,
    save_total_limit=CFG.save_total_limit,
    eval_steps=CFG.eval_steps,
    run_name=CFG.run_name,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    seed=CFG.seed,
)

In [None]:
# Build the trainer. Each dataset is attached only when its phase is enabled,
# and compute_metrics only when evaluation generates text.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=(train_dataset if training_args.do_train else None),
    eval_dataset=(eval_dataset if training_args.do_eval else None),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=(compute_metrics if training_args.predict_with_generate else None),
)

# Fine-tune, then persist the model, the training metrics and the trainer state.
if training_args.do_train:
    train_result = trainer.train()
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics
    metrics.update(train_samples=len(train_dataset))

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

# Evaluation
results = {}
if training_args.do_eval:
    print("*** Evaluate ***")

    # Run the evaluation loop and report ITS metrics. Previously no evaluation
    # was run here: the dict left over from training was relabelled and
    # saved as "eval" (and the name was undefined if training was skipped).
    metrics = trainer.evaluate(
        max_length=CFG.val_max_target_length,
        num_beams=CFG.num_beams,
        metric_key_prefix="eval",
    )
    metrics["eval_samples"] = len(eval_dataset)

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

In [None]:
import gc

# Drop every module-level reference created for training before the saved
# checkpoint is reloaded for inference. The model is referenced by `model`
# and by `data_collator` (built with model=model) as well as by the trainer,
# so all three must go or the fine-tuned weights stay in memory while the
# second copy is loaded.
del trainer
del data_collator
del model
del train_dataset
del eval_dataset

gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import MT5ForConditionalGeneration

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Reload the checkpoint stored in CFG.output_dir and switch to inference mode.
model = MT5ForConditionalGeneration.from_pretrained(CFG.output_dir)
model = model.to(device)

model.eval()

def show_context_and_question(context, model):
    """Generate one question for ``context`` with the fine-tuned model.

    The prompt is built from the module-level ``prefix`` -- the same string
    ``preprocess_function`` prepends during training -- so the model is
    queried in the format it was trained on. The input is truncated to
    ``CFG.max_source_length`` as in training, and generation uses
    ``CFG.num_beams`` and ``CFG.val_max_target_length`` rather than the
    library's default generation settings.

    Args:
        context: Passage to ask a question about.
        model: Seq2seq model used for generation.

    Returns:
        A ``(context, question)`` tuple; ``question`` is the decoded text with
        special tokens and a leading ``<extra_id_0>`` sentinel removed.
    """
    prompt = prefix + context

    encoded = tokenizer(
        prompt,
        max_length=CFG.max_source_length,
        truncation=True,
        return_tensors="pt",
    )
    # Put the inputs on whichever device the given model lives on.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        generated = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            num_beams=CFG.num_beams,
            max_length=CFG.val_max_target_length,
        )

    # Let the tokenizer drop pad/eos tokens instead of matching their text.
    question = tokenizer.decode(generated[0], skip_special_tokens=True).strip()

    # Remove a leading sentinel explicitly in case the tokenizer does not
    # treat it as a special token.
    sentinel = "<extra_id_0>"
    if question.startswith(sentinel):
        question = question[len(sentinel):].lstrip()

    return (context, question)

In [None]:
# Print generated questions for up to five validation contexts. Slicing is
# used instead of indexing 0..4 so a validation set with fewer than five rows
# does not raise IndexError.
for sample_context in val_df["context"].values[:5]:
    ctx, q = show_context_and_question(sample_context, model)
    print("CONTEXT:", ctx, "\n\n", "QUESTION:", q, "\n\n", "*"*100, "\n")

### It might not be great, but I'm sure there are people out there who can make this a lot better! The concept is really what I'm trying to get across. 🔥