In [None]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from termcolor import colored

from pathlib import Path
from tqdm.auto import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
!ls ../input/chaii-hindi-and-tamil-question-answering

In [None]:
# Directory holding the competition CSV files.
root = Path("../input/chaii-hindi-and-tamil-question-answering/")

# Read the train and test splits the same way (UTF-8 text).
train_df, test_df = (
    pd.read_csv(root / csv_name, encoding="utf8")
    for csv_name in ("train.csv", "test.csv")
)
print("Total training data: ", len(train_df))
print("Total test data: ", len(test_df))

In [None]:
train_df

In [None]:
test_df

### Exploring the data

In [None]:
# Number of training rows per language, drawn as a pie chart.
value_counts = train_df['language'].value_counts()
labels = list(value_counts.index)
plt.pie(
    value_counts,
    labels=labels,
    autopct='%1.2f%%',
)
plt.show()

In [None]:
# Exclusive end offset of each answer inside its context (start + answer length).
# Computed column-wise rather than row-by-row with iterrows().
train_df['answer_end'] = train_df['answer_start'] + train_df['answer_text'].str.len()
train_df

## Visualising Question and answers

In [None]:
len(train_df.question.unique())

In [None]:
len(train_df.context.unique())

In [None]:
def color_answer(question):
    """Return the context of a QA row with the answer span highlighted.

    Parameters
    ----------
    question : mapping (e.g. one row of ``train_df``)
        Must provide ``"context"``, ``"answer_start"`` and ``"answer_end"``.
        ``answer_end`` is an exclusive character offset, matching how this
        notebook computes it (``answer_start + len(answer_text)``).

    Returns
    -------
    str
        The context as three ``colored`` segments: text before the answer,
        the answer on a red background, and the text after it.
    """
    answer_start, answer_end = question["answer_start"], question["answer_end"]
    context = question["context"]
    # answer_end is exclusive, so slice up to it directly; slicing to
    # answer_end + 1 would also highlight the character after the answer.
    return (
        colored(context[:answer_start], "white")
        + colored(context[answer_start:answer_end], "white", "on_red")
        + colored(context[answer_end:], "white")
    )

In [None]:
# Per-language frames with the character length of every context.
# ``.copy()`` gives each language its own frame, so adding ``con_len`` does not
# write into a filtered slice of ``train_df`` (pandas' SettingWithCopyWarning).
tamil_df = train_df[train_df.language == 'tamil'].copy()
tamil_df['con_len'] = tamil_df['context'].str.len()
hindi_df = train_df[train_df.language == 'hindi'].copy()
hindi_df['con_len'] = hindi_df['context'].str.len()
print("The context with minimum length in Tamil Language is:", tamil_df.con_len.min())
print("The context with minimum length in Hindi Language is:", hindi_df.con_len.min())

In [None]:
# Row(s) with the shortest Tamil context, looked up from the computed minimum
# rather than a hard-coded length so the cell stays correct if the data changes.
tamil_df[tamil_df.con_len == tamil_df.con_len.min()]

In [None]:
# Row(s) with the shortest Hindi context, looked up from the computed minimum
# rather than a hard-coded length so the cell stays correct if the data changes.
hindi_df[hindi_df.con_len == hindi_df.con_len.min()]

In [None]:
# Show one training example: question, gold answer, then the highlighted context.
sample_qa_pair = train_df.iloc[321]
for line in (
    sample_qa_pair["question"],
    f"Answer {sample_qa_pair['answer_text']}",
    "Context:",
):
    print(line)
print(color_answer(sample_qa_pair))

In [None]:
# Show another training example: question, gold answer, then the highlighted context.
sample_qa_pair = train_df.iloc[678]
for line in (
    sample_qa_pair["question"],
    f"Answer {sample_qa_pair['answer_text']}",
    "Context:",
):
    print(line)
print(color_answer(sample_qa_pair))

## Building Baseline XLM-Roberta model

In [None]:
#pip install transformers==4.11.3

In [None]:
import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

print(transformers.__version__)

> ##  Understanding the Tokenizer

In [None]:
# Local checkpoint used for both the model and its tokenizer.
model_name = '../input/chaii-pretrained-models/models/deepset/xlm-roberta-large-squad2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# A small English question/context pair to see what the tokenizer produces.
question = 'Why is model conversion important?'
text = 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
encoding = tokenizer(question, text, return_tensors='pt')

print(encoding)


In [None]:
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# The first two outputs of the QA model are the start and end logits.
start_scores, end_scores = model(input_ids, attention_mask=attention_mask, output_attentions=False)[:2]

# Most likely start / end token positions of the answer.
answer_start_token = int(start_scores.argmax())
answer_end_token = int(end_scores.argmax())

# Decode the answer token ids directly; there is no need to go through
# ids -> tokens -> joined string -> tokens -> ids before decoding.
answer = tokenizer.decode(input_ids[0][answer_start_token:answer_end_token + 1])
answer

In [None]:
tokenizer

In [None]:
# Dump the tokenizer vocabulary to disk, one token per line.
# The vocabulary is multilingual, so the file encoding is pinned to UTF-8 rather
# than left to the platform default, which can fail on non-ASCII tokens.
with open("vocabulary.txt", 'w', encoding="utf-8") as f:
    for token in tokenizer.vocab.keys():
        f.write(token + '\n')

In [None]:
#tokenizer.save_pretrained("/kaggle/working/")

In [None]:
#tokenizer.vocab

In [None]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [None]:
example=train_df.iloc[42]
print(example)
print("The length of tokenized data is :",len(tokenizer(example["question"], example["context"])["input_ids"]))

In [None]:
print("The length of truncated tokenized data is :",len(tokenizer(example["question"], example["context"], max_length=max_length, truncation="only_second")["input_ids"]))

In [None]:
pad_on_right=tokenizer.padding_side=='right'

In [None]:
# Tokenize our examples with truncation and padding, but keep the overflows using a stride.
tokenized_example=tokenizer(
    example["question" if pad_on_right else "context"],
    example["context" if pad_on_right else "question"],
    truncation="only_second" if pad_on_right else "only_first",
    max_length=max_length,
    stride=doc_stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
    )
print(tokenized_example)

In [None]:
[len(x) for x in tokenized_example["input_ids"]]

In [None]:
sequence_ids = tokenized_example.sequence_ids()
print(sequence_ids)

`sequence_ids()` returns `None` for the special tokens, and 0 or 1 depending on whether the corresponding token comes from the first sequence passed (the question) or the second (the context). With all of this, we can find the first and last token of the answer in one of our input features (or determine that the answer is not in that feature):

 > #### Now that we have understood how the tokenizer works, let's prepare our dataset so that it is in the right format for our model to process

## Dataset preparation

In [None]:
def convert_answers(df):
    """Wrap one row's answer in a SQuAD-style ``answers`` dictionary.

    ``df`` is a single row (anything indexable by ``'answer_start'`` and
    ``'answer_text'``). The result holds one-element lists under the keys
    ``'answer_start'`` and ``'text'``.
    """
    answers = {}
    answers['answer_start'] = [df['answer_start']]
    answers['text'] = [df['answer_text']]
    return answers

In [None]:
train_df=train_df.sample(frac=1,random_state=42)     #shuffling the dataset
train_df['answers']=train_df.apply(convert_answers,axis=1)

In [None]:
train_df

In [None]:
df_train=train_df[:-512].reset_index(drop=True)
df_valid=train_df[-512:].reset_index(drop=True)

In [None]:
#pip install datasets

In [None]:
from datasets import Dataset  

train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)

In [None]:
train_dataset[0]

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of examples and label each feature with its answer span.

    Relies on the module-level ``tokenizer``, ``pad_on_right``, ``max_length``
    and ``doc_stride``.

    Parameters
    ----------
    examples : batch of columns
        Must provide ``"question"``, ``"context"`` and ``"answers"``; each
        ``answers`` entry holds the lists ``"answer_start"`` and ``"text"``.
        ``examples["question"]`` is overwritten with left-stripped questions.

    Returns
    -------
    The tokenizer output with ``overflow_to_sample_mapping`` and
    ``offset_mapping`` removed and two lists added, ``start_positions`` and
    ``end_positions``: one token index per feature. Features that do not fully
    contain the answer are labelled with the index of the CLS token.
    """
    # Removing the whitespaces present in the left portion of text
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (end is exclusive).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # Each loop steps one token past the answer boundary, hence the -1 / +1 corrections below.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
train_dataset

In [None]:
features = prepare_train_features(train_dataset[:5])

In [None]:
tokenized_train_ds=train_dataset.map(prepare_train_features,batched=True,remove_columns=train_dataset.column_names)
tokenized_valid_ds=valid_dataset.map(prepare_train_features,batched=True,remove_columns=valid_dataset.column_names)

In [None]:
tokenized_train_ds

This is a tokenized dataset containing the following features:

- attention_mask
- end_positions
- input_ids
- start_positions

Even better, the results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to map has changed (and thus that the cached data must not be used). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files; you can pass load_from_cache_file=False in the call to map to ignore the cached files and force the preprocessing to be applied again.

Note that we passed batched=True to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently.

## Model Training

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer,default_data_collator

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
%env WANDB_DISABLED=True

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size=4

args = TrainingArguments(
    "chaii-qa",  # plain string: there are no placeholders, so no f-prefix is needed
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,)

In [None]:
from transformers import default_data_collator

trainer=Trainer(model,
                args,
                train_dataset=tokenized_train_ds,
                eval_dataset=tokenized_valid_ds,
                data_collator=default_data_collator,
                tokenizer=tokenizer,
               )

In [None]:
trainer.train()
trainer.save_model("chaii_baseline")

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of examples for inference, keeping what post-processing needs.

    Relies on the module-level ``tokenizer``, ``pad_on_right``, ``max_length``
    and ``doc_stride``.

    Parameters
    ----------
    examples : batch of columns
        Must provide ``"id"``, ``"question"`` and ``"context"``.
        ``examples["question"]`` is overwritten with left-stripped questions.

    Returns
    -------
    The tokenizer output with ``overflow_to_sample_mapping`` removed, an added
    ``example_id`` list (the id of the example each feature came from), and
    ``offset_mapping`` entries set to ``None`` for tokens outside the context.
    """
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
    # left whitespace.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    # Sequence id of the context tokens; it is the same for every feature, so compute it once.
    context_index = 1 if pad_on_right else 0

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples


In [None]:
validation_features = valid_dataset.map(
    prepare_validation_features,
    batched=True,
    # Drop the columns of the dataset being mapped, not those of the training
    # dataset (they only happen to be the same here).
    remove_columns=valid_dataset.column_names
)
valid_dataset


In [None]:
# Copy of the validation features without the bookkeeping columns that are only
# needed for post-processing. Dropping columns needs no identity map() over
# every row.
valid_feats_small = validation_features.remove_columns(['example_id', 'offset_mapping'])
valid_feats_small


In [None]:
raw_predictions = trainer.predict(valid_feats_small)

In [None]:
# Upper bound on the answer length; postprocess_qa_predictions uses the same value as its default.
max_answer_length = 30

# `collections` is also used by postprocess_qa_predictions, which is defined in a later cell.
import collections

examples = valid_dataset
features = validation_features

# Map each example id to its position in `examples`, then collect, for every example
# position, the indices of the features that came from it.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

## Postprocessing
Postprocessing converts the predictions of a question-answering model into answers that are substrings of the original contexts. The postprocessing code consists of nested loops. The outer loop goes over the examples and, for each one, collects the indices of its features and its context. The inner loop goes over each feature of the current example and takes the model's predictions for that feature, which are two arrays containing the start logits and the end logits, respectively. For SQuAD v2-style data, which contains unanswerable questions, a null answer is scored as the sum of the start logit and the end logit of the [CLS] token (the `min_null_score`, which starts as `None`); it matters only for such data. Any sensible combination of a start and an end position is a possible answer, scored as start_logit + end_logit: the higher the score, the higher the confidence that it is the best answer. A combination whose end token falls before its start token is excluded. Answers whose start or end token belongs to the question are also excluded, since the answer has to come from the context, not from the question. The number of best start and end candidates considered for each feature can be adjusted with the `n_best_size` argument; the code goes through all their combinations to find the best answer. Answers whose length is either less than 0 or greater than `max_answer_length` are not included, and out-of-scope answers are not considered either.

In [None]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Convert start/end logits into one answer string per example.

    Parameters
    ----------
    examples
        The original examples. Must support ``examples["id"]``, ``len()`` and
        iteration over rows providing ``"id"`` and ``"context"``.
    features
        The tokenized features. Must support ``len()``, integer indexing and
        iteration over rows providing ``"example_id"`` and ``"offset_mapping"``
        (``None`` for tokens that are not part of the context).
    raw_predictions
        Pair ``(all_start_logits, all_end_logits)``, both indexed by feature.
    n_best_size : int
        How many of the highest start and end logits to combine per feature.
    max_answer_length : int
        Maximum number of tokens an answer may span.

    Returns
    -------
    collections.OrderedDict
        Maps each example id to the text of its best-scoring span, or to ``""``
        when no valid span exists.
    """
    all_start_logits, all_end_logits = raw_predictions
    # Build a map from each example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionary we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        # Null-answer scoring (min_null_score, based on the CLS logits) only applies to
        # SQuAD v2-style data. Its result was never read here, so it is not computed:
        # the prediction is always the best-scoring span.
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map the positions in our logits to spans of text in the
            # original context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Go through all possibilities for the `n_best_size` greatest start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or
                    # correspond to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first of several equally scored answers, which is the
            # same answer a stable descending sort would put first.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case where there is not a single valid span, we create a fake
            # prediction to avoid failure.
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions


In [None]:
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

In [None]:
references = [{"id": ex["id"], "answer": ex["answers"]['text'][0]} for ex in valid_dataset]
result = pd.DataFrame(references)

In [None]:
def jaccard(row):
    """Word-level Jaccard similarity between the first two entries of ``row``.

    Parameters
    ----------
    row : pandas Series or sequence
        Two strings, read by position. Both are lower-cased and split on
        whitespace before comparison.

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|`` over the two token sets. When neither string
        contains a token, the sets are identical and 1.0 is returned rather
        than dividing by zero.
    """
    # Read by position explicitly: plain integer keys on a label-indexed
    # Series are deprecated in pandas.
    positional = row.iloc if hasattr(row, "iloc") else row
    str1, str2 = positional[0], positional[1]
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return float(len(c)) / union_size

In [None]:
result['prediction'] = result['id'].apply(lambda r: final_predictions[r])
result['jaccard'] = result[['answer', 'prediction']].apply(jaccard, axis=1)
result

In [None]:
result.jaccard.mean()

## Model Evaluation

In [None]:
test_dataset = Dataset.from_pandas(test_df)

In [None]:
test_features = test_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)

# Copy of the test features without the bookkeeping columns that are only needed
# for post-processing. Dropping columns needs no identity map() over every row.
test_feats_small = test_features.remove_columns(['example_id', 'offset_mapping'])
test_feats_small

In [None]:
test_predictions = trainer.predict(test_feats_small)

In [None]:
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

In [None]:
final_test_predictions = postprocess_qa_predictions(test_dataset, test_features, test_predictions.predictions)

In [None]:
# Reuse the competition data directory (`root`) instead of repeating the path.
sub_df = pd.read_csv(root / "sample_submission.csv")
sub_df.head(1)

In [None]:
sub_df['PredictionString'] = sub_df['id'].apply(lambda r: final_test_predictions[r])
sub_df.head()

In [None]:
sub_df.to_csv('submission.csv', index=False)