# Group 4 Indian Language Q&A (Tamil, Hindi)

With nearly 1.4 billion people, India is the second-most populated country in the world. Yet Indian languages, like Hindi and Tamil, are underrepresented on the web. Popular Natural Language Understanding (NLU) models perform worse with Indian languages compared to English, the effects of which lead to subpar experiences in downstream web applications for Indian users.

# Import packages

In [None]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from termcolor import colored

from pathlib import Path
from tqdm.auto import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, TrainingArguments, Trainer, default_data_collator

from datasets import Dataset

In [None]:
# IPython shell escape: list the contents of the competition input directory.
!ls ../input/chaii-hindi-and-tamil-question-answering

In [None]:
# Directory holding the competition CSV files.
root = Path("../input/chaii-hindi-and-tamil-question-answering/")

# Read both splits with the same reader settings.
train_df, test_df = (
    pd.read_csv(root / csv_name, encoding="utf8")
    for csv_name in ("train.csv", "test.csv")
)

# Report the number of rows in each split.
print("Total training data: ", len(train_df))
print("Total test data: ", len(test_df))

Note that with so little training data (~1000 samples), adding the right external data could be key to improving model performance.

# Understanding the dataset

In [None]:
# Display the training frame loaded from train.csv.
train_df

In [None]:
# Display the test frame loaded from test.csv.
test_df

Therefore, for each ID in the test set, we must predict the string that best answers the provided question based on the context.

In [None]:
# Pie chart of how the training rows are distributed across languages.
value_counts = train_df.language.value_counts()
labels = list(value_counts.index)
plt.pie(value_counts, labels=labels, autopct='%1.2f%%')
plt.show()

In [None]:
# Exclusive end offset of each answer: its start offset plus the answer's
# length in characters.
train_df['answer_end'] = train_df['answer_start'] + train_df['answer_text'].str.len()
train_df

## Exploring Question and answers

In [None]:
# Number of distinct questions in the training set.
len(train_df["question"].drop_duplicates())

In [None]:
# Number of distinct contexts in the training set.
len(train_df["context"].drop_duplicates())

In [None]:
def color_answer(question):
    """Return a row's context with the answer span highlighted for the terminal.

    Args:
        question: A row (or any mapping) providing "context", "answer_start"
            and "answer_end". "answer_end" is treated as an exclusive offset,
            matching how the notebook derives it
            (answer_start + len(answer_text)).

    Returns:
        The context as one string: text before the answer in white, the
        answer on a red background, and the text after it in white.
    """
    answer_start, answer_end = question["answer_start"], question["answer_end"]
    context = question["context"]
    # answer_end is already exclusive, so slicing to answer_end + 1 would
    # highlight one character beyond the answer.
    return (
        colored(context[:answer_start], "white")
        + colored(context[answer_start:answer_end], "white", "on_red")
        + colored(context[answer_end:], "white")
    )

In [None]:
# Take independent copies of the per-language rows: assigning a new column to
# a filtered slice of train_df triggers pandas' SettingWithCopyWarning.
tamil_df = train_df[train_df.language == 'tamil'].copy()
hindi_df = train_df[train_df.language == 'hindi'].copy()

# Context length in characters, computed column-wise instead of row by row.
tamil_df['con_len'] = tamil_df['context'].str.len()
hindi_df['con_len'] = hindi_df['context'].str.len()

print("The context with minimum length in Tamil Language is:",min(tamil_df.con_len))
print("The context with minimum length in Hindi Language is:",min(hindi_df.con_len))

In [None]:
# Rows whose context is 446 characters long.
# NOTE(review): 446 is presumably the Tamil minimum printed by the previous cell — confirm.
tamil_df.loc[tamil_df['con_len'].eq(446)]

In [None]:
import numbers
# Count the Tamil answers that consist only of numeric characters.
count = sum(1 for answer in tamil_df['answer_text'] if answer.isnumeric())
count
    

In [None]:
import numbers
# Count the Hindi answers that consist only of numeric characters.
count = sum(1 for answer in hindi_df['answer_text'] if answer.isnumeric())
count

In [None]:
# Count the Tamil answers that contain no space, i.e. a single word.
count = sum(1 for answer in tamil_df['answer_text'] if " " not in answer)
print(count)
        

In [None]:
# Count the Hindi answers that contain no space, i.e. a single word.
count = sum(1 for answer in hindi_df['answer_text'] if " " not in answer)
print(count)

In [None]:
# Tamil rows whose answer is exactly "yes" (ஆமாம்).
tamil_df.loc[tamil_df['answer_text'].eq('ஆமாம்')]

In [None]:
# Tamil rows whose answer is exactly "no" (இல்லை).
tamil_df.loc[tamil_df['answer_text'].eq('இல்லை')]

In [None]:
# Hindi rows whose answer is exactly "yes" (हां).
hindi_df.loc[hindi_df['answer_text'].eq('हां')]

In [None]:
# Hindi rows whose answer is exactly "no" (नहीं).
hindi_df.loc[hindi_df['answer_text'].eq("नहीं")]

In [None]:
# Rows whose context is 176 characters long.
# NOTE(review): 176 is presumably the Hindi minimum printed earlier — confirm.
hindi_df.loc[hindi_df['con_len'].eq(176)]

In [None]:
# Print one training example (row position 215) with its answer highlighted
# inside the context.
sample_qa_pair = train_df.iloc[215]
print(sample_qa_pair["question"])
print("Answer", sample_qa_pair["answer_text"])
print("Context:", color_answer(sample_qa_pair), sep="\n")

In [None]:
# Print one training example (row position 420) with its answer highlighted
# inside the context.
sample_qa_pair = train_df.iloc[420]
print(sample_qa_pair["question"])
print("Answer", sample_qa_pair["answer_text"])
print("Context:", color_answer(sample_qa_pair), sep="\n")

## Towards the XLM-Roberta model

Essentially, our goal is question answering: extracting the answer to a question from a given context. Multilingual Transformer models pre-trained on SQuAD data are completely dominating the competition. Therefore, we look towards fine-tuning the XLM-Roberta model.

> ## Tokenizer

Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers Tokenizer which will tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.

In [None]:
# Local checkpoint directory; the path suggests an XLM-RoBERTa-large model
# fine-tuned on SQuAD 2.0 (NOTE(review): confirm against the dataset contents).
model_name = '../input/chaii-pretrained-models/models/deepset/xlm-roberta-large-squad2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Quick sanity check on a single English question/context pair.
question = 'Why is model conversion important?'
text = 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
encoding = tokenizer(question, text, return_tensors='pt')
print(encoding)

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# The first two model outputs are taken as the start and end scores.
model_outputs = model(input_ids, attention_mask=attention_mask, output_attentions=False)
start_scores, end_scores = model_outputs[:2]

# Take the tokens between the highest-scoring start and end positions
# (inclusive) and decode them back to text.
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
span_start = np.argmax(start_scores.detach())
span_stop = np.argmax(end_scores.detach()) + 1
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(all_tokens[span_start:span_stop]))
print("\nAnswer from model: " + answer)

In [None]:
# Number of token ids the tokenizer produces for each context on its own.
train_df['num_tokens_context'] = [
    len(tokenizer(context_text)['input_ids']) for context_text in train_df['context']
]

We've been warned that the context length exceeds the maximum token length of the model, so we have to divide it before processing.

In [None]:
# Histogram of context token counts; the trailing semicolon suppresses the
# cell's textual output.
train_df['num_tokens_context'].hist();

> ## Preparing the dataset

In [None]:

# Passed to the tokenizer as max_length: size of every tokenized feature.
max_length = 384 
# Passed to the tokenizer as stride when a long context overflows into
# several features.
doc_stride = 128 
# True when the tokenizer pads on the right; the feature-preparation functions
# use it to choose question-first or context-first ordering.
pad_on_right = tokenizer.padding_side == "right" 

In [None]:


def prepare_train_features(examples):
    """Tokenize a batch of examples into training features with answer labels.

    Reads the "question", "context" and "answers" columns of ``examples``.
    Each question/context pair is tokenized with overflow enabled, so one
    example may yield several features. For every feature the token indices of
    the answer are stored in "start_positions" / "end_positions"; if the
    example has no answer, or the answer is not fully inside the feature's
    context window, both are set to the index of the CLS token.

    Relies on the module-level ``tokenizer``, ``max_length``, ``doc_stride``
    and ``pad_on_right``. Note that ``examples["question"]`` is overwritten
    with left-stripped questions.

    Returns:
        The tokenizer output, without the overflow/offset mappings, extended
        with "start_positions" and "end_positions".
    """
    # Drop leading whitespace from the questions before tokenizing.
    examples["question"] = [q.lstrip() for q in examples["question"]]
    # Argument order and truncation side follow the tokenizer's padding side,
    # so that only the context is ever truncated.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Feature index -> index of the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) offsets; only needed to compute labels.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        # The CLS position is the label used for "answer not in this feature".
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            # No answer provided for this example.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Character span of the first answer (end is exclusive).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # First token that belongs to the context sequence.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            # Last token that belongs to the context sequence.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                # The answer is not fully covered by this window.
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Advance past the last token starting at or before the answer
                # start, then step back one to land on it.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                # Move back past the first token ending at or after the answer
                # end, then step forward one to land on it.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
#Helper function
def convert_answers(r):
    """Build an answers dict from an (answer_start, answer_text) pair.

    Args:
        r: An iterable whose first two items are the answer's start offset and
            the answer text, e.g. a DataFrame row passed by
            ``apply(..., axis=1)``, a tuple or a list. Further items are
            ignored.

    Returns:
        ``{'answer_start': [start], 'text': [text]}``.
    """
    # Unpack by iteration rather than r[0] / r[1]: integer keys on a
    # label-indexed Series are deprecated positional lookups in pandas.
    start, text, *_ = r
    return {
        'answer_start': [start],
        'text': [text]
    }

# Shuffle all rows with a fixed seed so the split taken afterwards is reproducible.
train_df=train_df.sample(frac=1,random_state=42)     #shuffling the dataset
# Pack each row's answer_start / answer_text into the dict built by convert_answers.
train_df['answers'] = train_df[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

In [None]:
# Hold out the last 64 rows for validation; everything before them is used for
# training.
df_train = train_df.iloc[:-64].reset_index(drop=True)
df_valid = train_df.iloc[-64:].reset_index(drop=True)

# Wrap both frames as Dataset objects for processing.
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_valid)
)

In [None]:
# Inspect the first training example.
train_dataset[0]

We use the `map` method to apply `prepare_train_features`, which we defined earlier, to every example.

In [None]:
# Tokenize both splits into model-ready features. The original columns are
# removed because one example can expand into several features.
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
# Remove the validation split's own columns instead of borrowing the training
# split's column list.
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=valid_dataset.column_names)

In [None]:
# Display the tokenized training dataset.
tokenized_train_ds

This is a tokenized dataset containing the following features:

* attention_mask
* end_positions
* input_ids
* start_positions

That concludes data preparation

> ## Preparing the model

In [None]:
# Instantiate the question-answering model from the checkpoint at model_name.
# This rebinds `model`, which an earlier cell already loaded from the same path.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# IPython magic: set the WANDB_DISABLED environment variable before training.
%env WANDB_DISABLED=True

In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 4

# Fine-tuning configuration: one epoch, evaluating and saving at the end of it.
args = TrainingArguments(
    output_dir="chaii-qa",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,
)

In [None]:
# Bundle the model, configuration and tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    data_collator=default_data_collator,  # batches the processed examples together
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning, then save the resulting model under "chaii_xlm".
trainer.train()
trainer.save_model("chaii_xlm") 

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of examples into features for prediction.

    Reads the "question", "context" and "id" columns of ``examples``. A long
    context overflows into several features. Every feature records the id of
    its source example under "example_id", and its "offset_mapping" keeps
    offsets only for context tokens; all other positions become ``None``.

    Relies on the module-level ``tokenizer``, ``max_length``, ``doc_stride``
    and ``pad_on_right``. ``examples["question"]`` is overwritten with
    left-stripped questions.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # The padding side decides which text goes first and which one may be truncated.
    if pad_on_right:
        first_column, second_column, truncation_mode = "question", "context", "only_second"
    else:
        first_column, second_column, truncation_mode = "context", "question", "only_first"

    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was produced from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Sequence id carried by context tokens.
    context_index = 1 if pad_on_right else 0

    example_ids = []
    for feature_idx, feature_offsets in enumerate(tokenized_examples["offset_mapping"]):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        example_ids.append(examples["id"][sample_mapping[feature_idx]])
        # Blank out offsets of tokens that are not part of the context.
        tokenized_examples["offset_mapping"][feature_idx] = [
            span if sequence_id == context_index else None
            for sequence_id, span in zip(sequence_ids, feature_offsets)
        ]
    tokenized_examples["example_id"] = example_ids

    return tokenized_examples

In [None]:
# Build prediction features for the validation split. Remove the validation
# split's own columns instead of borrowing the training split's column list.
validation_features = valid_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=valid_dataset.column_names
)
# Display the untokenized validation dataset.
valid_dataset

In [None]:
# Copy of the validation features without the bookkeeping columns that are
# only needed for post-processing.
valid_feats_small = validation_features.remove_columns(['example_id', 'offset_mapping'])
valid_feats_small


In [None]:
# Run the model over the validation features; the result's `.predictions`
# is what gets passed on to post-processing.
raw_predictions = trainer.predict(valid_feats_small)

In [None]:
# Maximum answer length.
max_answer_length = 30

import collections

examples = valid_dataset
features = validation_features

# Example id -> position of that example in the validation dataset.
example_id_to_index = {
    example_id: position for position, example_id in enumerate(examples["id"])
}

# Example position -> indices of all features produced from that example.
features_per_example = collections.defaultdict(list)
for feature_position, source_id in enumerate(features["example_id"]):
    features_per_example[example_id_to_index[source_id]].append(feature_position)

### Postprocessing

In [None]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn raw start/end logits into one predicted answer string per example.

    Args:
        examples: Original examples; read for "id" and "context".
        features: Tokenized features; read for "example_id" and
            "offset_mapping", where ``None`` offsets mark positions that may
            not be part of an answer.
        raw_predictions: Pair ``(all_start_logits, all_end_logits)``, each
            indexed by feature position.
        n_best_size: Number of top-scoring start and end positions combined
            per feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        An ``OrderedDict`` mapping each example id to the text of its
        best-scoring span, or "" when no valid span exists.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature indices by the example they were produced from.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        valid_answers = []

        # The unused CLS / null-score bookkeeping was removed: its result was
        # never read, and it needed the global tokenizer.
        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            # Indices of the n_best_size highest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the offsets or not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip reversed spans and spans longer than allowed.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first best-scoring candidate, the same element
            # a stable descending sort would put first.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions

In [None]:
# Map every validation example id to its predicted answer text.
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

In [None]:
# Ground-truth table: one row per validation example with its first answer text.
references = [
    {"id": example_id, "answer": answer["text"][0]}
    for example_id, answer in zip(valid_dataset["id"], valid_dataset["answers"])
]
result = pd.DataFrame(references)

In [None]:
def jaccard(row):
    """Return the word-level Jaccard similarity of a pair of strings.

    Args:
        row: An iterable whose first two items are the strings to compare,
            e.g. a DataFrame row passed by ``apply(..., axis=1)`` or a tuple.
            Further items are ignored.

    Returns:
        ``|A & B| / |A | B|`` as a float, where A and B are the sets of
        lower-cased, whitespace-separated tokens of the two strings. Two
        strings without any tokens are treated as identical and score 1.0.
    """
    # Unpack by iteration rather than row[0] / row[1]: integer keys on a
    # label-indexed Series are deprecated positional lookups in pandas.
    str1, str2, *_ = row
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both token sets are empty; avoid dividing by zero.
        return 1.0
    return float(len(c)) / union_size

In [None]:
# Attach each example's predicted text, then score it against the reference.
result['prediction'] = [final_predictions[example_id] for example_id in result['id']]
result['jaccard'] = result[['answer', 'prediction']].apply(jaccard, axis=1)
result


In [None]:
# Mean Jaccard score over the validation examples.
result.jaccard.mean()

In [None]:
# Number of examples in the validation dataset.
len(valid_dataset)