In [None]:
# Mount Google Drive at /content/drive so that later cells can read the CSV
# files stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [None]:
import os
import torch
import pandas as pd
import numpy as np
import transformers
import collections
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,TrainingArguments, Trainer,default_data_collator
from tqdm.auto import tqdm
from tqdm.notebook import tqdm
from datasets import Dataset
from preprocess import convert_answers, prepare_train_features, prepare_test_features
from postprocess import postprocess_qa_predictions
from metric import jaccard

In [None]:
# Read the four CSV files from the project folder on Drive, then append the
# xquad and mlqa rows to the training frame.
DATA_DIR = '/content/drive/MyDrive/Hindi-Tamil_Question_Answering_System'
train, test, mlqa, xquad = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'mlqa_hindi', 'xquad')
)
# ignore_index=True renumbers the combined rows 0..n-1.
train = pd.concat([train, xquad, mlqa], ignore_index=True)
train.head()

In [None]:
# Load the tokenizer that belongs to the pretrained QA checkpoint.
checkpoint = "deepset/xlm-roberta-large-squad2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Shuffle every row with a fixed seed, then derive the `answers` column by
# applying `convert_answers` to each row's (answer_start, answer_text) pair.
train = (
    train
    .sample(frac=1, random_state=42)
    .assign(
        answers=lambda shuffled: shuffled[['answer_start', 'answer_text']].apply(
            convert_answers, axis=1
        )
    )
)

In [None]:
# Hold out the last 100 rows of the shuffled frame for validation and train on
# everything before them. Each split gets a fresh 0..n-1 index before being
# converted to a `Dataset`.
n_valid = 100
df_train = train.iloc[:-n_valid].reset_index(drop=True)
df_valid = train.iloc[-n_valid:].reset_index(drop=True)
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_valid)
)

In [None]:
# Run `prepare_train_features` over both splits in batches. The original
# columns are dropped so that only what the function returns is kept.
# Each split removes its *own* columns: the validation split previously reused
# the training split's column list, which only worked because both happen to
# share the same schema.
tokenized_train_ds = train_dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_valid_ds = valid_dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=valid_dataset.column_names,
)

In [None]:
# Load the pretrained question-answering model that the Trainer will fine-tune.
checkpoint = "deepset/xlm-roberta-large-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [None]:
%env WANDB_DISABLED=True
args = TrainingArguments(
    f"chaii-qa",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [None]:
# Wire the model, training arguments and tokenized splits into a Trainer.
# Batches are assembled with the library's default collator.
data_collator = default_data_collator
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()

In [None]:
# Optional: uncomment the next line to save the fine-tuned model to Google Drive.
# trainer.save_model("/content/drive/MyDrive/Hindi-Tamil_Question_Answering_System/xlmroberta-squad-fine")

In [None]:
# Build inference features for the test set. The `answers` column is added so
# the reference answers travel with `test_dataset` and can be read back when
# scoring.
test['answers'] = test[['answer_start', 'answer_text']].apply(convert_answers, axis=1)
test_dataset = Dataset.from_pandas(test)
test_features = test_dataset.map(
    prepare_test_features,
    batched=True,
    remove_columns=test_dataset.column_names,
)
# `example_id` and `offset_mapping` stay on `test_features` (they are needed
# for post-processing) but are dropped from the copy passed to
# `trainer.predict`. `remove_columns` returns a new dataset without them and
# avoids running an identity `map` over every feature just to drop columns.
test_f = test_features.remove_columns(['example_id', 'offset_mapping'])

In [None]:
# Run the trained model over the test features; the raw outputs are
# post-processed into answer strings in a later cell.
raw_predictions = trainer.predict(test_f)

In [None]:
examples = test_dataset
features = test_features

# Position of each example id within `examples`.
example_id_to_index = {ex_id: pos for pos, ex_id in enumerate(examples["id"])}

# For every example position, collect the indices of the features whose
# `example_id` refers to that example.
features_per_example = collections.defaultdict(list)
for feat_idx, feat_example_id in enumerate(features["example_id"]):
    features_per_example[example_id_to_index[feat_example_id]].append(feat_idx)

In [None]:
# Turn the raw model outputs into final predictions using the original
# examples and their features. The result is later looked up by example id.
final_predictions = postprocess_qa_predictions(test_dataset, test_features, raw_predictions.predictions)

In [None]:

# Reference answers: one record per test example, keeping only the first
# answer text of each.
references = [
    {"id": ex_id, "answer": answers['text'][0]}
    for ex_id, answers in zip(test_dataset["id"], test_dataset["answers"])
]

In [None]:
# Assemble a per-example results table: reference answer, predicted answer
# (looked up by example id) and the Jaccard score between the two.
res = (
    pd.DataFrame(references)
    .assign(prediction=lambda df: [final_predictions[ex_id] for ex_id in df['id']])
    .assign(jaccard=lambda df: df[['answer', 'prediction']].apply(jaccard, axis=1))
)
res

In [None]:
# Report the mean of the per-example Jaccard scores.
mean_jaccard = res['jaccard'].mean()
print("The average Jaccard Score is ", mean_jaccard)