**NOTE:** This notebook was downloaded from Kaggle and is intended to be run as a Kaggle Kernel; the `../input/...` paths used below assume the corresponding Kaggle datasets are attached.

# 📦 Packages and Basic Setup

In [1]:
%%capture
# -------- Offline Installs -------- #
# All packages are installed from attached input directories rather than from PyPI.
# Remove the preinstalled fsspec first (presumably to avoid a version clash with
# the `datasets` wheels installed below -- TODO confirm).
!pip uninstall fsspec -qq -y
# Install transformers from the local source tree, without build isolation and
# without resolving dependencies.
!pip install -U --no-build-isolation --no-deps ../input/transformers-master/ -qq
# Install `datasets` using only the local wheel directory (`--no-index`).
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

In [2]:
%%capture
# -------- Basic Packages -------- #
import os
import gc
import sys
gc.enable()
import math
import time
import torch
import numpy as np
import pandas as pd
from string import punctuation
from sklearn import model_selection
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, SequentialSampler

# -------- Output Prettification ✨ -------- #
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from transformers import logging
# Only surface errors from transformers. A preceding set_verbosity_warning()
# call was removed: it was overridden immediately by this line and had no effect.
logging.set_verbosity_error()

# -------- Custom Library -------- #
wrapperdir = "../input/coffee"
sys.path.append(wrapperdir)
from coffee.dataloader import Dataset
from coffee.helpers import make_model
from coffee.data_utils import prepare_test_features
from coffee.utils import optimal_num_of_loader_workers

2021-10-24 04:52:37.821250: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# 📃 Configuration

In [3]:
# Single source of truth for model, tokenizer and run hyper-parameters.
# Several entries (training / optimizer / scheduler / W&B) are not read by the
# code visible in this notebook; they are kept as-is because the `coffee`
# helpers receive the whole dict.
CONFIG = {
    # --- Model ---
    "model_type": 'bert',
    "model_name_or_path": "../input/muril-large-pt/muril-large-cased",
    "config_name": "../input/muril-large-pt/muril-large-cased",
    "output_head_dropout_prob": 0.0,
    "gradient_accumulation_steps": 2,
    # --- Tokenizer ---
    "tokenizer_name": "../input/muril-large-pt/muril-large-cased",
    "max_seq_length": 400,
    "doc_stride": 135,
    # --- Training ---
    "epochs": 1,
    "folds": 4,
    "train_batch_size": 2,
    "eval_batch_size": 8,
    # --- Optimizer ---
    "optimizer_type": 'AdamW',
    "learning_rate": 1.5e-5,
    "weight_decay": 1e-2,
    "epsilon": 1e-8,
    "max_grad_norm": 1.0,
    # --- Scheduler ---
    "decay_name": 'cosine-warmup',
    "warmup_ratio": 0.1,
    "logging_steps": 100,
    # --- Misc ---
    "output_dir": 'output',
    "seed": 21,
    # --- W&B ---
    "competition": 'chaii',
    "_wandb_kernel": 'sauravm',
}

# 💿 Dataset

In [4]:
# Competition test set, plus the directory that holds the fine-tuned
# per-fold checkpoints used for inference further down.
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
base_model_path = '../input/bestmurilchaii/output/'

tokenizer = AutoTokenizer.from_pretrained(CONFIG["tokenizer_name"])

# `prepare_test_features` yields a collection of features for each test row;
# they are all gathered into one flat list, in row order.
test_features = []
for _, example_row in test.iterrows():
    test_features.extend(prepare_test_features(CONFIG, example_row, tokenizer))

args = CONFIG
test_dataset = Dataset(test_features, mode='test')

# Sequential sampling with drop_last=False keeps the batches (and therefore the
# logits collected from them) in the same order as `test_features`.
loader_options = dict(
    batch_size=CONFIG["eval_batch_size"],
    sampler=SequentialSampler(test_dataset),
    num_workers=optimal_num_of_loader_workers(),
    pin_memory=True,
    drop_last=False,
)
test_dataloader = DataLoader(test_dataset, **loader_options)

# 🔥 Inference

In [5]:
%%capture
def get_predictions(checkpoint_path):
    """Run one fine-tuned checkpoint over ``test_dataloader`` and collect its logits.

    Relies on the notebook globals ``CONFIG``, ``base_model_path`` and
    ``test_dataloader`` and requires a CUDA device.

    Args:
        checkpoint_path: Path of the state-dict file, relative to
            ``base_model_path`` (the two are concatenated as plain strings).

    Returns:
        Tuple ``(start_logits, end_logits)`` of float64 numpy arrays holding the
        per-batch model outputs stacked along the first axis, in dataloader
        order (presumably one row per feature and one column per token
        position -- verify against the model's forward).
    """
    config, tokenizer, model = make_model(CONFIG)
    model.cuda()

    # Deserialize onto the CPU so the checkpoint tensors are not placed on the
    # device they were saved from; load_state_dict copies them into the model's
    # own (GPU) parameters.
    state_dict = torch.load(base_model_path + checkpoint_path, map_location="cpu")
    model.load_state_dict(state_dict)
    del state_dict

    # Inference must run in eval mode so dropout-style layers are inactive.
    model.eval()

    start_logits = []
    end_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            outputs_start, outputs_end = model(
                batch['input_ids'].cuda(),
                batch['attention_mask'].cuda(),
            )
            start_logits.append(outputs_start.cpu().numpy())
            end_logits.append(outputs_end.cpu().numpy())
            del outputs_start, outputs_end

    # Drop the references so the model can be reclaimed before the next fold loads.
    del model, tokenizer, config
    gc.collect()

    # Cast to float64 to keep the dtype this function has always returned
    # (it previously round-tripped every batch through nested Python lists).
    return (
        np.vstack(start_logits).astype(np.float64),
        np.vstack(end_logits).astype(np.float64),
    )

# Ensemble: average the start/end logits over every fold checkpoint.
# The number of folds comes from CONFIG so the checkpoint list and the divisor
# can never disagree. Logits are accumulated as a running sum (same addition
# order as summing fold 0..N-1 left to right), so only one extra fold's arrays
# are alive at a time.
num_folds = CONFIG["folds"]
if num_folds < 1:
    raise ValueError(f"CONFIG['folds'] must be >= 1, got {num_folds}")

start_logits, end_logits = get_predictions('checkpoint-fold-0/pytorch_model.bin')
for fold in range(1, num_folds):
    fold_start_logits, fold_end_logits = get_predictions(f'checkpoint-fold-{fold}/pytorch_model.bin')
    start_logits = start_logits + fold_start_logits
    end_logits = end_logits + fold_end_logits
    del fold_start_logits, fold_end_logits

start_logits = start_logits / num_folds
end_logits = end_logits / num_folds

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn per-feature start/end logits into one answer string per example.

    For every example, each of its features contributes candidate spans built
    from the ``n_best_size`` highest start logits and ``n_best_size`` highest
    end logits. A candidate is kept only if both tokens belong to the context
    (``sequence_ids == 1``), the end is not before the start, and the span is at
    most ``max_answer_length`` tokens long. The candidate with the highest
    ``start_logit + end_logit`` wins; its text is sliced out of the example's
    context using the character offsets.

    Args:
        examples: DataFrame with at least ``id`` and ``context`` columns. Any
            index is accepted; rows are matched to features by position.
        features: Sequence of dicts with keys ``example_id``, ``sequence_ids``
            and ``offset_mapping``. The dicts are not modified.
        raw_predictions: Pair ``(all_start_logits, all_end_logits)``, each
            indexable by feature position and yielding per-token scores.
        n_best_size: How many top start / end positions to combine per feature.
        max_answer_length: Maximum span length, in tokens.

    Returns:
        ``collections.OrderedDict`` mapping example id to predicted answer text,
        in ``examples`` row order. Examples with no valid candidate map to ``""``.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature positions by the *position* of their example in `examples`.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Token-level sequence id that marks context (as opposed to question) tokens.
    context_index = 1

    # enumerate() gives the row position, which is what features_per_example is
    # keyed by; the label from iterrows() only equals it for a default RangeIndex.
    for example_position, (_, example) in enumerate(examples.iterrows()):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_position]:
            feature = features[feature_index]
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            # Mask offsets of non-context tokens locally instead of overwriting
            # the caller's feature dict.
            sequence_ids = feature["sequence_ids"]
            offset_mapping = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(feature["offset_mapping"])
            ]

            # Indices of the n_best_size largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the feature or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        # max() returns the first of equally-scored candidates, matching a
        # stable descending sort followed by [0].
        best_answer = max(
            valid_answers,
            key=lambda x: x["score"],
            default={"text": "", "score": 0.0},
        )

        predictions[example["id"]] = best_answer["text"]

    return predictions

# Decode the fold-averaged logits into one answer string per test question id.
predictions = postprocess_qa_predictions(test, test_features, (start_logits, end_logits))

Post-processing 5 example predictions split into 50 features.


In [7]:
# Characters / sequences that must not begin or end a final answer.
# Every entry in `bad_starts` is a single character; "..." is the only
# multi-character entry in `bad_endings`.
bad_starts = [".", ",", "(", ")", "-", "–", ";"]
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# Abbreviations whose trailing "." is removed by the punctuation stripping
# above and is put back when the context contains the dotted form.
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"


def _clean_prediction(pred, context):
    """Trim unwanted leading/trailing marks from one predicted answer.

    Args:
        pred: Predicted answer text (already whitespace-normalized).
        context: The passage the answer was extracted from.

    Returns:
        The cleaned answer. An empty ``pred`` is returned unchanged.
    """
    if pred == "":
        return pred

    starts = tuple(bad_starts)
    endings = tuple(bad_endings)

    # Peel off leading marks one character at a time.
    while pred.startswith(starts):
        pred = pred[1:]

    # Peel off trailing marks; an ellipsis is removed as a whole.
    while pred.endswith(endings):
        pred = pred[:-3] if pred.endswith("...") else pred[:-1]

    # Restore the final "." of a known abbreviation if the context has it.
    abbreviations = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)
    if pred.endswith(abbreviations) and pred + "." in context:
        pred = pred + "."

    return pred


# Collapse runs of whitespace and strip ASCII punctuation from both ends.
submission = [
    (example_id, " ".join(answer.split()).strip(punctuation))
    for example_id, answer in predictions.items()
]

sample = pd.DataFrame(submission, columns=["id", "PredictionString"])

# Attach each prediction to its test row so the context is available for cleaning.
test_data = pd.merge(left=test, right=sample, on='id')

cleaned_preds = [
    _clean_prediction(pred, context)
    for pred, context in zip(test_data["PredictionString"], test_data["context"])
]

test_data["PredictionString"] = cleaned_preds
test_data[['id', 'PredictionString']].to_csv('submission.csv', index=False)

In [8]:
# Preview only the first rows; the frame holds full passages and can be large
# when the kernel runs against the complete test set.
test_data.head()

Unnamed: 0,id,context,question,language,PredictionString
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi,येलन
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi,20 अप्रैल 2010
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi,१२ मार्च १८२४
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil,13
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்
