# Please excuse how messy it is 😱

In [None]:
!pip install -U --no-build-isolation --no-deps ../input/transformers-master/ -qq

In [None]:
import sys
sys.path.append("../input/tez-lib/")
import collections
import numpy as np
import transformers
import pandas as pd
from datasets import Dataset
from functools import partial
from tqdm import tqdm
import json
import torch

from sklearn import metrics
import transformers
import torch
import torch.nn as nn
import numpy as np
import tez
from string import punctuation

In [None]:
class ChaiiModel(tez.Model):
    """Thin ``tez.Model`` wrapper around a Hugging Face question-answering model.

    The wrapped model is loaded with ``AutoModelForQuestionAnswering`` from
    ``model_name``; ``forward`` returns only the start/end logits and reports a
    constant loss of ``0`` with an empty metrics dict.
    """

    def __init__(self, model_name, num_train_steps, steps_per_epoch, learning_rate):
        """Store the training hyper-parameters and load the QA model.

        Args:
            model_name: Path or identifier passed to ``from_pretrained``.
            num_train_steps: Stored on the instance; not used in this class.
            steps_per_epoch: Stored on the instance; not used in this class.
            learning_rate: Stored on the instance; not used in this class.
        """
        super().__init__()
        self.learning_rate = learning_rate
        self.steps_per_epoch = steps_per_epoch
        self.model_name = model_name
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

        # Settings layered on top of the checkpoint's own configuration.
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "add_pooling_layer": False,
        }
        qa_config = transformers.AutoConfig.from_pretrained(model_name)
        qa_config.update(config_overrides)
        self.transformer = transformers.AutoModelForQuestionAnswering.from_pretrained(
            model_name, config=qa_config
        )

    def forward(self, ids, mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Run the QA model and return ``((start_logits, end_logits), 0, {})``.

        Only ``ids`` and ``mask`` are forwarded to the wrapped model;
        ``token_type_ids``, ``start_positions`` and ``end_positions`` are
        accepted but ignored.
        """
        qa_output = self.transformer(ids, mask)
        span_logits = tuple(
            head.squeeze(-1).contiguous()
            for head in (qa_output.start_logits, qa_output.end_logits)
        )
        return span_logits, 0, {}

In [None]:
def prepare_validation_features(examples, tokenizer, pad_on_right, max_length, doc_stride):
    """Tokenize a batch of question/context pairs into overlapping windows.

    Args:
        examples: Batch mapping with "question", "context" and "id" lists.
            The "question" list is replaced in place by left-stripped copies.
        tokenizer: Callable tokenizer; its result must support ``pop``,
            item access and ``sequence_ids(i)``.
        pad_on_right: When true the question is the first sequence and the
            context the second; otherwise the order is swapped.
        max_length: Window length; every window is padded to it.
        doc_stride: Passed to the tokenizer as ``stride``.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed, an
        added "example_id" list (source example id per window) and
        "offset_mapping" entries set to ``None`` for every token that does
        not belong to the context.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    if pad_on_right:
        first_key, second_key, truncate = "question", "context", "only_second"
    else:
        first_key, second_key, truncate = "context", "question", "only_first"

    encoded = tokenizer(
        examples[first_key],
        examples[second_key],
        truncation=truncate,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One long context yields several windows; this maps each window back
    # to the index of the example it was cut from.
    window_to_example = encoded.pop("overflow_to_sample_mapping")
    context_seq_id = 1 if pad_on_right else 0

    example_ids = []
    for window_idx in range(len(encoded["input_ids"])):
        seq_ids = encoded.sequence_ids(window_idx)
        example_ids.append(examples["id"][window_to_example[window_idx]])

        # Keep character offsets only for context tokens so post-processing
        # can tell whether a token is part of the context.
        window_offsets = encoded["offset_mapping"][window_idx]
        encoded["offset_mapping"][window_idx] = [
            span if seq_ids[token_idx] == context_seq_id else None
            for token_idx, span in enumerate(window_offsets)
        ]

    encoded["example_id"] = example_ids
    return encoded

In [None]:
# Per-model token counts keyed by model family. The values match the
# `num_special_tokens` arguments passed to new_postprocess_qa_predictions
# elsewhere in this notebook.
# NOTE(review): nothing in the visible code reads this dict -- confirm it is still needed.
special_tokens = dict(muril=2, xlmr=3, rembert=2)

def postprocess_qa_predictions(
    examples, tokenizer, features, raw_predictions, n_best_size=20, max_answer_length=30, squad_v2=False
):
    """Select the best answer span per example from per-window logits.

    For every window (feature) of an example the ``n_best_size`` highest
    start positions are paired with the ``n_best_size`` highest end
    positions. A pair is discarded when either index has no context offset,
    when the end precedes the start, or when the span is longer than
    ``max_answer_length`` tokens. Remaining pairs are scored by
    ``start_logit + end_logit``.

    Args:
        examples: Dataset indexable by ``"id"`` and iterable as dicts with
            ``"id"`` and ``"context"``.
        tokenizer: Only ``cls_token_id`` is read.
        features: Windows as dicts with ``"example_id"``, ``"offset_mapping"``
            (``None`` for non-context tokens) and ``"input_ids"`` (a list).
        raw_predictions: ``(all_start_logits, all_end_logits)``, each
            indexable by window position.
        n_best_size: Number of top start/end positions considered per window
            and number of candidates kept per example in ``all_answers``.
        max_answer_length: Maximum span length in tokens.
        squad_v2: When true, the best span is replaced by ``""`` unless its
            score beats the CLS ("no answer") score.

    Returns:
        ``(all_answers, predictions)``: a list of
        ``{"id", "predictions"}`` dicts with the ranked candidates, and an
        ``OrderedDict`` mapping example id to the chosen answer text.
    """
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
    all_answers = []

    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]

        min_null_score = None  # Only used if squad_v2 is True.
        valid_answers = []

        context = example["context"]
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            # Score of predicting "no answer" for this window (CLS position).
            # Despite the variable name, the largest value seen so far is kept.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Indices of the n_best_size largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the window or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": float(start_logits[start_index] + end_logits[end_index]),
                            "text": context[start_char:end_char],
                        }
                    )

        # Sort once; the ranking feeds both the best answer and the n-best list.
        ranked_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)
        best_answer = ranked_answers[0] if ranked_answers else {"text": "", "score": 0.0}

        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # min_null_score stays None when the example has no windows;
            # comparing against None would raise TypeError.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            predictions[example["id"]] = best_answer["text"] if beats_null else ""

        all_answers.append({"id": example["id"], "predictions": ranked_answers[:n_best_size]})
    return all_answers, predictions

In [None]:
def convert_to_one_array(logits, max_length, stride, offset):
    """Stitch the per-window logits of one example into a single 1-D array.

    The first window is copied without its last position. From every later
    window only positions ``offset`` up to (excluding) the last one are used;
    they are added onto the merged array starting ``stride`` positions before
    the previous window's end, and those ``stride`` overlapping positions are
    then halved (i.e. averaged between the two windows).

    Args:
        logits: Sequence of windows, each of length ``max_length``.
        max_length: Window length in tokens.
        stride: Number of positions shared by consecutive windows.
        offset: Number of leading positions in each window that precede the
            context (callers pass question length plus special tokens).

    Returns:
        A float array of length
        ``offset + n * (max_length - offset - 1) - (n - 1) * stride``
        where ``n`` is the number of windows.
    """
    n_windows = len(logits)
    window_ctx = max_length - offset - 1
    merged_len = offset + n_windows * window_ctx - (n_windows - 1) * stride

    merged = np.zeros(merged_len)
    merged[: max_length - 1] = logits[0][:-1]

    write_from = max_length - 1 - stride
    for window in logits[1:]:
        write_to = write_from + window_ctx
        merged[write_from:write_to] += window[offset:-1]
        # The first `stride` positions now hold the sum of two windows.
        merged[write_from : write_from + stride] /= 2
        write_from = write_to - stride

    return merged

def token_level_to_char_level(text, offsets, preds):
    """Spread per-token scores over the characters each token covers.

    Args:
        text: String whose length fixes the size of the result.
        offsets: ``(start, end)`` character offsets, one pair per token.
        preds: Per-token scores, indexed like ``offsets``.

    Returns:
        A float array with one value per character of ``text``. Characters
        not covered by any token keep the filler value ``-100``; when tokens
        overlap, the later token wins.
    """
    char_scores = np.full(len(text), -100.0)
    for token_idx, (char_start, char_end) in enumerate(offsets):
        # Tokens whose offsets are both zero (e.g. special or padding
        # tokens) carry no character span and are skipped.
        if not (char_start or char_end):
            continue
        char_scores[char_start:char_end] = preds[token_idx]

    return char_scores


def new_postprocess_qa_predictions(
    examples, tokenizer, features, raw_predictions, max_seq_length, doc_stride, num_special_tokens, file_name, n_best_size=20, max_answer_length=30, squad_v2=False
):
    """Merge window logits per example, pick answers and dump char-level scores.

    Unlike ``postprocess_qa_predictions``, the windows of an example are first
    stitched into one token-level array with ``convert_to_one_array``; spans
    are then searched on that merged array using the offsets of the
    un-windowed ``question + context`` tokenization. The merged scores are
    also projected onto context characters and pickled.

    Args:
        examples: Dataset indexable by ``"id"`` and iterable as dicts with
            ``"id"``, ``"question"`` and ``"context"``.
        tokenizer: Tokenizer supporting ``return_offsets_mapping``.
        features: Windows as dicts with ``"example_id"``.
        raw_predictions: ``(all_start_logits, all_end_logits)``; each must
            support indexing with a list of window positions.
        max_seq_length: Window length used when the features were built.
        doc_stride: Overlap between consecutive windows.
        num_special_tokens: Number of special tokens preceding the context
            (in addition to the question tokens).
        file_name: Suffix of the two pickle files written to the working
            directory (``char-level-start-logits-*`` / ``char-level-end-logits-*``).
        n_best_size: Number of top start/end positions considered and number
            of candidates kept per example.
        max_answer_length: Maximum span length in tokens.
        squad_v2: Accepted for signature parity. No "no answer" score is
            computed on the merged logits, so the best span is returned
            either way.

    Returns:
        ``(all_answers, predictions)``: ranked candidates per example and an
        ``OrderedDict`` mapping example id to the best answer text.
    """
    # Imported locally: at notebook level `pickle` is only imported by a
    # later cell, so this function must not rely on cell execution order.
    import pickle

    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    new_start_features = collections.OrderedDict()
    new_end_features = collections.OrderedDict()

    char_level_starts = {}
    char_level_ends = {}

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
    all_answers = []

    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]

        min_null_score = None  # Never computed here; see the squad_v2 note below.
        valid_answers = []

        num_q_tokens = len(tokenizer(example["question"], add_special_tokens=False)["input_ids"])
        flat_input_ids = tokenizer(example["question"], example["context"], return_offsets_mapping=True)

        # Index of the first context token in the un-windowed encoding.
        left_offset = num_q_tokens + num_special_tokens

        id_ = example["id"]

        new_start_features[id_] = convert_to_one_array(all_start_logits[feature_indices], max_seq_length, doc_stride, left_offset)
        new_end_features[id_] = convert_to_one_array(all_end_logits[feature_indices], max_seq_length, doc_stride, left_offset)

        # Indices of the n_best_size largest merged scores, best first.
        start_indexes = np.argsort(new_start_features[id_])[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(new_end_features[id_])[-1 : -n_best_size - 1 : -1].tolist()

        offset_mapping = flat_input_ids["offset_mapping"]
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip positions past the real tokens (padding of the last
                # window) and anything that starts before the context.
                if (
                    start_index >= len(offset_mapping)
                    or end_index >= len(offset_mapping)
                    or offset_mapping[start_index] is None
                    or offset_mapping[end_index] is None
                    or start_index < left_offset
                ):
                    continue
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue

                start_char = offset_mapping[start_index][0]
                end_char = offset_mapping[end_index][1]
                valid_answers.append(
                    {
                        "score": float(new_start_features[id_][start_index] + new_end_features[id_][end_index]),
                        "text": example["context"][start_char:end_char],
                        "start": start_index,
                        "end": end_index
                    }
                )

        # Sort once; the ranking feeds both the best answer and the n-best list.
        ranked_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)
        best_answer = ranked_answers[0] if ranked_answers else {"text": "", "score": 0.0}

        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # min_null_score is always None in this function; comparing a
            # float against None raised TypeError, so fall back to the span.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            predictions[example["id"]] = best_answer["text"] if beats_null else ""

        all_answers.append({"id": example["id"], "predictions": ranked_answers[:n_best_size]})

        # Offsets of question tokens are relative to the question string, not
        # the context. Blank them out so their scores are not written onto
        # context characters by token_level_to_char_level.
        context_offsets = [
            span if token_idx >= left_offset else (0, 0)
            for token_idx, span in enumerate(offset_mapping)
        ]
        char_level_starts[id_] = token_level_to_char_level(example["context"], context_offsets, new_start_features[id_])
        char_level_ends[id_] = token_level_to_char_level(example["context"], context_offsets, new_end_features[id_])

    with open(f"char-level-start-logits-{file_name}", "wb") as fp:
        pickle.dump(char_level_starts, fp)

    with open(f"char-level-end-logits-{file_name}", "wb") as fp:
        pickle.dump(char_level_ends, fp)

    return all_answers, predictions

In [None]:
# Load the competition test set and record each context's length in characters.
test_data = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
test_data["len"] = [len(x) for x in test_data["context"]]

# All inference cells are skipped when the test file has exactly 5 rows.
# NOTE(review): presumably 5 rows is the public placeholder test file, so the
# heavy cells only run against the hidden test set -- confirm.
do_inference = len(test_data) != 5

# Contexts with at least this many characters go to `long_data`.
char_threshold = 15_000

# short_data = test_data.copy()
short_data = test_data[test_data["len"]<char_threshold].reset_index(drop=True)
long_data = test_data[test_data["len"]>=char_threshold].reset_index(drop=True)

# MuRIL Large

In [None]:
# MuRIL Large: tokenize the short contexts into 384-token windows, run six
# fold checkpoints and accumulate the SUM of their start/end logits in
# fin_start_logits / fin_end_logits (a later cell divides by len(models)).
if do_inference:
    tokenizer = transformers.AutoTokenizer.from_pretrained("../input/convert-to-pytorch-muril-large")
    
    pad_on_right = tokenizer.padding_side == "right"
    max_length = 384
    doc_stride = 128

    test_dataset = Dataset.from_pandas(short_data)
    # Replace the raw columns with windowed, tokenized features.
    test_features = test_dataset.map(
        partial(
            prepare_validation_features, 
            tokenizer=tokenizer,
            pad_on_right=pad_on_right, 
            max_length=max_length,
            doc_stride=doc_stride
        ),
        batched=True,
        remove_columns=test_dataset.column_names
    )
    # Copy of the features without 'example_id' and 'offset_mapping'; this is
    # what the DataLoader iterates. test_features keeps them for post-processing.
    test_feats_small = test_features.map(
        lambda example: example, remove_columns=['example_id', 'offset_mapping']
    )

    fin_start_logits = None
    fin_end_logits = None

    # Each base path is listed three times; the loop below swaps its final
    # character for the fold number to locate the actual checkpoint.
    chunk1 = ["../input/convert-to-pytorch-muril-large/nbroad/flax-muril-large-chaii-f0",]*3
    chunk2 = ["../input/convert-to-pytorch-muril-large-f567/nbroad/flax-muril-large-chaii-f5"]*3
    models = chunk1+chunk2

    data_loader = torch.utils.data.DataLoader(
        test_feats_small.with_format("torch"), 
        batch_size=64,
        num_workers=4,
        pin_memory=True,
        shuffle=False
    )


    # Fold numbers are paired positionally with `models` (0, 8, 9 from the
    # first dataset; 5, 6, 7 from the second).
    for model_name, fold in tqdm(zip(models, [0, 8, 9, 5, 6, 7])):
        # The model is built from the base path, then its weights are
        # overwritten with the fold's state dict.
        model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
        model.transformer.load_state_dict(torch.load(f"{model_name[:-1]}{fold}/pytorch_model.bin"))
        model.to("cuda")
        model.eval()

        start_logits = []
        end_logits = []

        for b_idx, data in enumerate(data_loader):
            with torch.no_grad():
                # Move every tensor of the batch to the GPU.
                for key, value in data.items():
                    data[key] = value.to("cuda")
                output, _, _ = model(ids=data["input_ids"], mask=data["attention_mask"])
                start = output[0].detach().cpu().numpy()
                end = output[1].detach().cpu().numpy()
                start_logits.append(start)
                end_logits.append(end)

        # Stack the per-batch arrays into one row per window.
        start_logits = np.vstack(start_logits)
        end_logits = np.vstack(end_logits)

        # Accumulate in place; the first fold's arrays become the accumulators.
        if fin_start_logits is None:
            fin_start_logits = start_logits
            fin_end_logits = end_logits
        else:
            fin_start_logits += start_logits
            fin_end_logits += end_logits
            
#         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
#         with open(f"top-preds-muril-large-f{fold}.json", "w") as fp:
#             json.dump(to_save, fp)

        # Drop the model and release cached GPU memory before the next fold.
        del model
        torch.cuda.empty_cache()

In [None]:
import pickle

if do_inference:
    # Turn the summed fold logits into their mean (in place). `models` is the
    # six-entry list built by the MuRIL Large inference cell.
    fin_start_logits /= len(models)
    fin_end_logits /= len(models)
    
    # This is for voting
#     to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits))
#     with open('muril-large-preds.json', "w") as fp:
#         json.dump(to_save, fp)

    # Merge the windows per example and write the char-level score files
    # "char-level-start-logits-muril-large" / "char-level-end-logits-muril-large".
    all_answers, predictions = new_postprocess_qa_predictions(
    test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits), max_length, doc_stride, num_special_tokens=2, file_name="muril-large", n_best_size=20, max_answer_length=30, squad_v2=False
)

#     short_data["PredictionString"] = long_data["id"].map(fin_preds)

# MuRIL BigBird Base  (I ended up not using this)
All data  
4k max length

In [None]:
# if do_inference:
#     tokenizer = transformers.AutoTokenizer.from_pretrained("../input/bb-base-chaii")
#     pad_on_right = tokenizer.padding_side == "right"
#     max_length = 4096
#     doc_stride = 2048

#     test_dataset = Dataset.from_pandas(test_data)
#     test_features = test_dataset.map(
#         partial(
#             prepare_validation_features, 
#             tokenizer=tokenizer,
#             pad_on_right=pad_on_right, 
#             max_length=max_length,
#             doc_stride=doc_stride
#         ),
#         batched=True,
#         remove_columns=test_dataset.column_names
#     )
#     test_feats_small = test_features.map(
#         lambda example: example, remove_columns=['example_id', 'offset_mapping']
#     )

#     fin_start_logits = None
#     fin_end_logits = None

#     models = [
#         "../input/bb-base-chaii",
#         "../input/nbroad-flax-muril-bb-base-chaii-f2",
#         "../input/nbroad-flax-muril-bb-base-chaii-f3",
#         "../input/nbroad-flax-muril-bb-base-chaii-f4",
#         "../input/nbroad-flax-bb-base-chaii-f5",
#         "../input/nbroad-flax-muril-bb-base-chaii-f6",
#         "../input/nbroad-flax-muril-bb-base-chaii-f7", 
#     ]

#     data_loader = torch.utils.data.DataLoader(
#         test_feats_small.with_format("torch"), 
#         batch_size=16,
#         num_workers=4,
#         pin_memory=True,
#         shuffle=False
#     )


#     for fold, model_name in tqdm(enumerate(models)):
#         model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
#         model.transformer.load_state_dict(torch.load(f"{model_name}/pytorch_model.bin"))
#         model.to("cuda")
#         model.eval()

#         start_logits = []
#         end_logits = []

#         for b_idx, data in enumerate(data_loader):
#             with torch.no_grad():
#                 for key, value in data.items():
#                     data[key] = value.to("cuda")
#                 output, _, _ = model(ids=data["input_ids"], mask=data["attention_mask"])
#                 start = output[0].detach().cpu().numpy()
#                 end = output[1].detach().cpu().numpy()
#                 start_logits.append(start)
#                 end_logits.append(end)

#         start_logits = np.vstack(start_logits)
#         end_logits = np.vstack(end_logits)

#         if fin_start_logits is None:
#             fin_start_logits = start_logits
#             fin_end_logits = end_logits
#         else:
#             fin_start_logits += start_logits
#             fin_end_logits += end_logits
            
# #         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
# #         with open(f"top-preds-bb-f{fold}.json", "w") as fp:
# #             json.dump(to_save, fp)

#         del model
#         torch.cuda.empty_cache()

In [None]:
# # if do_inference:
# #     to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits))
# if do_inference:
#     fin_start_logits /= len(models)
#     fin_end_logits /= len(models)
    
# #     to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits))
# #     with open('muril-large-preds.json', "w") as fp:
# #         json.dump(to_save, fp)

#     all_answers, predictions = new_postprocess_qa_predictions(
#     test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits), max_length, doc_stride, num_special_tokens=2, file_name="bb-4k", n_best_size=20, max_answer_length=30, squad_v2=False
# )

In [None]:
# if do_inference:
#     long_data["PredictionString"] = long_data["id"].map(fin_preds)

# MuRIL BigBird Large
1k max length  
ALL data

In [None]:
# BigBird 1k: tokenize ALL test contexts into 1024-token windows, run six fold
# checkpoints and accumulate the SUM of their start/end logits.
if do_inference:
    # NOTE(review): the tokenizer is loaded from a different directory than the
    # checkpoints below -- presumably they share a vocabulary; confirm.
    tokenizer = transformers.AutoTokenizer.from_pretrained("../input/murilbasecased")
    
    pad_on_right = tokenizer.padding_side == "right"
    max_length = 1024
    doc_stride = 512

    test_dataset = Dataset.from_pandas(test_data)
    # Replace the raw columns with windowed, tokenized features.
    test_features = test_dataset.map(
        partial(
            prepare_validation_features, 
            tokenizer=tokenizer,
            pad_on_right=pad_on_right, 
            max_length=max_length,
            doc_stride=doc_stride
        ),
        batched=True,
        remove_columns=test_dataset.column_names
    )
    # Copy of the features without 'example_id' and 'offset_mapping'; this is
    # what the DataLoader iterates.
    test_feats_small = test_features.map(
        lambda example: example, remove_columns=['example_id', 'offset_mapping']
    )

    fin_start_logits = None
    fin_end_logits = None

    data_loader = torch.utils.data.DataLoader(
        test_feats_small.with_format("torch"), 
        batch_size=16,
        num_workers=4,
        pin_memory=True,
        shuffle=False
    )

    
    # The fold-0 directory is used to build the model for every fold; only the
    # state dict loaded afterwards differs per fold.
    model_name = "../input/muril-large-bigbird-1k-6f/nbroad/1k-shuf-squad-chaii-6f0"
    for fold in tqdm(range(6)):
        model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
        model.transformer.load_state_dict(torch.load(f"../input/muril-large-bigbird-1k-6f/nbroad/1k-shuf-squad-chaii-6f{fold}/pytorch_model.bin"))
        model.to("cuda")
        model.eval()

        start_logits = []
        end_logits = []

        for b_idx, data in enumerate(data_loader):
            with torch.no_grad():
                # Move every tensor of the batch to the GPU.
                for key, value in data.items():
                    data[key] = value.to("cuda")
                output, _, _ = model(ids=data["input_ids"], mask=data["attention_mask"])
                start = output[0].detach().cpu().numpy()
                end = output[1].detach().cpu().numpy()
                start_logits.append(start)
                end_logits.append(end)

        # Stack the per-batch arrays into one row per window.
        start_logits = np.vstack(start_logits)
        end_logits = np.vstack(end_logits)

        # Accumulate in place; the first fold's arrays become the accumulators.
        if fin_start_logits is None:
            fin_start_logits = start_logits
            fin_end_logits = end_logits
        else:
            fin_start_logits += start_logits
            fin_end_logits += end_logits
            
#         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
#         with open(f"top-preds-muril-large-f{fold}.json", "w") as fp:
#             json.dump(to_save, fp)

        # Drop the model and release cached GPU memory before the next fold.
        del model
        torch.cuda.empty_cache()

In [None]:
if do_inference:
    # Average over the six folds summed by the BigBird-1k inference cell
    # (`range(6)`). A literal is used, as in the XLM-R averaging cell, because
    # this section never defines `models`: the previous `len(models)` read the
    # list left over from whichever other section ran last.
    fin_start_logits /= 6
    fin_end_logits /= 6

    # Merge the windows per example and write the char-level score files
    # "char-level-start-logits-bb-1k" / "char-level-end-logits-bb-1k".
    all_answers, predictions = new_postprocess_qa_predictions(
        test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits), max_length, doc_stride, num_special_tokens=2, file_name="bb-1k", n_best_size=20, max_answer_length=30, squad_v2=False
    )

# XLMR-Large 6 folds

Max length 384, doc stride 128  
Short data

In [None]:
# XLM-R Large: tokenize the short contexts into 384-token windows, run six
# fold checkpoints and accumulate the SUM of their start/end logits.
if do_inference:
    tokenizer = transformers.AutoTokenizer.from_pretrained("../input/xlmrob")
    pad_on_right = tokenizer.padding_side == "right"
    max_length = 384
    doc_stride = 128

    test_dataset = Dataset.from_pandas(short_data)
    # Replace the raw columns with windowed, tokenized features.
    test_features = test_dataset.map(
        partial(
            prepare_validation_features, 
            tokenizer=tokenizer,
            pad_on_right=pad_on_right, 
            max_length=max_length,
            doc_stride=doc_stride
        ),
        batched=True,
        remove_columns=test_dataset.column_names
    )
    # Copy of the features without 'example_id' and 'offset_mapping'; this is
    # what the DataLoader iterates.
    test_feats_small = test_features.map(
        lambda example: example, remove_columns=['example_id', 'offset_mapping']
    )

    fin_start_logits = None
    fin_end_logits = None


    data_loader = torch.utils.data.DataLoader(
        test_feats_small.with_format("torch"), 
        batch_size=16,
        num_workers=4,
        pin_memory=True,
        shuffle=False
    )

    # The fold-0 directory is used to build the model for every fold; only the
    # state dict loaded afterwards differs per fold.
    model_name = "../input/convert-to-pytorch-xlmr-large-chaii-6f/nbroad/xlmr-large-chaii-6f0"
    for fold in tqdm(range(6)):
        model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
        model.transformer.load_state_dict(torch.load(f"../input/convert-to-pytorch-xlmr-large-chaii-6f/nbroad/xlmr-large-chaii-6f{fold}/pytorch_model.bin"))
        model.to("cuda")
        model.eval()

        start_logits = []
        end_logits = []

        for b_idx, data in enumerate(data_loader):
            with torch.no_grad():
                # Move every tensor of the batch to the GPU.
                for key, value in data.items():
                    data[key] = value.to("cuda")
                output, _, _ = model(ids=data["input_ids"], mask=data["attention_mask"])
                start = output[0].detach().cpu().numpy()
                end = output[1].detach().cpu().numpy()
                start_logits.append(start)
                end_logits.append(end)

        # Stack the per-batch arrays into one row per window.
        start_logits = np.vstack(start_logits)
        end_logits = np.vstack(end_logits)

        # Accumulate in place; the first fold's arrays become the accumulators.
        if fin_start_logits is None:
            fin_start_logits = start_logits
            fin_end_logits = end_logits
        else:
            fin_start_logits += start_logits
            fin_end_logits += end_logits
            
#         to_save, fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))      
#         with open(f"top-preds-bb-f{fold}.json", "w") as fp:
#             json.dump(to_save, fp)

        # Drop the model and release cached GPU memory before the next fold.
        del model
        torch.cuda.empty_cache()

In [None]:
if do_inference:
    # Average over the six folds summed by the XLM-R inference cell (`range(6)`).
    fin_start_logits /= 6
    fin_end_logits /= 6
    
    # Merge the windows per example and write the char-level score files
    # "char-level-start-logits-xlmr" / "char-level-end-logits-xlmr".
    # num_special_tokens is 3 here, versus 2 for the other models in this notebook.
    all_answers, predictions = new_postprocess_qa_predictions(
    test_dataset, tokenizer, test_features, (fin_start_logits, fin_end_logits), max_length, doc_stride, num_special_tokens=3, file_name="xlmr", n_best_size=20, max_answer_length=30, squad_v2=False
)

# RemBERT
Max length 384, doc stride 128  
Short Data

In [None]:
# RemBERT setup: import TensorFlow and define the globals (tokenizer, window
# sizes, model path) that the following RemBERT cells read.
if do_inference:
    import tensorflow as tf
    import tensorflow.keras.backend as K

    rembert_model_path = "../input/rembert-tf"
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(rembert_model_path)

    # Current distribution strategy as reported by TensorFlow.
    strategy = tf.distribute.get_strategy()
    # NOTE(review): AUTO is not referenced anywhere in the visible notebook.
    AUTO     = tf.data.experimental.AUTOTUNE

    max_length = 384
    doc_stride = 128

    pad_on_right = tokenizer.padding_side == "right"

In [None]:
if do_inference:
    def build_model():
        """Build the Keras span-prediction model on top of the RemBERT encoder.

        Reads the globals ``rembert_model_path`` and ``max_length``. The
        encoder output feeds two identical heads (dropout, one-unit dense,
        flatten, softmax) named 'start_positions' and 'end_positions'.

        Returns:
            A compiled ``tf.keras`` model taking ``[input_ids, attention_mask]``
            and returning ``[start, end]``. Both outputs pass through a
            softmax, so they are probabilities over positions, not raw logits.
        """
        # Despite the variable name, this is the model stored at rembert_model_path.
        roberta = transformers.TFAutoModel.from_pretrained(rembert_model_path)

        input_ids = tf.keras.layers.Input(shape = (max_length, ), name = 'input_ids', dtype = tf.int32)
        attention_mask = tf.keras.layers.Input(shape = (max_length, ), name = 'attention_mask', dtype = tf.int32)

        # First element of the encoder output.
        embeddings = roberta(input_ids=input_ids, attention_mask=attention_mask)[0]

        # Start head: one score per position, flattened, then softmax.
        x1 = tf.keras.layers.Dropout(0.1)(embeddings)
        x1 = tf.keras.layers.Dense(1, dtype=tf.float32)(x1)
        x1 = tf.keras.layers.Flatten()(x1)
        x1 = tf.keras.layers.Activation('softmax', name='start_positions', dtype=tf.float32)(x1)

        # End head: same structure as the start head.
        x2 = tf.keras.layers.Dropout(0.1)(embeddings)
        x2 = tf.keras.layers.Dense(1, dtype=tf.float32)(x2)
        x2 = tf.keras.layers.Flatten()(x2)
        x2 = tf.keras.layers.Activation('softmax', name='end_positions', dtype=tf.float32)(x2)

        model = tf.keras.models.Model(inputs = [input_ids, attention_mask], outputs = [x1, x2])

        # Compiled without optimizer or loss; the model is only used for predict().
        model.compile()

        return model

In [None]:
# RemBERT: tokenize the short contexts, run five fold checkpoints, average
# their outputs and write the char-level score files with suffix "remb".
if do_inference:
    
    fin_start_logits = None
    fin_end_logits = None

    batch_size = 128
    
    strategy = tf.distribute.get_strategy()
    

    test_dataset = Dataset.from_pandas(short_data)

    # Replace the raw columns with windowed, tokenized features.
    test_features = test_dataset.map(
            partial(
                prepare_validation_features, 
                tokenizer=tokenizer,
                pad_on_right=pad_on_right, 
                max_length=max_length,
                doc_stride=doc_stride
            ),
            batched=True,
            remove_columns=test_dataset.column_names
        )

    test_features = test_features.with_format('tensorflow')

    # Only the two model inputs are fed to Keras.
    test_x = {x: test_features[x] for x in ['input_ids', 'attention_mask']}
    test_slices = tf.data.Dataset.from_tensor_slices((test_x)).batch(batch_size)

    # Weight files of the five folds (0, 1, 2 from one dataset; 4, 5 from another).
    models = [
            "../input/rembert-0-3hi-v2/rembert-fit-chaii/fold0/tf_model.h5",
            "../input/rembert-0-3hi-v2/rembert-fit-chaii/fold1/tf_model.h5",
            "../input/rembert-0-3hi-v2/rembert-fit-chaii/fold2/tf_model.h5",
            "../input/rembert-4-6-ta/rembert-fit-chaii/fold4/tf_model.h5", 
            "../input/rembert-4-6-ta/rembert-fit-chaii/fold5/tf_model.h5",
    ]

    for model_name in models:
        # Reset Keras state before building the next fold's model.
        K.clear_session()
        strategy = tf.distribute.get_strategy()

        with strategy.scope():
            model = build_model()
            model.load_weights(model_name)


        # build_model() ends both heads with a softmax, so these are
        # probabilities even though the variables are named *_logits.
        temp_start_logits, temp_end_logits = model.predict(test_slices, batch_size=batch_size, verbose=1)


        start_logits = np.vstack(temp_start_logits)
        end_logits = np.vstack(temp_end_logits)

        # Accumulate in place; the first fold's arrays become the accumulators.
        if fin_start_logits is None:
            fin_start_logits = start_logits
            fin_end_logits = end_logits
        else:
            fin_start_logits += start_logits
            fin_end_logits += end_logits


    # Average over the five entries of `models`.
    fin_start_logits /= 5
    fin_end_logits /= 5

    # Features are switched to numpy format for post-processing.
    all_answers, predictions = new_postprocess_qa_predictions(
        test_dataset, tokenizer, test_features.with_format("numpy"), (fin_start_logits, fin_end_logits), max_length, doc_stride, num_special_tokens=2, file_name="remb", n_best_size=20, max_answer_length=30, squad_v2=False
    )

In [None]:
def move_to_boundary(start, end, context):
    """Widen the character span ``[start, end)`` to whitespace boundaries.

    If ``context[start]`` is not whitespace, the start moves left to the
    nearest preceding whitespace character (which is included in the result)
    or to the beginning of ``context``. If ``context[end]`` is not
    whitespace, the end moves right to the next whitespace character
    (excluded) or to the end of ``context``.

    Defined unconditionally (it is a pure helper with no side effects), so it
    is available whether or not inference runs.

    Args:
        start: Index of the first character of the span.
        end: Index one past the span; must be a valid index into ``context``.
        context: Text the indices refer to.

    Returns:
        The widened substring of ``context``.

    Raises:
        IndexError: If ``start`` or ``end`` is not a valid index of ``context``.
    """
    span_start = start
    if not context[start].isspace():
        span_start = start - 1
        while span_start > 0 and not context[span_start].isspace():
            span_start -= 1
        # start == 0 would otherwise give -1, which slices from the END of
        # the string and silently returns the wrong text.
        span_start = max(span_start, 0)

    span_end = end
    if not context[end].isspace():
        span_end = end + 1
        while span_end < len(context) and not context[span_end].isspace():
            span_end += 1

    return context[span_start:span_end]

## Char-level averaging that never got finished

In [None]:
if do_inference:
    from scipy.special import softmax   
    
    def fix_duplicate_scores(scores, starts=True):
        # if starts=True, will keep the value with the lower index
        # [1,2,2,3,4] -> [1,2,-10, 3, 4]
        # otherwise, keep the value with higher index
        # [1,2,2,3,4] -> [1,-10, 2, 3, 4]   

        if not starts:
            scores = scores[::-1]

        prev = scores[0]
        new_scores = [prev]
        for val in scores[1:]:
            if abs(val-prev) < 1e-5:
                new_scores.append(-100)
            else:
                new_scores.append(val)
            prev = val

        return np.array(new_scores) if starts else np.array(new_scores[::-1])
    
    def process_char_level_logits(filenames, data, multipliers, n_best_size=20, max_answer_length=35):
        all_starts = {}
        for filename in filenames:
            with open(f"char-level-start-logits-{filename}", "rb") as fp:
                all_starts[filename] = pickle.load(fp)

        all_ends = {}
        for filename in filenames:
            with open(f"char-level-end-logits-{filename}", "rb") as fp:
                all_ends[filename] = pickle.load(fp)

        predictions = collections.OrderedDict()
        all_answers = []
        for id_, context in data[["id", "context"]].values:
            start_logits = sum([softmax(logits[id_])*multipliers[filename] for filename, logits in all_starts.items()])
            end_logits = sum([softmax(logits[id_])*multipliers[filename] for filename, logits in all_ends.items()])
            
            start_logits = fix_duplicate_scores(start_logits, starts=True)
            end_logits = fix_duplicate_scores(end_logits, starts=False)

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            valid_answers = []
            for start_index in start_indexes:
                if start_logits[start_index] == -100: continue
                for end_index in end_indexes:
                    if end_logits[end_index] == -100: continue
                    if end_index < start_index or end_index - start_index + 1 > 30:
                        continue

                    text = context[start_index:end_index]
                    valid_answers.append(
                        {
                            "score": float(start_logits[start_index] + end_logits[end_index]),
                            "text": text,
                            "start": start_index,
                            "end": end_index,
                            "moved": move_to_boundary(start_index, end_index, context)
                        }
                    )
        
            sorted(valid_answers, key=lambda x: x["score"], reverse=True)

            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            else:
                best_answer = {"text": "", "score": 0.0}

            answer = best_answer["moved"]
            predictions[id_] = answer

            valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
            all_answers.append({"id": id_, "predictions": valid_answers})

        return all_answers, predictions

    # Per-model ensemble weights.  Inside process_char_level_logits each
    # model's softmaxed character-level scores are multiplied by its entry
    # here before the models are summed.  The keys double as the suffix of the
    # pickled logit files (e.g. "char-level-end-logits-<key>").
    multipliers = {
        "bb-4k": 0.5,
        "muril-large": 0.8,
        "bb-1k": 1.3,
        "xlmr": 1.1,
        "remb": 1.2
    }
    # Models blended for short_data; "bb-4k" is currently disabled.
    # NOTE(review): short_data / long_data are built outside this cell —
    # presumably a split by context length; confirm against the earlier cells.
    filenames = [
        'muril-large', 
        #"bb-4k",
        "bb-1k", 
        "xlmr", 
        "remb"
    ]
    # Returns (all_answers, predictions); predictions maps id -> answer text.
    output_short = process_char_level_logits(filenames, short_data, multipliers)
    
    # long_data is scored with "bb-1k" alone ("bb-4k" is disabled here too).
    filenames = [
#         "bb-4k", 
        "bb-1k"
    ]
    output_long = process_char_level_logits(filenames, long_data, multipliers)

In [None]:
if do_inference:
    # Each ensemble output is (all_answers, predictions); element 1 is the
    # id -> answer mapping that becomes the PredictionString column.
    for frame, ensemble_output in ((short_data, output_short), (long_data, output_long)):
        id_to_answer = ensemble_output[1]
        frame["PredictionString"] = frame["id"].map(id_to_answer)

In [None]:
# sub2 = pd.DataFrame(submission, columns=["id", "PredictionString"])

# final = pd.concat([sub1, sub2], axis=0, ignore_index=True)
# final = final.merge(test_data[["context", "question", "id"]], on="id")

## Attempts at voting that didn't work

In [None]:
# # folds have been averaged and have only 1 prediction file 
# if do_inference:
#     model_fold_preds = {}
#     with open('xlmr-large-preds.json') as fp:
#         model_fold_preds["xlmr"] = json.load(fp)
#     with open('muril-large-preds.json') as fp:
#         model_fold_preds["muril"] = json.load(fp)

#     from collections import Counter

#     voted_preds = {}
#     top_k = 4
#     for i in range(len(short_data)):
#         cnt = Counter()
#         for fold_, preds in enumerate(model_fold_preds.values()):
#             cnt.update([text["text"] for text in preds[i]["predictions"]][:top_k])
            
#         most_common = cnt.most_common(top_k)
#         voted_preds[preds[i]["id"]] = most_common[0][0]
        
#     short_data["PredictionString"] = short_data["id"].map(voted_preds)

In [None]:
# if do_inference:
#     model_fold_preds = {}
#     model = "xlmr"
#     for fold_ in range(10):
#         with open(f"top-preds-{model}-f{fold_}.json") as fp:
#             model_fold_preds[f"{model}-{fold_}"] = json.load(fp)

#     from collections import Counter

#     voted_preds = {}
#     top_k = 100
#     for i in range(len(short_data)):
#         cnt = Counter()
#         for fold_, preds in enumerate(model_fold_preds.values()):
#             cnt.update([text["text"] for text in preds[i]["predictions"]][:top_k])
            
#         most_common = cnt.most_common(10)
#         voted_preds[preds[i]["id"]] = most_common[0][0]
        
#     short_data["PredictionString"] = short_data["id"].map(voted_preds)

In [None]:
# if do_inference:
#     model_fold_preds = {}
    
#     model = "bb"
#     for fold_ in range(7):
#         with open(f"top-preds-{model}-f{fold_}.json") as fp:
#             model_fold_preds[f"{model}-{fold_}"] = json.load(fp)

#     from collections import Counter

#     voted_preds = {}
#     for i in range(len(long_data)):
#         cnt = Counter()
#         for fold_, preds in enumerate(model_fold_preds.values()):
#             cnt.update([text["text"] for text in preds[i]["predictions"]][:top_k])
            
#         most_common = cnt.most_common(10)
#         voted_preds[preds[i]["id"]] = most_common[0][0]
        
#     long_data["PredictionString"] = long_data["id"].map(voted_preds)

In [None]:
if do_inference:
    # Stack the short and long frames row-wise and renumber the index from 0.
    test_data = pd.concat((short_data, long_data), ignore_index=True)

In [None]:
# if do_inference:
#     test_data = short_data.copy()

In [None]:
# Notebook cell output: show the current test_data frame for a visual check.
test_data

In [None]:
# Punctuation removed from the edges of a prediction.  The lists are joined
# into character sets for str.lstrip / str.rstrip, so order and duplicates
# are irrelevant.
bad_starts = [".", ",", "(", ")", "-", "–", ";", ":"]
bad_endings = ["-", "(", ")", "–", ",", ";", ":"]

# Abbreviation stems (per their names: Tamil AD / BC / km, Hindi AD / BC).
# When a prediction ends with one of these and the context contains the same
# text followed by ".", the period is put back onto the prediction.
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"

# Built once instead of on every row.
_leading_junk = "".join(bad_starts)
_trailing_junk = "".join(bad_endings)
_dotted_abbreviations = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)


def _clean_prediction(pred, context):
    """Return ``pred`` with edge punctuation and whitespace normalised.

    Args:
        pred: Raw predicted answer.  Anything that is not a ``str`` (for
            example the NaN that ``Series.map`` yields for an id with no
            prediction) is treated as an empty answer instead of crashing.
        context: Passage the answer was extracted from; only used to check
            whether an abbreviation is followed by a period in the text.

    Returns:
        The cleaned prediction string (possibly empty).
    """
    if not isinstance(pred, str):
        return ""

    pred = pred.strip()
    if pred == "":
        return pred

    # I haven't checked whether this makes a difference, but there is one
    # answer in the training set that ends like this and I think it is an
    # annotator mistake.  See my notebook here for details:
    # https://www.kaggle.com/nbroad/chaii-qa-character-token-languages-eda
    if pred.endswith("..."):
        pred = pred[:-3]

    pred = pred.lstrip(_leading_junk)
    pred = pred.rstrip(_trailing_junk)
    # Removing punctuation can expose inner whitespace ("- 1947" -> " 1947",
    # "கி.பி ," -> "கி.பி "); strip again so the answer is not padded and the
    # abbreviation check below sees the real last character.
    pred = pred.strip()

    # str.endswith accepts a tuple of suffixes, so one call covers them all.
    if pred.endswith(_dotted_abbreviations) and pred + "." in context:
        pred = pred + "."

    return pred


cleaned_preds = []
if do_inference:
    cleaned_preds = [
        _clean_prediction(pred, context)
        for pred, context in test_data[["PredictionString", "context"]].values
    ]

    test_data["PredictionString"] = cleaned_preds

In [None]:
if do_inference:
    # Character length of every cleaned prediction, kept for inspection.
    test_data["pred_len"] = list(map(len, test_data["PredictionString"]))
# TODO: do something if the prediction is too short

In [None]:
# Without inference there are no real predictions, so every row gets a
# placeholder answer; either way the same two columns are written out.
if not do_inference:
    test_data["PredictionString"] = "lol"
test_data[["id", "PredictionString"]].to_csv("submission.csv", index=False)

In [None]:
if do_inference:
    # Echo the columns that were written to submission.csv.
    print(test_data.loc[:, ["id", "PredictionString"]])