In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### This is an inference notebook that uses only the 🤗 Trainer, a custom `compute_loss`, and `datasets`.

Check out my training [notebook](https://www.kaggle.com/code/raghavendrakotala/training-baseline-deberta-trainer-compute-loss) if you want detailed training instructions.

### Load tokenizer and data

In [None]:
import shutil
from pathlib import Path

# Overwrite files of the installed transformers package with the versions
# shipped in the `deberta-v2-3-fast-tokenizer` input dataset, so that the
# DebertaV2TokenizerFast import below can be resolved.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# 1) The slow->fast conversion module lives at the package root: remove any
#    existing copy, then copy the dataset version into the package directory.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# 2) The model-specific files go into models/deberta_v2. A source file whose
#    name starts with "deberta" is installed with that text stripped
#    ("deberta__init__.py" -> "__init__.py"); the others keep their name.
for filename in (
    "tokenization_deberta_v2.py",
    "tokenization_deberta_v2_fast.py",
    "deberta__init__.py",
):
    name = str(filename)
    target_name = name.replace("deberta", "") if name.startswith("deberta") else filename
    filepath = deberta_v2_path / target_name
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)
    
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForTokenClassification




In [None]:
import datasets
from torch import cuda
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch

In [None]:
# Lookup tables: patient notes (read for pn_num / pn_history below) and
# feature descriptions (read for feature_num / case_num / feature_text below).
df_patients, df_features = (
    pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv"),
    pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv"),
)

In [None]:
# Inference settings read by the cells below.
config = dict(
    max_length=512,          # passed as max_length to the tokenizer call
    valid_batch_size=16,     # passed as per_device_eval_batch_size
    folds=5,                 # number of fold_<k> checkpoints that get loaded
    device='cuda' if cuda.is_available() else 'cpu',
)

### Preprocess the data and tokenize it

In [None]:
test_df = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")


def process_feature_text(text):
    """Normalise a feature description.

    "-OR-" is replaced by " or ", every remaining hyphen by a space, and the
    result is lower-cased.
    """
    return text.replace("-OR-", " or ").replace("-", " ").lower()


# Attach the note text and the feature text to every test row with two left
# merges instead of re-filtering both lookup tables once per row.
# drop_duplicates keeps the first row per key, which is the row the previous
# `.values[0]` lookup selected.
notes_lookup = df_patients[["pn_num", "pn_history"]].drop_duplicates("pn_num")
features_lookup = (
    df_features[["feature_num", "case_num", "feature_text"]]
    .drop_duplicates(["feature_num", "case_num"])
)
merged = (
    test_df[["pn_num", "feature_num", "case_num"]]
    .merge(notes_lookup, on="pn_num", how="left", validate="many_to_one")
    .merge(features_lookup, on=["feature_num", "case_num"], how="left", validate="many_to_one")
)

# A left merge leaves NaN where a key has no match; fail loudly here rather
# than later inside the tokenizer.
unmatched = merged[["pn_history", "feature_text"]].isna().any(axis=1)
if unmatched.any():
    raise ValueError(
        f"{int(unmatched.sum())} test rows have no matching patient note or feature text"
    )

# Same two columns, in the same order, as before: lower-cased note text and
# normalised feature text.
test_df_ = pd.DataFrame({
    "pn_history": merged["pn_history"].str.lower(),
    "feature_text": merged["feature_text"].apply(process_feature_text),
})

test_dataset = datasets.Dataset.from_pandas(test_df_)

In [None]:
# Preview the first rows of the preprocessed (pn_history, feature_text) frame.
test_df_.head()

In [None]:
# Keep the first preprocessed row around for manual inspection.
example = test_df_.iloc[0]

In [None]:
# Show the first 20 characters of the note and the slice [4:7], e.g. to
# eyeball how character positions line up with the text.
example['pn_history'][:20], example['pn_history'][4:7]

In [None]:
def predict_tokenize(example, tokenizer):
    """Tokenize one (feature_text, pn_history) pair for prediction.

    The feature text is passed as the first sequence and the patient note as
    the second; only the second sequence may be truncated, and the result is
    padded to ``config['max_length']`` with offset mappings requested.

    Args:
        example: mapping providing the 'feature_text' and 'pn_history' entries.
        tokenizer: tokenizer that is callable on a text pair and provides
            ``convert_ids_to_tokens``; its output must expose
            ``sequence_ids()`` and ``word_ids()``.

    Returns:
        The tokenizer output with three extra entries: 'seq_ids',
        'ids_to_tokens' and 'word_ids'.
    """
    encoding = tokenizer(
        example['feature_text'],
        example['pn_history'],
        truncation='only_second',
        max_length=config['max_length'],
        padding='max_length',
        return_offsets_mapping=True,
    )
    # Per-token metadata that the post-processing step reads later.
    extras = {
        'seq_ids': encoding.sequence_ids(),
        'ids_to_tokens': tokenizer.convert_ids_to_tokens(encoding['input_ids']),
        'word_ids': encoding.word_ids(),
    }
    for key, value in extras.items():
        encoding[key] = value
    return encoding

### Override the Trainer class with a custom `compute_loss` function.

In [None]:
class BinaryClassificationTrainer(Trainer):
    """Trainer whose loss is an element-wise BCE-with-logits over the labels."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the masked binary cross-entropy loss for one batch.

        "labels" is popped from ``inputs`` before the forward pass. Logits and
        labels are reshaped to ``(-1, num_labels)``, the loss is computed per
        element, and only elements whose label is greater than -1 contribute
        to the mean (presumably negative labels mark positions to ignore).

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        n_labels = self.model.config.num_labels
        criterion = torch.nn.BCEWithLogitsLoss(reduction="none")
        elementwise = criterion(
            outputs.logits.view(-1, n_labels),
            targets.float().view(-1, n_labels),
        )

        # Mask built from the labels reshaped to a single column.
        keep = targets.view(-1, 1) > -1
        loss = torch.masked_select(elementwise, keep).mean()

        if return_outputs:
            return (loss, outputs)
        return loss

In [None]:
# Only the evaluation batch size is customised; 'test_trainer' is the
# output directory argument of TrainingArguments.
args = TrainingArguments(
    output_dir='test_trainer',
    per_device_eval_batch_size=config['valid_batch_size'],
)

### Do the prediction and write support functions for results submission.

Make sure to take care of whitespace, as it is encoded into the offset mapping in DeBERTa.

In [None]:
# Run prediction once per fold checkpoint. For every fold, store a pair
# [tokenized dataset, sigmoid of the raw predictions] in fold_results.
fold_results = []
for fold in range(config['folds']):
    model_path = f"../input/dberta-5fold-on-mlm-colab/fold_{fold}"
    print(model_path)
    # Tokenizer and model are both loaded from the same fold directory.
    tokenizer = DebertaV2TokenizerFast.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    trainer = BinaryClassificationTrainer(
        model=model,
        args=args,
        tokenizer=tokenizer)
    # The test set is re-tokenized with this fold's tokenizer on every iteration.
    tokenized = test_dataset.map(predict_tokenize, fn_kwargs={"tokenizer":tokenizer})
    # Only input_ids / attention_mask are formatted as torch tensors;
    # output_all_columns=True keeps the other columns (offset_mapping, seq_ids,
    # ids_to_tokens, ...) readable from the dataset for post-processing.
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'], output_all_columns=True)
    results = trainer.predict(tokenized)
    # NOTE(review): presumably the predictions are per-token logits, so the
    # sigmoid yields per-token probabilities — confirm against the checkpoints.
    fold_results.append([tokenized, torch.sigmoid(torch.tensor(results.predictions))])

In [None]:
def combine_offsets(output_idx_m):
    """Group (start, end) offsets that touch or are one character apart.

    A span joins the previous group when its start equals the group's last
    end, or is exactly one greater. Joining appends the span's start and end
    to the group, so a group is a flat list ``[s0, e0, s1, e1, ...]`` whose
    first element is the group's start and whose last element is its end.

    Args:
        output_idx_m: sequence of (start, end) pairs; a falsy value
            (empty list, None) yields an empty result.

    Returns:
        List of groups as described above.
    """
    merged = []
    for span in output_idx_m or ():
        begin, finish = span[0], span[1]
        if not merged:
            merged.append([begin, finish])
            continue
        previous_end = merged[-1][-1]
        if begin == previous_end or begin - 1 == previous_end:
            merged[-1] += [begin, finish]
        else:
            merged.append([begin, finish])
    return merged

In [None]:
import re
def return_output(tokens, preds):
    """Turn per-token predictions for one example into a location string.

    Args:
        tokens: one tokenized example providing the 'offset_mapping',
            'ids_to_tokens' and 'seq_ids' entries.
        preds: per-token prediction scores, aligned with the token entries.

    Returns:
        Spans formatted as "start end" and joined with ";" (an empty string
        when no token qualifies).
    """
    offset_mapping = tokens['offset_mapping']
    ids_to_tokens = tokens['ids_to_tokens']
    seq_ids = tokens["seq_ids"]

    output_idx = []
    for idx, (seq_id, pred, (start, end)) in enumerate(zip(seq_ids, preds, offset_mapping)):
        # Skip tokens without a sequence id and tokens of the first sequence
        # (sequence id 0); only the second sequence is scored.
        if seq_id is None or seq_id == 0:
            continue
        if pred > 0.5:
            # A token starting with "▁" has the preceding space included in
            # its offset, so move the start forward by one to skip that space.
            if ids_to_tokens[idx].startswith('▁'):
                output_idx.append((start + 1, end))
            else:
                output_idx.append((start, end))

    # Each merged group starts at its first element and ends at its last one.
    return ";".join(f"{span[0]} {span[-1]}" for span in combine_offsets(output_idx))

### Average the predictions across folds and submit the results.

In [None]:
# Stack the second element of every fold entry (the sigmoid outputs) and
# average over the new leading fold dimension.
pred_mean = torch.stack([fold_probs for _, fold_probs in fold_results]).mean(dim=0)

In [None]:
# Sanity check: shape of the fold-averaged predictions.
pred_mean.shape

In [None]:
# One location string per test row: token metadata is taken from the first
# fold's tokenized dataset, scores from the fold-averaged predictions.
results = [
    return_output(fold_results[0][0][row_idx], pred_mean[row_idx])
    for row_idx in range(len(pred_mean))
]

In [None]:
# Build the submission frame in one step. `.assign` returns a new DataFrame,
# so no column is set on a slice of `test_df` (which would trigger pandas'
# SettingWithCopyWarning).
sub_df = test_df[["id"]].assign(location=results)
sub_df.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv("submission.csv", index=False)

### I hope you learnt a new way of using the Trainer class from Hugging Face. Upvote if you find it useful. Happy learning!