## Models and baseline

Thanks @abhishek and @chryzal

**two longformers are better than 1**

https://www.kaggle.com/abhishek/two-longformers-are-better-than-1

**Feedback Prize 2021 👏 Pytorch [better parameters]**

https://www.kaggle.com/chryzal/feedback-prize-2021-pytorch-better-parameters

# 1. Imports, Settings, Classes, Functions & Data Loading

In [None]:
import gc
gc.enable()

import os
import re
import random
import sys
sys.path.append("../input/tez-lib/")

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 80)
pd.set_option("display.precision", 12)

import tez
import torch
import torch.nn as nn

from joblib import Parallel, delayed
from transformers import AutoConfig, AutoModel, AutoTokenizer

from spacy import displacy

In [None]:
# Discourse types in label-id order: each type owns two consecutive ids,
# the even one for its "B-" (begin) tag and the odd one for its "I-" (inside) tag.
_DISCOURSE_TYPES = (
    "Lead",
    "Position",
    "Evidence",
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Rebuttal",
)

# Tag name -> class id.
target_id_map = {
    f"{prefix}-{name}": 2 * position + offset
    for position, name in enumerate(_DISCOURSE_TYPES)
    for offset, prefix in enumerate("BI")
}
# "O" (outside any discourse element) takes the next free id; "PAD" keeps -100.
target_id_map["O"] = len(target_id_map)
target_id_map["PAD"] = -100

# Reverse lookup: class id -> tag name.
id_target_map = dict(zip(target_id_map.values(), target_id_map))

# Highlight colour per discourse class, handed to displaCy as its "colors"
# option. Every value must be a valid colour string; hex values need the
# leading '#'.
COLORS = {
    'Lead': '#8000ff',
    'Position': '#2b7ff6',
    'Evidence': '#2adddd',
    'Claim': '#80ffb4',
    'Concluding Statement': '#d4dd80',
    'Counterclaim': '#ff8042',
    'Rebuttal': '#ff0000'
}

# Root directory of the competition data.
COMP_DIR = "../input/feedback-prize-2021/"

In [None]:
class args1:
    """Settings for the first checkpoint set (used for folds 0-4 of the ensemble)."""

    # Where the data lives and where outputs go.
    input_path = COMP_DIR
    output = "."

    # Backbone directory (tokenizer + config) and the directory holding the
    # fine-tuned ``model_<fold>.bin`` files.
    model = "../input/longformerlarge4096/longformer-large-4096/"
    tez_model= "../input/fblongformerlarge1536/"

    # Inference limits.
    max_len = 4096
    batch_size = 8
    
    
class args2:
    """Settings for the second checkpoint set (used for folds 5-9 of the ensemble)."""

    # Where the data lives and where outputs go.
    input_path = COMP_DIR
    output = "."

    # Same backbone directory as ``args1``; only the fine-tuned weights differ.
    model = "../input/longformerlarge4096/longformer-large-4096/"
    tez_model= "../input/tez-fb-large/"

    # Inference limits.
    max_len = 4096
    batch_size = 8

    
class FeedbackDataset:
    """Index-addressable view over pre-tokenized samples for inference.

    Each item is the sample's token ids wrapped in the tokenizer's CLS and SEP
    ids, truncated so the total never exceeds ``max_len``, together with an
    all-ones attention mask. No padding is applied here.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        tok = self.tokenizer

        # CLS goes first; keep at most ``max_len - 1`` ids so SEP still fits.
        ids = ([tok.cls_token_id] + self.samples[idx]["input_ids"])[: self.max_len - 1]
        ids = ids + [tok.sep_token_id]

        return {
            "ids": ids,
            "mask": [1] * len(ids),
        }

    
class Collate:
    """Batch collator: pads every sample to the longest one in the batch.

    Padding is placed on the side given by ``tokenizer.padding_side``; ids are
    filled with ``tokenizer.pad_token_id`` and masks with 0. Both outputs are
    returned as ``torch.long`` tensors under the keys ``"ids"`` and ``"mask"``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        pad_on_right = self.tokenizer.padding_side == "right"

        # Longest sequence in this batch decides the padded width.
        width = max(len(sample["ids"]) for sample in batch)

        def pad(sequence, fill):
            filler = (width - len(sequence)) * [fill]
            return sequence + filler if pad_on_right else filler + sequence

        padded_ids = [pad(sample["ids"], self.tokenizer.pad_token_id) for sample in batch]
        padded_mask = [pad(sample["mask"], 0) for sample in batch]

        return {
            "ids": torch.tensor(padded_ids, dtype=torch.long),
            "mask": torch.tensor(padded_mask, dtype=torch.long),
        }
    
    
class FeedbackModel(tez.Model):
    """Per-token classifier: a transformer encoder followed by a linear head.

    The encoder is instantiated from its configuration only
    (``AutoModel.from_config``), so this constructor does not load pretrained
    weights itself; the notebook loads fine-tuned weights afterwards via
    ``model.load``.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        # Start from the stored config and override a few fields.
        # NOTE(review): these overrides presumably have to match the values
        # used when the checkpoints were trained -- confirm before changing.
        config = AutoConfig.from_pretrained(model_name)
        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.2,
                "layer_norm_eps": 17589e-7,  # i.e. 1.7589e-3
                "add_pooling_layer": False,
            }
        )

        self.transformer = AutoModel.from_config(config)
        self.output = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return ``(probabilities, 0, {})`` for a batch of token ids and masks.

        The first element is the softmax over the label dimension of the linear
        head's output; the constant ``0`` and empty dict fill the remaining two
        slots of the returned triple (presumably tez's loss and metrics slots).
        """
        hidden_states = self.transformer(ids, mask).last_hidden_state
        probabilities = torch.softmax(self.output(hidden_states), dim=-1)
        return probabilities, 0, {}

In [None]:
def _prepare_test_data_helper(args, tokenizer, ids):
    test_samples = []
    for idx in ids:
        filename = os.path.join(args.input_path, "test", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        test_samples.append(sample)
    return test_samples


def prepare_test_data(df, tokenizer, args):
    """Tokenize every unique essay id in ``df["id"]`` using four worker processes.

    The unique ids are split into four chunks, each chunk is handed to
    ``_prepare_test_data_helper`` in its own job, and the per-chunk results are
    concatenated in chunk order.

    Returns:
        A flat list of sample dicts as produced by ``_prepare_test_data_helper``.
    """
    id_chunks = np.array_split(df["id"].unique(), 4)

    run_jobs = Parallel(n_jobs=4, backend="multiprocessing")
    chunk_results = run_jobs(
        delayed(_prepare_test_data_helper)(args, tokenizer, chunk) for chunk in id_chunks
    )

    return [sample for chunk_samples in chunk_results for sample in chunk_samples]


def jn(pst, start, end):
    """Return the items of ``pst[start:end]`` as one space-separated string."""
    return " ".join(map(str, pst[start:end]))


def link_evidence(oof):
    thresh = 1
    thresh2 = 30  # Baseline: 26
    
    idu = oof['id'].unique()
    idc = idu[1]
    
    eoof = oof[oof['class'] == "Evidence"]
    neoof = oof[oof['class'] != "Evidence"]
    
    retval = []
    
    for idv in idu:
        for c in  ['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
               'Counterclaim', 'Rebuttal']:
            q = eoof[(eoof['id'] == idv) & (eoof['class'] == c)]
            if len(q) == 0:
                continue
            pst = []
            for i,r in q.iterrows():
                pst = pst +[-1] + [int(x) for x in r['predictionstring'].split()]
            start = 1
            end = 1
            for i in range(2,len(pst)):
                cur = pst[i]
                end = i

                if (cur == -1 and c != 'Evidence') or ((cur == -1) and ((pst[i+1] > pst[end-1] + thresh) or (pst[i+1] - pst[start] > thresh2))):
                    retval.append((idv, c, jn(pst, start, end)))
                    start = i + 1
            v = (idv, c, jn(pst, start, end+1))

            retval.append(v)
            
    roof = pd.DataFrame(retval, columns = ['id', 'class', 'predictionstring']) 
    roof = roof.merge(neoof, how='outer')
    
    return roof


def visualize(example):
    """Render the predicted discourse spans of one test essay with displaCy.

    Reads ``<COMP_DIR>/test/<example>.txt``, looks up the essay's rows in the
    module-level ``submission`` frame, converts each prediction string (word
    indices) to a character span via the positions of the essay's
    whitespace-separated words, and renders the spans inline in the notebook.

    Returns ``None``; returns early without rendering when the essay file does
    not exist.
    """
    example_path = os.path.join(COMP_DIR, 'test', example + ".txt")
    if not os.path.isfile(example_path):
        return None

    with open(example_path, 'r') as file:
        data = file.read()

    # word index -> [char start, char end] of that whitespace-delimited word
    word_spans = {
        position: [match.start(), match.end()]
        for position, match in enumerate(re.finditer(r'\S+', data))
    }

    # This essay's predictions, ordered by the first word of each span.
    essay_rows = submission[submission["id"] == example].copy()
    essay_rows['indx'] = essay_rows["predictionstring"].apply(lambda s: int(s.split()[0]))
    essay_rows = essay_rows.sort_values(by='indx')

    ents = []
    for _, row in essay_rows.iterrows():
        words = row["predictionstring"].split()
        first_word = int(words[0])
        last_word = int(words[-1])
        ents.append({
            'start': word_spans.get(first_word)[0],
            'end': word_spans.get(last_word)[1],
            'label': row["class"]
        })

    rendered_doc = {
        "text": data,
        "ents": ents,
        "title": None
    }
    render_options = {"ents": list(COLORS.keys()), "colors": COLORS}
    displacy.render(rendered_doc, style="ent", options=render_options, manual=True, jupyter=True)

In [None]:
# The sample submission supplies the ids of the essays to predict.
df = pd.read_csv("../input/feedback-prize-2021/sample_submission.csv")
df_ids = pd.unique(df["id"])

In [None]:
%whos module

In [None]:
%whos function

In [None]:
%whos dict or str or DataFrame or ndarray

# 2. Create test_samples (Prediction & Scores)

In [None]:
# Tokenize the test essays once; both checkpoint sets share the same backbone
# directory, so the tokenizer from ``args1`` is used throughout.
tokenizer = AutoTokenizer.from_pretrained(args1.model)
test_samples = prepare_test_data(df, tokenizer, args1)

collate = Collate(tokenizer=tokenizer)

# raw_preds[k] accumulates, for the k-th batch, the mean of the per-token class
# probabilities over all 10 checkpoints (each contribution is divided by 10).
raw_preds = []
for fold_ in range(10):
    test_dataset = FeedbackDataset(test_samples, args1.max_len, tokenizer)

    # Folds 0-4 load model_0..4 of the first set, folds 5-9 model_0..4 of the second.
    fold_args, fold_number = (args1, fold_) if fold_ < 5 else (args2, fold_ - 5)

    model = FeedbackModel(model_name=fold_args.model, num_labels=len(target_id_map) - 1)
    model.load(os.path.join(fold_args.tez_model, f"model_{fold_number}.bin"), weights_only=True)
    preds_iter = model.predict(test_dataset, batch_size=fold_args.batch_size, n_jobs=-1, collate_fn=collate)

    for current_idx, preds in enumerate(preds_iter):
        # Stored as float16 (halves the memory of the accumulated predictions).
        preds = preds.astype(np.float16) / 10
        if fold_ == 0:
            raw_preds.append(preds)
        else:
            raw_preds[current_idx] += preds

    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# Per essay: the winning class id and its averaged probability for every token
# position, flattened across batches in batch order.
final_preds = []
final_scores = []

for rp in raw_preds:
    final_preds.extend(np.argmax(rp, axis=2).tolist())
    final_scores.extend(np.max(rp, axis=2).tolist())

# Attach tag names and scores to each sample. Position 0 is dropped because it
# belongs to the CLS token that the dataset prepends to every sequence.
for j, sample in enumerate(test_samples):
    sample["preds"] = [id_target_map[class_id] for class_id in final_preds[j][1:]]
    sample["pred_scores"] = final_scores[j][1:]

In [None]:
# Number of tokenized test essays; the bare name on the last line makes the
# notebook display the value.
n_samples = len(test_samples)
n_samples

In [None]:
# Sanity check: print the head of every field of one randomly chosen sample.
check_sample = test_samples[random.randint(0, n_samples-1)]
max_len_values = 180

print(check_sample.keys(), "\n")
print(check_sample.get('id'), "\n")
# The text is character-based, so it gets a 5x larger window than the token lists.
print(check_sample.get('text')[:max_len_values * 5], "... \n")
for field in ('input_ids', 'offset_mapping', 'preds', 'pred_scores'):
    print(check_sample.get(field)[:max_len_values], "... \n")

# 3. Create & Save Submission

In [None]:
# Post-processing thresholds per discourse class. A predicted phrase is kept
# only if its mean token score reaches the probability threshold AND it spans
# at least the minimum number of words.
# The trailing comment on each row records the baseline (probability, words).
_CLASS_THRESHOLDS = (
    # class,                  proba,  min words     baseline
    ("Lead",                 0.6875, 9),         # 0.6875, 9
    ("Position",             0.5550, 5),         # 0.5375, 5
    ("Evidence",             0.5300, 7),         # 0.6375, 14
    ("Claim",                0.5300, 2),         # 0.5375, 3
    ("Concluding Statement", 0.7100, 5),         # 0.6875, 11
    ("Counterclaim",         0.5375, 6),         # 0.5375, 6
    ("Rebuttal",             0.5375, 4),         # 0.5375, 4
)

proba_thresh = {label: proba for label, proba, _ in _CLASS_THRESHOLDS}

min_thresh = {label: words for label, _, words in _CLASS_THRESHOLDS}

In [None]:
# Turn per-token tags into submission rows (id, class, predictionstring).
submission = []

for sample in test_samples:
    preds = sample["preds"]
    offset_mapping = sample["offset_mapping"]
    sample_id = sample["id"]
    sample_text = sample["text"]
    sample_pred_scores = sample["pred_scores"]
    n_tokens = len(offset_mapping)

    # Fewer predictions than tokens: treat the uncovered tail as "O" with score 0.
    if len(preds) < n_tokens:
        preds = preds + ["O"] * (n_tokens - len(preds))
        sample_pred_scores = sample_pred_scores + [0] * (n_tokens - len(sample_pred_scores))

    # Group consecutive tokens into phrases: a phrase starts at any token and
    # continues while the following tokens carry the matching "I-<label>" tag
    # (or "O" for an outside run).
    idx = 0
    phrase_preds = []
    while idx < n_tokens:
        # Seed BOTH ends from the phrase's first token. Previously `end` was
        # only assigned by continuation tokens and guarded with
        # `"end" in locals()`, so a single-token phrase silently reused the
        # `end` of an earlier phrase (even from another essay) or was skipped.
        start, end = offset_mapping[idx]
        label = "O" if preds[idx] == "O" else preds[idx][2:]
        matching_label = "O" if label == "O" else f"I-{label}"
        phrase_scores = [sample_pred_scores[idx]]
        idx += 1
        while idx < n_tokens and preds[idx] == matching_label:
            _, end = offset_mapping[idx]
            phrase_scores.append(sample_pred_scores[idx])
            idx += 1
        phrase_preds.append((sample_text[start:end], start, end, label, phrase_scores))

    # Convert character spans to word-index ranges and apply the thresholds.
    n_words = len(sample_text.split())
    for phrase, start, end, label, phrase_scores in phrase_preds:
        if label == "O":
            continue
        word_start = len(sample_text[:start].split())
        word_end = word_start + len(sample_text[start:end].split())
        word_end = min(word_end, n_words)
        ps = " ".join([str(x) for x in range(word_start, word_end)])
        if sum(phrase_scores) / len(phrase_scores) >= proba_thresh[label]:
            if len(ps.split()) >= min_thresh[label]:
                submission.append((sample_id, label, ps))

submission = pd.DataFrame(submission, columns=["id", "class", "predictionstring"])
submission = link_evidence(submission)

In [None]:
# Preview the first rows of the final submission frame.
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)

# 4. Visualising the result

In [None]:
# For every essay: show its predictions ordered by the first word index of
# each span, then render the highlighted text.
for g_id, g_data in submission.groupby('id'):
    first_word = g_data['predictionstring'].str.split(expand=True)[0].astype(int)
    row_order = first_word.sort_values().index

    display(g_data.loc[row_order, :])
    visualize(g_id)