# Imports and helpers

## Imports

In [None]:
import gc
gc.enable()
import sys
import os
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from joblib import Parallel, delayed
from transformers import AutoConfig, AutoModel, AutoTokenizer
from tqdm.notebook import tqdm

## Helpers/functions from the original "2 longformers" Notebook

In [None]:
# BIO tag -> integer class id. Every discourse type owns two consecutive ids
# ("B-" = even id, "I-" = following odd id); "O" and "PAD" are added afterwards.
target_id_map = {
    f"{prefix}-{discourse_type}": 2 * position + offset
    for position, discourse_type in enumerate(
        (
            "Lead",
            "Position",
            "Evidence",
            "Claim",
            "Concluding Statement",
            "Counterclaim",
            "Rebuttal",
        )
    )
    for offset, prefix in enumerate("BI")
}
target_id_map["O"] = 14
target_id_map["PAD"] = -100


# Reverse lookup: integer class id -> BIO tag.
id_target_map = dict(zip(target_id_map.values(), target_id_map.keys()))

class args1:
    # Configuration namespace: values are read as plain class attributes
    # (e.g. `args1.model`); the class is never instantiated in the visible code.

    # Root data folder; the essay loader joins it with "train"/"test" and "<id>.txt".
    input_path = "../input/feedback-prize-2021/"
    # Path handed to AutoTokenizer.from_pretrained when the folds are tokenized.
    model = "../input/longformerlarge4096/longformer-large-4096/"
    # Not referenced in the visible code; presumably the fine-tuned checkpoint
    # folder for this model -- TODO confirm.
    tez_model= "../input/fblongformerlarge1536/"
    # The three settings below are not referenced in the visible code.
    output = "."
    batch_size = 8
    max_len = 4096
    
class args2:
    # Second configuration namespace, identical to `args1` except for
    # `tez_model`. NOTE(review): no attribute of this class is read anywhere in
    # the visible code -- presumably it belongs to the second model of the
    # ensemble; confirm before removing.

    # Root data folder of the competition.
    input_path = "../input/feedback-prize-2021/"
    # Base model / tokenizer folder (same value as args1.model).
    model = "../input/longformerlarge4096/longformer-large-4096/"
    # Presumably the fine-tuned checkpoint folder for the second model -- TODO confirm.
    tez_model= "../input/tez-fb-large/"
    output = "."
    batch_size = 8
    max_len = 4096

In [None]:
def _prepare_test_data_helper(args, tokenizer, ids, train_or_test):
    test_samples = []
    for idx in ids:
        filename = os.path.join(args.input_path, train_or_test, idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        test_samples.append(sample)
    return test_samples


def prepare_test_data(ids, tokenizer, args, train_or_test):
    """
    Tokenize the essays in `ids` using 4 worker processes.

    The ids are cut into 4 chunks with `np.array_split`; each chunk is handled
    by `_prepare_test_data_helper` in its own process and the per-chunk results
    are concatenated in chunk order.

    Returns:
        A flat list of sample dicts as produced by `_prepare_test_data_helper`.
    """
    id_chunks = np.array_split(ids, 4)
    jobs = (
        delayed(_prepare_test_data_helper)(args, tokenizer, chunk, train_or_test)
        for chunk in id_chunks
    )
    run_in_parallel = Parallel(n_jobs=4, backend="multiprocessing")
    samples_per_chunk = run_in_parallel(jobs)

    # Flatten while keeping the chunk order.
    return [sample for chunk_samples in samples_per_chunk for sample in chunk_samples]

## Metric calculation functions

In [None]:
# Discourse classes that are scored. `kaggle_score` computes one set of metrics
# per entry, in this order, and averages the per-class F1 values.
CLASSES_LIST = [
    "Lead",
    "Position",
    "Claim",
    "Counterclaim",
    "Rebuttal",
    "Evidence",
    "Concluding Statement"
]

In [None]:
# Code adapted from Rob Mulla (@robikscube) (https://www.kaggle.com/robikscube/student-writing-competition-twitch)
def is_match(row):
    """
    Tell whether a prediction and a ground truth overlap enough to match.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`, both
    space-separated strings. The pair matches when the shared tokens cover at
    least half of the ground truth AND at least half of the prediction.
    Only used internally in get_scores.
    """
    pred_tokens = set(row.predictionstring_pred.split(" "))
    gt_tokens = set(row.predictionstring_gt.split(" "))
    shared = len(gt_tokens & pred_tokens)

    # Guard clauses: both coverage ratios must reach 0.5.
    if shared / len(gt_tokens) < 0.5:
        return False
    return shared / len(pred_tokens) >= 0.5

In [None]:
def get_scores(pred_df, gt_df):
    """
    Returns precision, recall and f1 scores. Only used internally in kaggle_score
    for one class at a time.

    Parameters:
        pred_df: predictions with the columns "id", "class", "predictionstring".
        gt_df: ground truths with the columns "id", "discourse_type",
            "predictionstring".

    Returns:
        A dict with the keys "precision", "recall" and "f1". When one of the
        frames is empty, the metric that has no mathematical meaning is NaN:
        precision without predictions, recall without ground truths, and all
        three (f1 included) when both frames are empty. Otherwise f1 is 0 as
        soon as one side is empty.
    """
    # Checking DataFrames emptiness before proceeding with calculations.
    # Each metric is decided independently so that the "both empty" case
    # reports NaN for precision too (it used to be overwritten with 0).
    no_predictions = pred_df.empty
    no_ground_truth = gt_df.empty
    if no_predictions or no_ground_truth:
        return {
            "precision": np.nan if no_predictions else 0,
            "recall": np.nan if no_ground_truth else 0,
            "f1": np.nan if (no_predictions and no_ground_truth) else 0,
        }

    # If no DataFrame is empty, we proceed:
    gt_df = gt_df[["id", "discourse_type", "predictionstring"]].reset_index(drop=True).copy()
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    # After reset_index the positional index doubles as a unique row id.
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index

    # All ground truths and predictions for a given class are compared.
    # Rows without a counterpart get "" in the missing columns, which
    # is_match treats as "no overlap".
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    ).fillna("")

    # Purposely ignoring the multiple-match possibility (very unlikely) for efficiency
    tp_df = joined[joined.apply(is_match, axis=1)]

    TP = tp_df.shape[0]
    # Predictions / ground truths that took part in no match.
    FP = pred_df.drop(tp_df["pred_id"]).shape[0]
    FN = gt_df.drop(tp_df["gt_id"]).shape[0]

    # Returning metrics
    return {
        "precision": TP / (TP + FP),
        "recall": TP / (TP + FN),
        "f1": TP / (TP + 0.5 * (FP + FN)),
    }

In [None]:
def kaggle_score(pred_df, gt_df, return_details=False):
    """
    Score predictions for the kaggle Student Writing Competition.

    Follows the steps of the evaluation page, with a simplified (= random)
    calculation when 2 matches exist for the same discourse element.
    See https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    The result is the NaN-ignoring mean of the per-class F1 values computed by
    get_scores for every class of CLASSES_LIST. With `return_details=True` a
    tuple `(f1_score, {class: metrics})` is returned instead of the bare score.
    """
    scores = []
    for class_ in CLASSES_LIST:
        class_predictions = pred_df[pred_df["class"] == class_]
        class_ground_truths = gt_df[gt_df["discourse_type"] == class_]
        scores.append(get_scores(class_predictions, class_ground_truths))

    per_class_f1 = [class_scores["f1"] for class_scores in scores]
    f1_score = np.nanmean(per_class_f1)

    if not return_details:
        return f1_score
    return f1_score, dict(zip(CLASSES_LIST, scores))

# Generating predictions

In [None]:
def jn(pst, start, end):
    """Return the items of `pst[start:end]` as one space-separated string."""
    return " ".join(map(str, pst[start:end]))


def link_evidence(oof, thresh=1, thresh2=26):
    """
    Merge neighbouring "Evidence" predictions of the same essay.

    For every essay id, the Evidence rows are visited in row order. A row is
    appended to the current merged span unless one of these holds, in which
    case a new span is started:
      * its first word index is more than `thresh` after the last word index
        of the previous row;
      * its first word index is more than `thresh2` after the first word index
        of the current merged span.
    Rows of every other class are passed through untouched.

    Differences from the previous implementation:
      * the internal -1 separator no longer leaks into merged
        predictionstrings (it is not a word index);
      * Evidence rows with an empty predictionstring are skipped instead of
        risking an IndexError or producing an empty prediction.

    Parameters:
        oof: DataFrame with at least the columns "id", "class" and
            "predictionstring" (space-separated integer word indices).
        thresh: largest accepted gap between two consecutive Evidence rows.
        thresh2: largest accepted distance between the start of a merged span
            and the start of a row that is merged into it.

    Returns:
        DataFrame with the merged Evidence rows outer-merged with the
        non-Evidence rows of `oof`.
    """
    eoof = oof[oof["class"] == "Evidence"]
    neoof = oof[oof["class"] != "Evidence"]

    # Group the Evidence word indices per essay in a single pass, keeping the
    # row order inside each essay.
    segments_by_id = {}
    for essay_id, predictionstring in zip(eoof["id"], eoof["predictionstring"]):
        words = [int(x) for x in predictionstring.split()]
        if words:
            segments_by_id.setdefault(essay_id, []).append(words)

    retval = []
    # Essays are emitted in their order of first appearance in `oof`.
    for idv in oof["id"].unique():
        segments = segments_by_id.get(idv)
        if not segments:
            continue
        current = list(segments[0])
        for segment in segments[1:]:
            gap_too_large = segment[0] > current[-1] + thresh
            span_too_long = segment[0] - current[0] > thresh2
            if gap_too_large or span_too_long:
                retval.append((idv, "Evidence", " ".join(map(str, current))))
                current = list(segment)
            else:
                current.extend(segment)
        retval.append((idv, "Evidence", " ".join(map(str, current))))

    roof = pd.DataFrame(retval, columns=["id", "class", "predictionstring"])
    roof = roof.merge(neoof, how="outer")
    return roof

In [None]:
# Calculating a weighted average of the 2 models

def weighting_pred(dict_preds, alpha=.5):
    """
    Blend the predictions of the two models for each of the 5 folds.

    For every fold k in 0..4, `dict_preds[f"fold{k}"]["model_avg"]` is
    (re)created as the list of `alpha * model0_batch + (1 - alpha) *
    model1_batch`, pairing the batches of "model0" and "model1" in order.

    Parameters:
        dict_preds: mapping with the keys "fold0".."fold4", each holding the
            batch lists "model0" and "model1". It is modified in place.
        alpha: weight of "model0"; "model1" gets `1 - alpha`.

    Returns:
        The same `dict_preds` object, for convenience.
    """
    for fold in range(5):
        fold_preds = dict_preds[f"fold{fold}"]
        # The weight must come from the `alpha` argument (the body used to read
        # an undefined `alpha0` name).
        fold_preds["model_avg"] = [
            batch_preds0 * alpha + batch_preds1 * (1 - alpha)
            for batch_preds0, batch_preds1 in zip(fold_preds["model0"], fold_preds["model1"])
        ]

    return dict_preds

In [None]:
### CREATE FOLDS 

# Competition training annotations, read when the cell runs. The same file is
# read again further down, right before `score_per_fold`, which uses it as the
# ground-truth frame for scoring.
train_df = pd.read_csv("../input/feedback-prize-2021/train.csv")

def creating_predictions_per_fold(dict_preds):
    """
    Build the tokenized samples, with their token-level predictions, for the 5 folds.

    Parameters:
        dict_preds: mapping whose `dict_preds[f"fold{k}"]["model_avg"]` entry is
            the list of prediction batches of fold k. Each batch is reduced
            with argmax / max over axis 2 (presumably the class axis of a
            (batch, tokens, classes) array -- TODO confirm).

    Returns:
        A tuple `(folds, folds_ids)` of dicts keyed by fold number (0..4):
        `folds[k]` is the list of sample dicts of fold k, each extended with
        "preds" (BIO tags) and "pred_scores" (score of the predicted tag);
        `folds_ids[k]` is the array of essay ids of fold k.
    """
    # Loop-invariant setup: the fold assignment file and the tokenizer are the
    # same for every fold, so they are loaded once instead of once per fold.
    ids_fold = pd.read_csv("../input/ids-fold/ids_folds.csv")
    tokenizer = AutoTokenizer.from_pretrained(args1.model)
    train_or_test = "train"

    folds = {}
    folds_ids = {}

    for fold_number in range(5):
        ids_fold_i = ids_fold.id[ids_fold.kfold == fold_number].values
        test_samples = prepare_test_data(ids_fold_i, tokenizer, args1, train_or_test)

        # Flatten the batches into one row of class ids / scores per essay.
        final_preds = []
        final_scores = []
        for rp in dict_preds[f"fold{fold_number}"]["model_avg"]:
            final_preds.extend(np.argmax(rp, axis=2).tolist())
            final_scores.extend(np.max(rp, axis=2).tolist())

        # Predictions are paired with samples by position. The first position
        # of every prediction row is dropped -- presumably a leading special
        # token added at inference time, since the tokenization done here adds
        # none (TODO confirm).
        for j, sample in enumerate(test_samples):
            sample["preds"] = [id_target_map[p] for p in final_preds[j][1:]]
            sample["pred_scores"] = final_scores[j][1:]

        folds[fold_number] = test_samples
        folds_ids[fold_number] = ids_fold_i

    return folds, folds_ids

In [None]:
def prepare_submission_df_per_fold(test_samples, proba_thresh, min_thresh):
    """
    Returns the submission df from a given fold (test_samples) with params thresh.

    Consecutive tokens are grouped into phrases: a phrase starts at any token
    and is extended while the following tokens carry the matching "I-<label>"
    tag (or "O" for an "O" phrase). A non-"O" phrase is kept when the mean of
    its token scores is at least `proba_thresh[label]` and it spans at least
    `min_thresh[label]` words. Evidence rows are finally merged by
    `link_evidence`.

    Parameters:
        test_samples: list of dicts with the keys "id", "text",
            "offset_mapping", "preds" (BIO tags) and "pred_scores".
        proba_thresh: dict label -> minimum mean token score.
        min_thresh: dict label -> minimum number of words.

    Returns:
        DataFrame with the columns "id", "class", "predictionstring".
    """
    rows = []
    for sample in test_samples:
        preds = sample["preds"]
        offset_mapping = sample["offset_mapping"]
        sample_id = sample["id"]
        sample_text = sample["text"]
        sample_pred_scores = sample["pred_scores"]

        n_tokens = len(offset_mapping)
        # Tokens without a prediction are treated as "O" with a score of 0.
        if len(preds) < n_tokens:
            preds = preds + ["O"] * (n_tokens - len(preds))
            sample_pred_scores = sample_pred_scores + [0] * (n_tokens - len(sample_pred_scores))

        total_words = len(sample_text.split())

        idx = 0
        while idx < n_tokens:
            # Take BOTH offsets of the first token. `end` used to be set only
            # when a phrase had a second token, so a single-token phrase
            # silently reused the `end` of an earlier phrase (possibly from a
            # previous essay), or was dropped if no `end` existed yet.
            start, end = offset_mapping[idx]
            label = preds[idx][2:] if preds[idx] != "O" else "O"
            matching_label = "O" if label == "O" else f"I-{label}"
            phrase_scores = [sample_pred_scores[idx]]
            idx += 1
            while idx < n_tokens and preds[idx] == matching_label:
                _, end = offset_mapping[idx]
                phrase_scores.append(sample_pred_scores[idx])
                idx += 1

            if label == "O":
                continue

            # Convert the character span into whitespace-separated word indices.
            word_start = len(sample_text[:start].split())
            word_end = word_start + len(sample_text[start:end].split())
            word_end = min(word_end, total_words)
            word_ids = range(word_start, word_end)

            mean_score = sum(phrase_scores) / len(phrase_scores)
            if mean_score >= proba_thresh[label] and len(word_ids) >= min_thresh[label]:
                rows.append((sample_id, label, " ".join(str(x) for x in word_ids)))

    # One frame built from all rows; unlike pd.concat over per-essay frames,
    # this also works when `test_samples` is empty.
    submission = pd.DataFrame(rows, columns=["id", "class", "predictionstring"])
    submission = link_evidence(submission)
    return submission

In [None]:
# Competition training annotations. This must be defined before
# `score_per_fold`, whose `train_df` default argument is bound to this object
# when the function is defined.
train_df = pd.read_csv("../input/feedback-prize-2021/train.csv")

def score_per_fold(fold_submission,fold_ids,train_df=train_df):
    """
    Score the predictions of one fold.

    The ground truth is the subset of `train_df` whose `id` is in `fold_ids`;
    the value returned is the overall score computed by `kaggle_score`.
    """
    belongs_to_fold = train_df.id.isin(fold_ids)
    return kaggle_score(fold_submission, train_df[belongs_to_fold], return_details=False)

In [None]:
def grid_search(dict_preds,min_thresh_list,proba_tresh_list,alpha_list):
    """
    Score every (alpha, proba_thresh, min_thresh) combination on the folds.

    Parameters:
        dict_preds: per-fold predictions of both models, as expected by
            `weighting_pred`; it is updated in place for every alpha.
        min_thresh_list: candidate dicts label -> minimum number of words.
        proba_tresh_list: candidate dicts label -> minimum mean token score.
            (The misspelled parameter name is kept so existing keyword callers
            keep working.)
        alpha_list: candidate weights of the first model in the blend.

    Returns:
        A list with one dict per combination, with the keys "alpha",
        "proba_thresh", "min_thresh" and "scores" (one score per fold, in fold
        order).
    """
    results = []

    for alpha in alpha_list:
        dict_preds = weighting_pred(dict_preds, alpha=alpha)
        folds, folds_ids = creating_predictions_per_fold(dict_preds)

        # Use the argument: the body used to read a global `proba_thresh_list`
        # and ignored what the caller passed.
        for pt in proba_tresh_list:
            for mt in min_thresh_list:
                scores = []
                for fold_number, fold_samples in tqdm(folds.items()):
                    fold_submission = prepare_submission_df_per_fold(
                        test_samples=fold_samples, proba_thresh=pt, min_thresh=mt
                    )
                    # Score against the ids of THIS fold only (the whole
                    # fold -> ids mapping used to be passed, under a name that
                    # was not defined in this function).
                    scores.append(
                        score_per_fold(fold_submission, folds_ids[fold_number], train_df=train_df)
                    )
                results.append(dict(alpha=alpha, proba_thresh=pt, min_thresh=mt, scores=scores))

    return results

In [None]:
# Grid-search candidates. Each list holds one or more complete settings; every
# combination of the lists is evaluated by `grid_search`.

# Per-class minimum mean token score a predicted phrase needs in order to be kept.
proba_thresh_list = [{
    "Lead": 0.7,
    "Position": 0.55,
    "Evidence": 0.65,
    "Claim": 0.55,
    "Concluding Statement": 0.7,
    "Counterclaim": 0.5,
    "Rebuttal": 0.55,
}]

# Per-class minimum number of words a predicted phrase needs in order to be kept.
min_thresh_list = [{
    "Lead": 9,
    "Position": 5,
    "Evidence": 14,
    "Claim": 3,
    "Concluding Statement": 11,
    "Counterclaim": 6,
    "Rebuttal": 4,
}]

# Weights given to the first model ("model0") when blending the two models;
# the second model gets 1 - alpha.
alpha_list = [.5]

In [None]:
# Loading predictions
# The rest of the notebook reads dict_preds["fold0".."fold4"]["model0" / "model1"]
# as lists of prediction batches.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files produced by a trusted pipeline.
with open("../input/preds-cv/dict_all_preds.pickle", 'rb') as f:
    dict_preds = pickle.load(f)

In [None]:
# Stand-alone blend with equal weights; adds the "model_avg" entry to every fold.
# `grid_search` performs this step again for each alpha it evaluates.
dict_preds = weighting_pred(dict_preds,alpha=.5)

In [None]:
# Stand-alone construction of the per-fold samples and essay ids from the
# blended predictions. `grid_search` rebuilds both itself for each alpha.
folds,folds_id = creating_predictions_per_fold(dict_preds)

In [None]:
%%time
# Evaluate every candidate combination; `res` is a list of dicts with the keys
# "alpha", "proba_thresh", "min_thresh" and "scores".
res = grid_search(dict_preds,min_thresh_list,proba_thresh_list,alpha_list)