## Context

In this notebook, I choose the threshold parameters (manually or randomly) and visualize how the resulting predictions change.

To do this, I saved the previously prepared data in a dataset, so the parameters can be checked without using the GPU.

## Models and baseline

Thanks to @abhishek and @chryzal.

**two longformers are better than 1**

https://www.kaggle.com/abhishek/two-longformers-are-better-than-1

**Feedback Prize 2021 👏 Pytorch [better parameters]**

https://www.kaggle.com/chryzal/feedback-prize-2021-pytorch-better-parameters

## Dataset

**Custom Dataset for Evaluating Student Writing Competition**

https://www.kaggle.com/renokan/dataset-student-writing-2

# 1. Import & Def & Load

In [None]:
import numpy as np
import pandas as pd

import re
import random
import pickle

from spacy import displacy

In [None]:
def get_proba_thresh(param_list):
    """Fill in missing probability thresholds with random values.

    Args:
        param_list: Iterable of thresholds. A ``None`` entry means
            "pick a value at random".

    Returns:
        A new list, in the same order as ``param_list``, where every ``None``
        is replaced by ``round(random.uniform(0.5, 0.70), 2)`` and every
        other entry is passed through unchanged.
    """
    # Identity check (`is None`) rather than `== None`: equality can be
    # overridden by the compared object, identity cannot.
    return [
        round(random.uniform(0.5, 0.70), 2) if param is None else param
        for param in param_list
    ]


def get_min_thresh(param_list):
    """Fill in missing minimum-length thresholds with random integers.

    Args:
        param_list: Iterable of thresholds. A ``None`` entry means
            "pick a value at random".

    Returns:
        A new list, in the same order as ``param_list``, where every ``None``
        is replaced by ``random.randint(3, 12)`` (both ends inclusive) and
        every other entry is passed through unchanged.
    """
    # Identity check (`is None`) rather than `== None`: equality can be
    # overridden by the compared object, identity cannot.
    return [
        random.randint(3, 12) if param is None else param
        for param in param_list
    ]


def visualize(txt_id, df, dir_path=None, prefix_id=None):
    """Render the labelled spans of one essay with displaCy.

    Args:
        txt_id: Essay id. The text is read from ``{dir_path}/{txt_id}.txt``.
        df: DataFrame with ``id``, ``class`` and ``predictionstring`` columns.
            ``predictionstring`` holds space-separated indices of
            whitespace-delimited words in the essay text.
        dir_path: Directory containing the essay text files. Falls back to
            the dataset's ``samples`` directory when empty or ``None``.
        prefix_id: Optional text put in front of the id in the rendered title.

    Returns:
        None. The highlighted text is rendered inline in the notebook.
    """
    if not dir_path:
        dir_path = "../input/dataset-student-writing-2/samples"

    colors = {
        'Lead': '#8000ff',
        'Position': '#2b7ff6',
        'Evidence': '#2adddd',
        'Claim': '#80ffb4',
        # Fixed: the leading '#' was missing, so this was not a valid hex colour.
        'Concluding Statement': '#d4dd80',
        'Counterclaim': '#ff8042',
        'Rebuttal': '#ff0000'
    }

    with open(f'{dir_path}/{txt_id}.txt', 'r') as file:
        data = file.read()

    # Word index -> [start, end] character offsets of that word in the text.
    inds_dict = {
        i: [match.start(), match.end()]
        for i, match in enumerate(re.finditer(r'\S+', data))
    }

    # Keep this essay's rows only, ordered by the first word of each span.
    example_df = df[df['id'] == txt_id].copy()
    example_df['indx'] = example_df['predictionstring'].apply(lambda s: int(s.split()[0]))
    example_df = example_df.sort_values(by='indx')

    ents = []
    for _, row in example_df.iterrows():
        words_list = row['predictionstring'].split()
        start_inds = inds_dict.get(int(words_list[0]))
        end_inds = inds_dict.get(int(words_list[-1]))

        # Spans whose first or last word index is not in the text are skipped.
        if start_inds and end_inds:
            ents.append({
                'start': start_inds[0],
                'end': end_inds[1],
                'label': row['class']
            })

    title = prefix_id + " " + txt_id if prefix_id else txt_id

    doc2 = {
        "text": data,
        "ents": ents,
        "title": title
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)


def jn(pst, start, end):
    """Return the items of ``pst[start:end]`` as one space-separated string."""
    window = pst[start:end]
    return " ".join(map(str, window))


def link_evidence(oof):
    """Merge neighbouring "Evidence" spans that belong to the same essay.

    Within each essay the "Evidence" rows are walked in their existing order.
    A row is appended to the current merged span unless its first word index

    * is more than ``thresh`` (1) above the last word of the current span, or
    * is more than ``thresh2`` (26) above the first word of the current span,

    in which case the current span is emitted and a new one is started.
    Rows of every other class are passed through untouched.

    Args:
        oof: DataFrame with ``id``, ``class`` and ``predictionstring``
            columns; ``predictionstring`` holds space-separated word indices.

    Returns:
        DataFrame with the merged "Evidence" rows outer-merged with the
        non-"Evidence" rows of ``oof``.
    """
    thresh = 1    # max gap between the end of one span and the start of the next
    thresh2 = 26  # max distance between the merged span's start and the next start

    eoof = oof[oof['class'] == "Evidence"]
    neoof = oof[oof['class'] != "Evidence"]

    # Fixed: the unused `idu[1]` lookup is gone; it raised IndexError whenever
    # fewer than two distinct ids were present.
    retval = []
    for idv in oof['id'].unique():
        group = []  # word indices of the span currently being built
        for ps in eoof.loc[eoof['id'] == idv, 'predictionstring']:
            words = [int(x) for x in ps.split()]
            if not words:
                continue
            too_far = group and (
                words[0] > group[-1] + thresh or words[0] - group[0] > thresh2
            )
            if too_far:
                retval.append((idv, "Evidence", " ".join(map(str, group))))
                group = []
            # Fixed: spans are joined directly, so no "-1" separator token
            # ends up inside the merged prediction string.
            group.extend(words)
        if group:
            retval.append((idv, "Evidence", " ".join(map(str, group))))

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    return roof.merge(neoof, how='outer')

In [None]:
# Path of the pickled, previously prepared data that the cells below work on.
prepared_data_path = "../input/dataset-student-writing-2/prepared_data_2lf.pickle"

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(prepared_data_path, 'rb') as f:
    prepared_data = pickle.load(f)

In [None]:
# Container type and number of entries of the loaded data, one per line.
print(type(prepared_data), len(prepared_data), sep="\n")

In [None]:
# Peek at the first prepared sample; long fields are truncated for display.
check_sample = prepared_data[0]
max_len_values = 105

print(check_sample.keys(), "\n")
print(check_sample.get('id'), "\n")
# The text is cut at ten times the limit used for the other fields.
print(check_sample.get('text')[:max_len_values * 10], "... \n")
for field_name in ('input_ids', 'offset_mapping', 'preds', 'pred_scores'):
    print(check_sample.get(field_name)[:max_len_values], "... \n")

In [None]:
# Sample predictions CSV; later rendered under the "PRED:" prefix next to the submission.
prediction_path = "../input/dataset-student-writing-2/sample_prediction.csv"
prediction = pd.read_csv(prediction_path)
# Last expression of the cell: shows the first rows.
prediction.head()

# 2. Select thresh weights

In [None]:
# Minimum-length thresholds per class, stored as [baseline, candidate].
# A None candidate is replaced with a random integer by get_min_thresh.
min_thresh_list = get_min_thresh([None, 5, 14, None, 11, 6, None])
min_thresh_baseline = {
    "Lead": 9,
    "Position": 5,
    "Evidence": 14,
    "Claim": 3,
    "Concluding Statement": 11,
    "Counterclaim": 6,
    "Rebuttal": 4,
}
min_thresh_config = {
    label: [baseline, candidate]
    for (label, baseline), candidate in zip(min_thresh_baseline.items(), min_thresh_list)
}

pd.DataFrame(min_thresh_config).T

In [None]:
# Probability thresholds per class, stored as (baseline, candidate).
# A None candidate is replaced with a random value by get_proba_thresh.
proba_thresh_weights = get_proba_thresh([None, 0.5375, 0.6375, None, 0.6875, 0.5375, None])
proba_thresh_baseline = {
    "Lead": 0.6875,
    "Position": 0.5375,
    "Evidence": 0.6375,
    "Claim": 0.5375,
    "Concluding Statement": 0.6875,
    "Counterclaim": 0.5375,
    "Rebuttal": 0.5375,
}
proba_thresh_config = {
    label: (baseline, candidate)
    for (label, baseline), candidate in zip(proba_thresh_baseline.items(), proba_thresh_weights)
}

pd.DataFrame(proba_thresh_config).T

In [None]:
# Pick which entry of every config pair is used when building the submission.

# Probability thresholds: 1 - choose (manually or randomly), 0 - baseline
proba_thresh_value = 1
proba_thresh = {
    label: pair[proba_thresh_value]
    for label, pair in proba_thresh_config.items()
}

# Minimum-length thresholds: 0 - baseline, 1 - choose (manually or randomly)
min_thresh_value = 0
min_thresh = {
    label: pair[min_thresh_value]
    for label, pair in min_thresh_config.items()
}

# 3. Create submission data

In [None]:
# Turn the token-level predictions of every prepared sample into
# (id, class, predictionstring) rows, keeping only the phrases that pass the
# selected probability and minimum-length thresholds.
submission = []

for sample in prepared_data:
    preds = sample["preds"]
    offset_mapping = sample["offset_mapping"]
    sample_id = sample["id"]
    sample_text = sample["text"]
    sample_pred_scores = sample["pred_scores"]

    # Pad with "O" / 0 so that every token has a prediction and a score.
    if len(preds) < len(offset_mapping):
        preds = preds + ["O"] * (len(offset_mapping) - len(preds))
        sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))

    # Group consecutive tokens into phrases: a phrase starts at any token and
    # continues while the following tokens carry the matching "I-<label>" tag
    # (or "O" for an "O" phrase).
    idx = 0
    phrase_preds = []
    while idx < len(offset_mapping):
        # Fixed: `end` now comes from the first token too. Previously it was
        # only assigned for continuation tokens and guarded by
        # `"end" in locals()`, so a single-token phrase reused a stale `end`
        # left over from an earlier phrase or sample.
        start, end = offset_mapping[idx]
        label = preds[idx][2:] if preds[idx] != "O" else "O"
        matching_label = "O" if label == "O" else f"I-{label}"
        phrase_scores = [sample_pred_scores[idx]]
        idx += 1
        while idx < len(offset_mapping) and preds[idx] == matching_label:
            _, end = offset_mapping[idx]
            phrase_scores.append(sample_pred_scores[idx])
            idx += 1
        phrase_preds.append((start, end, label, phrase_scores))

    # Convert character offsets to word indices and apply the thresholds.
    total_words = len(sample_text.split())
    for start, end, label, phrase_scores in phrase_preds:
        if label == "O":
            continue
        word_start = len(sample_text[:start].split())
        word_end = word_start + len(sample_text[start:end].split())
        word_end = min(word_end, total_words)
        ps = " ".join(str(x) for x in range(word_start, word_end))
        mean_score = sum(phrase_scores) / len(phrase_scores)
        if mean_score >= proba_thresh[label] and len(ps.split()) >= min_thresh[label]:
            submission.append((sample_id, label, ps))

submission = pd.DataFrame(submission, columns=["id", "class", "predictionstring"])
submission = link_evidence(submission)

In [None]:
# Last expression of the cell: shows the first rows of the submission.
submission.head()

# 4. Visualize difference

In [None]:
# True: visualize randomly sampled ids; False: use the fixed id list below.
random_choice: bool = False

In [None]:
if random_choice:
    ids_list = submission['id'].unique()
    # Sample without replacement. The size is capped at the number of
    # available ids, because asking for more than exist raises ValueError.
    ids_list = np.random.choice(ids_list, min(100, len(ids_list)), replace=False)
else:
    # NOTE(review): '47D025408E0F' appears twice in this list, so that essay
    # is rendered twice -- confirm whether another id was intended.
    ids_list = [
        '44D73D95E310', 'E05C7F5C1156', 'A0215B1294B9',
        '4768A4B9E142', '291AAEA628E1', '4C587D63904C',
        'F0095A6449E9', 'D5D689030CA2', '271E7282493D',
        'DB1B143B6EB1', '0E424AAB5B39', '47D025408E0F',
        '4B81975FA4E8', 'D514ED6A3665', '47D025408E0F'
    ]

In [None]:
# Number of ids from ids_list to render.
n_ids = 15

# For every selected essay show the computed submission first, then the
# reference prediction (predictionstring from train data).
for essay_id in ids_list[:n_ids]:
    for frame, tag in ((submission, "SUBM:"), (prediction, "PRED:")):
        visualize(essay_id, frame, prefix_id=tag)
    print('\n')