### Format OOF predictions

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import torch

import pickle

from tqdm import tqdm

In [2]:
# Load the three pickled prediction sets that are averaged below.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('word_probs_lf.pickle', 'rb') as lf_file:
    word_probs_lf = pickle.load(lf_file)

with open('word_probs_rl.pickle', 'rb') as rl_file:
    word_probs_rl = pickle.load(rl_file)

with open('word_probs_db.pickle', 'rb') as db_file:
    word_probs_db = pickle.load(db_file)

# Ensemble: element-wise mean of the three predictions for every key of the
# `lf` set. The `rl` and `db` sets are indexed with the same keys, so a key
# missing from either of them raises KeyError here.
word_probs_all = {
    sample_id: np.mean(
        (word_probs_lf[sample_id], word_probs_rl[sample_id], word_probs_db[sample_id]),
        axis=0,
    )
    for sample_id in word_probs_lf.keys()
}

## OOF

In [3]:
import ast

from transformers import RobertaTokenizerFast

# LOAD PREPROCESSED DATA
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')

# load saved processed data
# NOTE(review): absolute, machine-specific path, unlike the relative '../data/...'
# paths used by the other cells - confirm before running on another machine.
DATA_PATH = '/storage/backe/feedback/data/roberta_preprocessed.csv'
data = pd.read_csv(DATA_PATH)

# These columns hold Python literals serialized as strings in the CSV.
# ast.literal_eval only accepts literals (lists, numbers, None, ...), whereas the
# previous eval() would have executed any expression found in the file.
for literal_column in ('input_ids', 'attention_mask', 'token_to_word', 'target'):
    data[literal_column] = data[literal_column].apply(ast.literal_eval)

In [4]:
TEXT_FILES = [f'../data/train/{file}' for file in os.listdir('../data/train')]

# Map each file's base name (everything before '.txt') to its text content.
text_data = {}

for text_path in TEXT_FILES:
    with open(text_path, 'r') as text_file:
        text_id = os.path.basename(text_path).split('.txt')[0]
        # preprocessing: delete whitespace from the end of the text
        text_data[text_id] = text_file.read().rstrip()

In [5]:
from decode import discourse_map_reverse
from decode import lead_info, pos_info, conc_info, claim_info, evidence_info, count_info, rebuttal_info

In [6]:
# add discourse start and ending positions

def get_lead_pos_cs_preds(idx, word_probs, label_idx, p_start, p_end, min_conf, min_words, offset_mapping, token_to_word):
    """Decode at most one span of class `label_idx` for a single sample.

    The span runs from the first word whose probability for `label_idx` is at
    least `p_start` to the last word whose probability is at least `p_end`.
    It is kept only when its mean probability is >= `min_conf` and it spans at
    least `min_words` words.

    Args:
        idx: sample identifier, copied into the output row.
        word_probs: 2-D array indexed as [word, label].
        label_idx: column of `word_probs` to decode; also the key looked up in
            `discourse_map_reverse`.
        p_start, p_end: probability thresholds for the first / last word.
        min_conf, min_words: minimum mean probability and span length.
        offset_mapping: per-token (start, end) pairs (presumably character
            offsets produced by the tokenizer).
        token_to_word: array giving, for every token, the index of its word.

    Returns:
        A list that is empty or holds one row
        [idx, discourse type, predictionstring, discourse_start, discourse_end, confidence].
    """
    label_probs = word_probs[:, label_idx]

    # Words that qualify as span start / span end.
    begin_hits = np.where(label_probs >= p_start)[0]
    finish_hits = np.where(label_probs >= p_end)[0]
    if len(begin_hits) == 0 or len(finish_hits) == 0:
        return []

    first_word = begin_hits[0]
    last_word = finish_hits[-1]
    span_len = last_word - first_word + 1
    span_conf = label_probs[first_word:last_word + 1].mean()

    if not ((span_conf >= min_conf) and (span_len >= min_words)):
        return []

    # Space-separated word indices covered by the span.
    word_ids = ' '.join(map(str, range(first_word, last_word + 1)))

    # Offsets come from the first token of the first word and the last token
    # of the last word.
    first_token = np.where(token_to_word == first_word)[0][0]
    span_start = offset_mapping[first_token][0]
    last_token = np.where(token_to_word == last_word)[0][-1]
    span_end = offset_mapping[last_token][1]

    return [[idx, discourse_map_reverse[label_idx], word_ids, span_start, span_end, span_conf]]


In [7]:
def get_claim_evidence_preds(idx, word_probs, start_label, body_label, min_conf, min_words, offset_mapping, token_to_word):
    """Decode every span of a start/body labelled class for a single sample.

    Each word is assigned its argmax label. A span begins either at a word
    labelled `start_label`, or at a word labelled `body_label` that is not
    preceded by a `body_label`/`start_label` word, and it extends over the
    following run of `body_label` words. A span is kept when its confidence is
    >= `min_conf` and it spans at least `min_words` words.

    Args:
        idx: sample identifier, copied into every output row.
        word_probs: 2-D array indexed as [word, label].
        start_label, body_label: label indices of the span-start and span-body
            classes; `body_label` is also the key looked up in
            `discourse_map_reverse`.
        min_conf, min_words: minimum confidence and span length.
        offset_mapping: per-token (start, end) pairs (presumably character
            offsets produced by the tokenizer).
        token_to_word: array giving, for every token, the index of its word.

    Returns:
        List of rows
        [idx, discourse type, predictionstring, discourse_start, discourse_end, confidence].
    """
    sample_preds = []

    start_probs = word_probs[:, start_label]
    body_probs = word_probs[:, body_label]

    word_preds = word_probs.argmax(1)
    num_words = len(word_preds)

    # Clean word preds: a single word labelled 3 between two words labelled 1
    # becomes 1, then a single 1 between two 3s becomes 3. The labels are
    # hard-coded, so the same cleaning runs whatever start/body labels are passed.
    for lone_label, neighbour_label in ((3, 1), (1, 3)):
        for i in range(1, num_words - 1):
            if (word_preds[i - 1] == neighbour_label
                    and word_preds[i + 1] == neighbour_label
                    and word_preds[i] == lone_label):
                word_preds[i] = neighbour_label

    explicit_starts = np.where(word_preds == start_label)[0]

    # Body words that open a span without a start word in front of them.
    # The scan covers every word: the previous range(1, num_words-1) silently
    # dropped spans opening on the first or on the last word.
    implicit_starts = [
        i for i in range(num_words)
        if word_preds[i] == body_label
        and (i == 0 or (word_preds[i - 1] != body_label and word_preds[i - 1] != start_label))
    ]
    start_positions = np.append(explicit_starts, implicit_starts).astype(int)

    for start_idx in start_positions:
        # Extend the span over the run of body words that follows.
        end_idx = start_idx
        while (end_idx != num_words - 1) and (word_preds[end_idx + 1] == body_label):
            end_idx += 1

        sample_num_word = end_idx - start_idx + 1
        # Confidence: start-class probability of the first word averaged with
        # the body-class probabilities of the remaining words.
        confidence = np.append(start_probs[start_idx], body_probs[start_idx + 1:end_idx + 1]).mean()

        if (confidence >= min_conf) and (sample_num_word >= min_words):
            # format prediction: space-separated word indices of the span
            this_preds = ' '.join(str(word_i) for word_i in range(start_idx, end_idx + 1))

            # First token of the first word / last token of the last word.
            discourse_start = offset_mapping[np.where(token_to_word == start_idx)[0][0]][0]
            discourse_end = offset_mapping[np.where(token_to_word == end_idx)[0][-1]][1]

            sample_preds.append([idx, discourse_map_reverse[body_label], this_preds, discourse_start, discourse_end, confidence])

    return sample_preds


In [8]:
def get_count_reb_preds(idx, word_probs, label_idx, min_conf, min_words, offset_mapping, token_to_word):
    """Decode every span of class `label_idx` for a single sample.

    Each word is assigned its argmax label; every maximal run of consecutive
    words labelled `label_idx` is a candidate span. A span is kept when the
    mean `label_idx` probability over its words is >= `min_conf` and it spans
    at least `min_words` words.

    Args:
        idx: sample identifier, copied into every output row.
        word_probs: 2-D array indexed as [word, label].
        label_idx: label index to decode; also the key looked up in
            `discourse_map_reverse`.
        min_conf, min_words: minimum mean probability and span length.
        offset_mapping: per-token (start, end) pairs (presumably character
            offsets produced by the tokenizer).
        token_to_word: array giving, for every token, the index of its word.

    Returns:
        List of rows
        [idx, discourse type, predictionstring, discourse_start, discourse_end, confidence];
        empty when `word_probs` has no words.
    """
    sample_preds = []

    class_probs = word_probs[:, label_idx]

    word_preds = word_probs.argmax(1)
    num_words = len(word_preds)

    # A run starts wherever the label appears without the same label right
    # before it. Scanning every word fixes two problems of the previous
    # range(1, num_words-1) scan: a run starting on the last word was dropped,
    # and the separate word_preds[0] check raised IndexError on empty input.
    start_positions = [
        i for i in range(num_words)
        if word_preds[i] == label_idx and (i == 0 or word_preds[i - 1] != label_idx)
    ]

    for start_idx in start_positions:
        # Extend the span to the end of the run.
        end_idx = start_idx
        while (end_idx != num_words - 1) and (word_preds[end_idx + 1] == label_idx):
            end_idx += 1

        sample_num_word = end_idx - start_idx + 1
        confidence = class_probs[start_idx:end_idx + 1].mean()

        if (confidence >= min_conf) and (sample_num_word >= min_words):
            # format prediction: space-separated word indices of the span
            this_preds = ' '.join(str(word_i) for word_i in range(start_idx, end_idx + 1))

            # First token of the first word / last token of the last word.
            discourse_start = offset_mapping[np.where(token_to_word == start_idx)[0][0]][0]
            discourse_end = offset_mapping[np.where(token_to_word == end_idx)[0][-1]][1]

            sample_preds.append([idx, discourse_map_reverse[label_idx], this_preds, discourse_start, discourse_end, confidence])

    return sample_preds


In [9]:
def decode_predictions(idx, word_probs, offset_mapping, token_to_word):
    """Decode all discourse classes for one sample and concatenate the rows.

    Runs the class-specific decoders in a fixed order (Lead, Position,
    Concluding Statement, Claim, Evidence, Counterclaim, Rebuttal), each with
    the thresholds of its `*_info` dictionary, and returns their rows in that
    order.

    Args:
        idx: sample identifier, forwarded to every decoder.
        word_probs: per-word label probabilities, forwarded to every decoder.
        offset_mapping: per-token offsets, forwarded to every decoder.
        token_to_word: token -> word index mapping, forwarded to every decoder.

    Returns:
        A single list holding the rows produced by all decoders.
    """
    decoded = []

    # 1-3. Lead, Position, Concluding Statement: (label index, thresholds)
    for label_idx, info in ((7, lead_info), (5, pos_info), (6, conc_info)):
        decoded.extend(get_lead_pos_cs_preds(
            idx, word_probs, label_idx,
            info['p_start'], info['p_end'], info['min_conf'], info['min_words'],
            offset_mapping, token_to_word))

    # 4-5. Claim, Evidence: (start label, body label, thresholds)
    for start_label, body_label, info in ((2, 1, claim_info), (4, 3, evidence_info)):
        decoded.extend(get_claim_evidence_preds(
            idx, word_probs, start_label, body_label,
            info['min_conf'], info['min_words'],
            offset_mapping, token_to_word))

    # 6-7. Counterclaim, Rebuttal: (label index, thresholds)
    for label_idx, info in ((8, count_info), (9, rebuttal_info)):
        decoded.extend(get_count_reb_preds(
            idx, word_probs, label_idx,
            info['min_conf'], info['min_words'],
            offset_mapping, token_to_word))

    return decoded


In [10]:
# PREDICTIONS FORMATTING

# Group the token_to_word entries by id once, up front. The previous
# `data.loc[data['id'] == idx, ...]` lookup re-scanned the whole frame for
# every sample of the loop below.
token_to_word_by_id = {}
for row_id, row_token_to_word in zip(data['id'], data['token_to_word']):
    token_to_word_by_id.setdefault(row_id, []).append(row_token_to_word)

oof_preds = []

for idx, word_probs in tqdm(word_probs_all.items()):

    # 1. GET INPUTS
    text = text_data[idx]
    inputs = tokenizer(text,
                       add_special_tokens=True,
                       return_offsets_mapping=True,
                       return_length=True)

    offset_mapping = inputs['offset_mapping']

    # Exactly one preprocessed row is expected per id, as with the former
    # `.item()` call, which raised ValueError otherwise.
    id_matches = token_to_word_by_id.get(idx, [])
    if len(id_matches) != 1:
        raise ValueError(f"expected exactly one preprocessed row for id {idx!r}, found {len(id_matches)}")
    token_to_word = np.array(id_matches[0])

    # 2. DECODE the word-level probabilities into discourse rows
    sample_formated = decode_predictions(idx, word_probs, offset_mapping, token_to_word)

    oof_preds += sample_formated

preds_df = pd.DataFrame(oof_preds, columns=['id', 'discourse_type', 'predictionstring', 'discourse_start' , 'discourse_end', 'confidence'])
preds_df.shape

  0%|          | 0/15594 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 15594/15594 [02:41<00:00, 96.80it/s] 


(131009, 6)

In [33]:
# Preview the first rows of the decoded predictions.
preds_df.head()

Unnamed: 0,id,discourse_type,predictionstring,discourse_start,discourse_end,confidence
0,0C0E56A1FB05,Lead,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,0,247,0.980532
1,0C0E56A1FB05,Position,42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 5...,248,342,0.96598
2,0C0E56A1FB05,Concluding Statement,238 239 240 241 242 243 244 245 246 247 248 24...,1374,1580,0.988033
3,0C0E56A1FB05,Claim,59 60 61 62 63 64 65 66 67 68 69 70,344,413,0.833445
4,0C0E56A1FB05,Claim,131 132 133 134 135 136 137 138 139 140 141 14...,755,878,0.8007


In [34]:
# Save the decoded predictions as CSV; index=False leaves out the DataFrame index column.
preds_df.to_csv('../data/oof_lf_rl_db.csv', index=False)