### Imports

In [1]:
import os
import sys

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import transformers
from transformers import DebertaConfig, DebertaTokenizerFast

from tqdm import tqdm
import matplotlib.pyplot as plt

from param_deberta import param
from processing_deberta import preprocess, discourse_map
from dataset_deberta import DebertaDataset
from model_deberta import init_deberta

# Make the project root importable so the shared `utils` module resolves.
# NOTE(review): hardcoded absolute, user-specific path — the notebook only runs
# on a machine with this exact layout.
sys.path.append('/home/backe/projects/feedback/')
from utils import seed_everything, moving_average, score_feedback_comp

# Seed with the value stored in the shared parameter dict
# (presumably seeds every RNG in use — confirm in utils.seed_everything).
seed_everything(param['random_seed'])

# Expose only the GPU with index 2 to CUDA.
# NOTE(review): the variable is read when CUDA is initialised; torch is already
# imported and seed_everything has already run at this point — confirm neither
# initialises CUDA, otherwise this assignment has no effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
# Silence everything below error level coming from the transformers library.
transformers.logging.set_verbosity_error()

# Show full cell contents when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_colwidth', None)

### Data loading

In [2]:
# Tokenizer matching the model configured in `param`.
tokenizer = DebertaTokenizerFast.from_pretrained(param['model_name'])

# Discourse annotations (one row per annotated discourse).
TRAIN_PATH = '../data/train_clean.csv'
train_df = pd.read_csv(TRAIN_PATH)
print(train_df.shape)

# Essay files, one `<id>.txt` per essay.
# - sorted(): os.listdir returns entries in arbitrary order, which would make
#   the row order of the preprocessed frame differ between runs/machines.
# - endswith('.txt'): skip stray entries (e.g. `.ipynb_checkpoints`) that would
#   otherwise crash open() or end up as bogus essay ids.
TEXT_DIR = '../data/train'
TEXT_FILES = [
    f'{TEXT_DIR}/{name}'
    for name in sorted(os.listdir(TEXT_DIR))
    if name.endswith('.txt')
]

# essay id (file name without extension) -> raw essay text
# NOTE(review): open() uses the platform default encoding; the character
# offsets used later presumably assume the same decoding — confirm.
text_data = dict()
for file_path in TEXT_FILES:
    idx = os.path.splitext(os.path.basename(file_path))[0]
    with open(file_path, 'r') as file:
        text_data[idx] = file.read()

# Tokenize every essay and build per-token targets (see processing_deberta.preprocess),
# attach the fold assignment by essay id and persist the result.
data = preprocess(text_data, tokenizer, train_df)
deberta_df = pd.DataFrame(data, columns=['id', 'input_ids', 'attention_mask', 'token_to_word', 'target'])
folds_df = pd.read_csv('../data/folds.csv')
deberta_df = deberta_df.merge(folds_df, on='id')
deberta_df.to_csv('/DATA/backe/feedback/data/deberta_preprocessed.csv', index=False)

(144293, 13)


100%|██████████| 15594/15594 [04:48<00:00, 53.99it/s]


In [None]:
# Tokenizer matching the model configured in `param`.
tokenizer = DebertaTokenizerFast.from_pretrained(param['model_name'])

# Discourse annotations (one row per annotated discourse).
TRAIN_PATH = '../data/train_clean.csv'
train_df = pd.read_csv(TRAIN_PATH)
print(train_df.shape)

# Essay files, one `<id>.txt` per essay.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   iteration order of text_data reproducible.
# - endswith('.txt'): skip stray entries (e.g. `.ipynb_checkpoints`) that would
#   otherwise crash open() or end up as bogus essay ids.
TEXT_DIR = '../data/train'
TEXT_FILES = [
    f'{TEXT_DIR}/{name}'
    for name in sorted(os.listdir(TEXT_DIR))
    if name.endswith('.txt')
]

# essay id (file name without extension) -> raw essay text
# NOTE(review): open() uses the platform default encoding; the character
# offsets used later presumably assume the same decoding — confirm.
text_data = dict()
for file_path in TEXT_FILES:
    idx = os.path.splitext(os.path.basename(file_path))[0]
    with open(file_path, 'r') as file:
        text_data[idx] = file.read()


In [None]:
# Show one randomly drawn annotation row as a quick look at the columns.
train_df.sample(n=1)

In [None]:
# Walk over every (essay id, essay text) pair without doing any work.
# Side effects only: tqdm renders a progress bar, and after the loop `idx` and
# `text` are left bound to the last pair of text_data.
for idx, text in tqdm(text_data.items()):
    pass

In [None]:
# Pick a single essay by id; the following cells operate on `idx` / `text`.
idx = '4BB688100D15'
text = text_data[idx]

In [None]:
# Essay id on the first line, followed by the full essay text.
print(idx, text, sep='\n')

In [None]:
# Drop trailing whitespace before tokenizing.
text = text.rstrip()

# Tokenize with special tokens; also request per-token character offsets and
# the encoded length, both of which are used further down.
inputs = tokenizer(
    text,
    add_special_tokens=True,
    return_offsets_mapping=True,
    return_length=True,
)

# Which fields did the tokenizer return?
inputs.keys()

In [None]:
def clean_offset(pos: tuple, text:str) -> tuple:
    """Move a token's start offset past a single leading space.

    Args:
        pos: (start, end) character offsets of a token within ``text``.
        text: the string the offsets refer to.

    Returns:
        ``(start + 1, end)`` when the span is non-empty and ``text[start]`` is a
        space; otherwise ``pos`` itself, unchanged (empty spans are never
        inspected, so they cannot index out of range).
    """
    span_is_empty = pos[0] == pos[1]
    if not span_is_empty and text[pos[0]] == ' ':
        return (pos[0] + 1, pos[1])
    return pos
        

In [None]:
# Replace every token span by its cleaned version (leading space skipped).
inputs['offset_mapping'] = list(
    map(lambda span: clean_offset(span, text), inputs['offset_mapping'])
)

In [None]:
# Build `token_to_word`: for every token id, the index of the whitespace-
# separated word it belongs to, or -1 when it belongs to no word.
#
# Approach: keep a window tokens[start:end], decode it, and grow the window by
# one token per iteration until the decoded text equals the current word.

# split text into whitespace-separated words
words = text.split()

token_to_word = [] # list to store token -> word mapping
word_pos = 0 # index of the word currently being matched

tokens = inputs['input_ids'][1:-1]  # drop the first and last id (special tokens added by the tokenizer)
start = 0
end = 1

# One iteration per token: each pass appends exactly one entry to token_to_word.
for _ in tokens:

    word = tokenizer.decode(tokens[start:end]).strip()

    # if the stripped text is an empty string, that token doesn't belong to any word;
    # slide the one-token window forward without advancing word_pos
    if word == '':
        token_to_word.append(-1)
        start += 1
        end += 1
        continue

    # still no match:
    # keep the window start and extend it by one more token
    # NOTE(review): words[word_pos] raises IndexError if non-empty tokens remain
    # after the last word was matched; and if the decoded text never equals the
    # word, all remaining tokens are assigned to this same word_pos.
    if word != words[word_pos]:
        end += 1
        token_to_word.append(word_pos)
    # match: this token completes the word, restart the window right after it
    else:
        token_to_word.append(word_pos)
        start = end
        end = start + 1
        word_pos += 1

# add a -1 entry for the leading and trailing special tokens removed above
token_to_word = [-1] + token_to_word + [-1]


In [None]:
# One label per token, all starting at 0 (the Filler class).
n_tokens = inputs['length'][0]
target = np.full(n_tokens, 0)

# Annotations that belong to the selected essay.
id_filt = train_df['id'].eq(idx)
sample_df = train_df.loc[id_filt]
sample_df.head(6)

In [None]:

# Array form of the token -> word mapping so it can be compared element-wise.
token_to_word_np = np.array(token_to_word)
offsets = inputs['offset_mapping']

# Label the tokens of every annotated discourse of this essay.
for _, discourse in sample_df.iterrows():
    discourse_type = discourse['discourse_type']
    start, end = discourse['new_start'], discourse['new_end']

    # A token belongs to the discourse when its character span lies fully
    # inside [start, end]; those tokens receive the discourse's class id.
    discourse_pos = [bool(span[0] >= start and span[1] <= end) for span in offsets]
    target[discourse_pos] = discourse_map[discourse_type]

    # Claim and Evidence use a dedicated "<type>_S" label on all tokens of
    # their first word (first index listed in predictionstring).
    if discourse_type in ('Claim', 'Evidence'):
        first_word_id = int(discourse['predictionstring'].split()[0])
        starts_discourse = token_to_word_np == first_word_id
        target[starts_discourse] = discourse_map[discourse_type + '_S']

# Tokens that map to no word are marked with -1, overriding any label set above.
no_word = token_to_word_np == -1
target[no_word] = -1
target = list(target)


In [None]:
# Dump the complete per-token label list for the selected essay.
print(target)

In [None]:
# Labels of token positions 108..149.
print(target[108:150])

In [None]:
# Labels of token positions 65..79.
print(target[65:80])

In [None]:
# Token string behind the id at token position 108.
print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][108]))

In [None]:
# (start, end) character span recorded for the token at position 65.
inputs['offset_mapping'][65]

In [None]:
# Essay text between character offsets 291 (inclusive) and 410 (exclusive).
text[291:410]

In [None]:
# Readable text of the token at position 111.
# convert_tokens_to_string expects a list of token *strings*, not a single
# integer id, so the id is first mapped to its token and wrapped in a list.
tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens([inputs['input_ids'][111]])
)

In [None]:
# Raw vocabulary id of the token at position 111.
inputs['input_ids'][111]