### Imports

In [1]:
import os
import sys

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import transformers
from transformers import RobertaConfig, RobertaTokenizerFast

from tqdm import tqdm
import matplotlib.pyplot as plt

# from param_roberta_base import param
from param_roberta import param
from processing_roberta import preprocess, discourse_map
from dataset_roberta import RobertaDataset
from model_roberta import init_roberta

sys.path.append('/home/backe/projects/feedback/')
from utils import seed_everything, moving_average, score_feedback_comp

# Pin the process to GPU 0 first: CUDA_VISIBLE_DEVICES is only honoured when it
# is set before CUDA gets initialised, so it must precede any project call that
# could touch the GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Project helper; presumably seeds the Python / NumPy / torch RNGs — verify in utils.
seed_everything(param['random_seed'])

# Only surface error-level messages from the transformers library.
transformers.logging.set_verbosity_error()

### Data loading

In [2]:
# Build the tokenised dataset from the raw essays and cache it as CSV.

# Tokenizer for the checkpoint named in the run configuration.
tokenizer = RobertaTokenizerFast.from_pretrained(param['model_name'])

# Annotation table passed to the preprocessing helper.
TRAIN_PATH = '../data/train_clean.csv'
train_df = pd.read_csv(TRAIN_PATH)
print(train_df.shape)

# Essay files. Only `.txt` entries are kept so stray directory content
# (e.g. notebook checkpoints) cannot break the loop, and the listing is sorted
# because os.listdir returns entries in arbitrary order.
TEXT_DIR = '../data/train'
TEXT_FILES = [f'{TEXT_DIR}/{file}'
              for file in sorted(os.listdir(TEXT_DIR))
              if file.endswith('.txt')]

# Map essay id (file name without extension) -> full essay text.
text_data = dict()
for file_path in TEXT_FILES:
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        idx = os.path.splitext(os.path.basename(file_path))[0]
        text_data[idx] = file.read()

# Project helper; its output is loaded as rows with the five columns below.
data = preprocess(text_data, tokenizer, train_df)
roberta_df = pd.DataFrame(data, columns=['id', 'input_ids', 'attention_mask', 'token_to_word', 'target'])

# Attach the fold assignment of every essay (inner join on `id`).
folds_df = pd.read_csv('../data/folds.csv')
roberta_df = roberta_df.merge(folds_df, on='id')

# NOTE(review): this writes `roberta_preprocessed_new.csv`, while the loading
# cell further down reads `roberta_preprocessed.csv` — confirm which file is
# the intended cache.
roberta_df.to_csv('/DATA/backe/feedback/data/roberta_preprocessed_new.csv', index=False)

(144293, 13)


100%|██████████| 15594/15594 [04:43<00:00, 55.09it/s]


In [None]:
# ---- Restore the cached preprocessing output --------------------------------
tokenizer = RobertaTokenizerFast.from_pretrained(param['model_name'])

DATA_PATH = '/DATA/backe/feedback/data/roberta_preprocessed.csv'
data = pd.read_csv(DATA_PATH)

# These columns come back from the CSV as strings and are evaluated back into
# Python objects.
# NOTE(review): eval executes whatever the file contains; acceptable only while
# the CSV is produced locally — consider ast.literal_eval.
for column in ('input_ids', 'attention_mask', 'token_to_word', 'target'):
    data[column] = data[column].apply(eval)

# ---- Train / validation dataloaders for the configured fold -----------------
dataset = RobertaDataset(data, tokenizer, param)
train_dataloader, val_dataloader = dataset.get_dataloaders(param['fold_idx'])

# ---- Model -------------------------------------------------------------------
model = init_roberta(param)

# ---- Annotation rows of the essays in the validation fold -------------------
TRAIN_PATH = '../data/train.csv'
train_df = pd.read_csv(TRAIN_PATH)

in_val_fold = data['kfold'] == param['fold_idx']
fold_ids = data.loc[in_val_fold, 'id']
val_df = train_df[train_df['id'].isin(fold_ids)]


In [None]:
from transformers import AdamW, get_polynomial_decay_schedule_with_warmup


In [None]:
# Learning rates recorded by the scheduler simulation loop (one entry per step).
lrs = []

In [None]:
# Optimizer: AdamW over all model parameters, starting at the configured base LR.
optimizer = AdamW(model.parameters(), lr=model.param['lr'])

# Total number of scheduler steps: one per training batch, for every epoch.
num_train_steps = model.param['epochs'] * len(train_dataloader)

# LR schedule: warmup followed by polynomial decay (power 0.25) down to `lr_end`.
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer,
    num_warmup_steps=model.param['warmup_steps'],
    num_training_steps=num_train_steps,
    lr_end=model.param['lr_end'],
    power=0.25,
)


In [None]:
# Dry-run the scheduler for one epoch's worth of steps and record the learning
# rate before each step. No batch is ever used here, so the loop runs over the
# step count instead of the DataLoader itself — loading every batch only cost time.
# NOTE(review): the scheduler is configured for epochs * len(train_dataloader)
# steps, while this loop covers a single epoch — confirm that is intended.
for _ in tqdm(range(len(train_dataloader))):
    lrs.append(optimizer.param_groups[0]['lr'])
    scheduler.step()
    

In [None]:
# Recorded learning-rate trajectory, one point per simulated scheduler step.
# Title and axis labels are set so the figure can be read on its own.
lr_ax = pd.Series(lrs).plot()
lr_ax.set(title='Learning-rate schedule', xlabel='scheduler step', ylabel='learning rate');

In [None]:
# NOTE(review): same code as the previous plot cell; presumably re-run after
# changing the scheduler settings to compare curves — confirm or remove.
pd.Series(lrs).plot()

In [None]:
# NOTE(review): same code as the two preceding plot cells; presumably kept to
# compare curves from different scheduler settings — confirm or remove.
pd.Series(lrs).plot()

In [None]:
# PLOT MOVING AVERAGE TRAIN LOSS
ma_loss = moving_average(np.array(losses), window_size=50)
plt.rcParams['figure.figsize'] = (20, 5)
plt.plot(ma_loss[:])
plt.hlines(0.4 , xmin=0, xmax=len(ma_loss[:]), colors='r');

### Get predictions

In [None]:
# Word-level class probabilities for every validation batch.
# Each batch is flattened to a single token sequence and stored under one key,
# so this presumably expects one essay per batch — TODO confirm against the
# validation dataloader.

model.eval()  # evaluation mode

word_probs_all = {}

for batch in tqdm(val_dataloader):

    # Forward pass on the GPU without gradient tracking.
    with torch.no_grad():
        model_out = model(batch['input_ids'].cuda(),
                          attention_mask=batch['attention_mask'].cuda())

    # Softmax over the last axis, moved to CPU and flattened to one row per
    # token position with `num_labels` columns.
    flat_probs = torch.softmax(model_out, dim=-1).cpu().view(-1, param['num_labels'])

    # Keep only the positions whose loss-mask entry equals 1.
    keep = batch['loss_mask'].view(-1) == 1
    kept_probs = flat_probs[keep]
    kept_word_ids = batch['token_to_word'].reshape(-1)[keep]

    # Word ids are taken to run from 0 up to the largest id that survived the mask.
    num_words = kept_word_ids.max().item() + 1

    # One probability vector per word: the mean over the tokens mapped to it.
    word_probs = []
    for word_id in range(num_words):
        tokens_of_word = kept_probs[kept_word_ids == word_id]
        word_probs.append(tokens_of_word.mean(0).numpy())

    word_probs_all[batch['id']] = np.array(word_probs)


### Evaluate

In [None]:
# Turn the per-essay word probabilities into prediction rows.
# NOTE(review): `decode_predictions` is neither defined nor imported in any
# cell visible in this notebook — it must come from hidden kernel state.

val_preds = []

for idx, word_probs in tqdm(word_probs_all.items()):
    # Each call yields the prediction rows for one essay; collect them all.
    val_preds.extend(decode_predictions(idx, word_probs))

preds_df = pd.DataFrame(val_preds, columns=['id', 'class', 'predictionstring'])
preds_df.head()

In [None]:
# Ground-truth annotations used for scoring.
TRAIN_PATH = '../data/train.csv'
train_df = pd.read_csv(TRAIN_PATH)

# VALID DATAFRAME
# Keep the annotations of every essay that was run through the model, not only
# of essays that ended up with at least one decoded prediction: an essay with
# zero predictions must still count against the score. The union with the
# predicted ids guarantees the selection is never smaller than before.
# (Assumes the keys of `word_probs_all` are the essay ids — they are what gets
# passed to the decoding step as `idx`.)
eval_ids = set(word_probs_all) | set(preds_df['id'].unique())
true_df = train_df.loc[train_df['id'].isin(eval_ids)]

In [None]:
# Per-class F1 and their unweighted mean.
f1s = []

# Iterate over the classes present in the ground truth rather than in the
# predictions: a class the model never predicts must contribute an F1 of 0 to
# the average instead of silently dropping out of it.
CLASSES = true_df['discourse_type'].dropna().unique()
for c in CLASSES:
    pred_df = preds_df.loc[preds_df['class']==c].copy()
    gt_df = true_df.loc[true_df['discourse_type']==c].copy()
    # No predictions for this class means no true positives, hence F1 = 0;
    # the scorer is skipped because it may not accept an empty frame.
    f1 = score_feedback_comp(pred_df, gt_df) if len(pred_df) else 0.0
    print(c,f1)
    f1s.append(f1)
print()
print('Overall', np.mean(f1s))

In [None]:
# Position 0.7157822428301562
# Concluding Statement 0.8666420391577392
# Claim 0.6496393267432541
# Evidence 0.7577560806516841
# Lead 0.8326596604688763
# Counterclaim 0.5556544968833482
# Rebuttal 0.5057610673135233

# Overall 0.6976992734355116


### Save

In [None]:
# # saving

# model.save_pretrained(param['save_dir'])
# tokenizer.save_pretrained(param['save_dir'])