## Goal: 0.690+

### Imports

In [None]:
import os
import sys

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import transformers
from transformers import LongformerTokenizerFast

from tqdm import tqdm
import matplotlib.pyplot as plt

from param_longformer import param
from processing_longformer import preprocess, discourse_map, discourse_map_reverse
from dataset_longformer import LongformerDataset, Collate
from model_longformer import init_model

# postprocess
from pp_longformer import decode_predictions

sys.path.append('/home/backe/projects/feedback/')
from utils import seed_everything, moving_average, score_feedback_comp

# Restrict the visible GPUs first: the mask is only honoured if it is in the
# environment before CUDA gets initialised, so it must precede any helper that
# might touch the GPU. str() guards against an integer index in `param`
# (os.environ only accepts strings); it is a no-op for a string value.
os.environ['CUDA_VISIBLE_DEVICES'] = str(param['gpu_idx'])

# Seed the random number generators (helper comes from the project's utils).
seed_everything(param['random_seed'])

# Only let the transformers library log errors.
transformers.logging.set_verbosity_error()

### Data loading

In [None]:
# PREPROCESS THE DATA

# tokenizer = LongformerTokenizerFast.from_pretrained(param['model_name'])

# TRAIN_PATH = '../data/train_clean.csv'
# train_df = pd.read_csv(TRAIN_PATH)
# print(train_df.shape)
# train_df.head()

# TEXT_FILES = os.listdir('../data/train')
# TEXT_FILES = [f'../data/train/{file}' for file in TEXT_FILES]

# text_data = dict()
# for file_path in TEXT_FILES:
#     with open(file_path, 'r') as file:
#         idx = os.path.basename(file_path).split('.txt')[0]
#         text_data[idx] = file.read()
        
# data = preprocess(text_data, tokenizer, train_df)
# longformer_df = pd.DataFrame(data, columns=['id', 'input_ids', 'attention_mask', 'token_to_word', 'target'])
# folds_df = pd.read_csv('../data/folds.csv')
# longformer_df = longformer_df.merge(folds_df, on='id')
# longformer_df.to_csv('/DATA/backe/feedback/longformer_preprocessed.csv', index=False)

In [None]:
%%time

# LOAD PREPROCESSED DATA

tokenizer = LongformerTokenizerFast.from_pretrained(param['model_name'])

# load saved processed data
DATA_PATH = '/DATA/backe/feedback/longformer_preprocessed.csv'
data = pd.read_csv(DATA_PATH)
data['input_ids'] = data['input_ids'].apply(eval)
data['attention_mask'] = data['attention_mask'].apply(eval)
data['token_to_word'] = data['token_to_word'].apply(eval)
data['target'] = data['target'].apply(eval)
data.head(1)

### Create torch dataset

In [None]:
# Batch collation helper and dataset wrapper, both configured for training.
collate_fn = Collate(tokenizer, purpose='train')
dataset = LongformerDataset(data, param, purpose='train')

# Train / validation loaders for the fold index configured in `param`.
train_dataloader, val_dataloader = dataset.get_dataloaders(
    collate_fn,
    param['fold_idx'],
)

# Quick sanity check: length of each loader.
print(*map(len, (train_dataloader, val_dataloader)))

### Model definition

In [None]:
# Build the model from the shared `param` configuration dict.
# NOTE(review): init_model lives in model_longformer; what it constructs
# (architecture, device placement, optimizer) is not visible here - confirm there.
model = init_model(param)

In [None]:
# Raw competition annotations, read from the training CSV.
TRAIN_PATH = '../data/train.csv'
train_df = pd.read_csv(TRAIN_PATH)

# Ids of the preprocessed rows whose `kfold` matches the configured fold index.
fold_ids = data['id'][data['kfold'] == param['fold_idx']]

# Keep only the annotation rows that belong to those ids.
val_df = train_df.loc[train_df['id'].isin(fold_ids)]

In [None]:
# Run the model's combined train/eval pipeline. It receives both dataloaders,
# the validation annotations (`val_df`), the scoring function and the
# prediction decoder. The returned `losses` is concatenated with
# np.concatenate in the plotting cell below, so it is expected to be a
# sequence of arrays (presumably one per epoch - confirm in model_longformer).
losses = model.train_eval_pipeline(train_dataloader, val_dataloader, val_df, score_feedback_comp, decode_predictions)

In [None]:
# PLOT MOVING AVERAGE TRAIN LOSS

# Join the loss chunks returned by training into one array, then smooth it.
losses = np.concatenate(losses)
ma_loss = moving_average(losses, window_size=50)

# Wide figure; smoothed curve plus a red horizontal reference line at 0.4.
plt.rcParams['figure.figsize'] = (20, 5)
plt.plot(ma_loss)
plt.hlines(y=0.4, xmin=0, xmax=len(ma_loss), colors='r');

### Save

In [None]:
# saving

# model.save_pretrained(param['save_dir'])
# tokenizer.save_pretrained(param['save_dir'])