In [1]:
import os
import sys
import pickle

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

import transformers

from tqdm import tqdm
import matplotlib.pyplot as plt

# Expose only GPU index 0 to CUDA-based libraries running in this process.
# NOTE(review): this executes after `import torch`; it presumably still takes
# effect because CUDA is initialised lazily - confirm nothing touches CUDA earlier.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

### 1. Longformer

In [None]:
# Read the preprocessed Longformer table from disk.
LF_DATA_PATH = '/storage/backe/feedback/data/longformer_preprocessed.csv'
data_lf = pd.read_csv(LF_DATA_PATH)

# These four columns are evaluated from their CSV text back into Python objects.
# NOTE(review): `eval` executes whatever text a cell contains - only run this on
# trusted files. If the cells are plain literals, `ast.literal_eval` would be safer.
for _col in ('input_ids', 'attention_mask', 'token_to_word', 'target'):
    data_lf[_col] = data_lf[_col].apply(eval)

In [None]:
# One directory per Longformer fold.
LF_PATH_0 = '/storage/backe/feedback/longformer/longformer-0fold/'
LF_PATH_1 = '/storage/backe/feedback/longformer/longformer-1fold/'
LF_PATH_2 = '/storage/backe/feedback/longformer/longformer-2fold/'
LF_PATH_3 = '/storage/backe/feedback/longformer/longformer-3fold/'
LF_PATH_4 = '/storage/backe/feedback/longformer/longformer-4fold/'

# Put every fold directory on the import path so the modules below can be found.
sys.path.extend([LF_PATH_0, LF_PATH_1, LF_PATH_2, LF_PATH_3, LF_PATH_4])

from param_longformer_0 import param as param_lf_0
from param_longformer_1 import param as param_lf_1
from param_longformer_2 import param as param_lf_2
from param_longformer_3 import param as param_lf_3
from param_longformer_4 import param as param_lf_4

# Store each fold's directory on its param object under 'kaggle_path'.
for _fold_param, _fold_dir in zip(
    (param_lf_0, param_lf_1, param_lf_2, param_lf_3, param_lf_4),
    (LF_PATH_0, LF_PATH_1, LF_PATH_2, LF_PATH_3, LF_PATH_4),
):
    _fold_param['kaggle_path'] = _fold_dir

# NOTE(review): `dataset_longformer` / `model_longformer` resolve to the first
# sys.path entry that contains them - presumably every fold directory ships an
# identical copy; verify.
from transformers import LongformerConfig, LongformerTokenizerFast
from dataset_longformer import LongformerDataset, Collate
from model_longformer import load_longformer

# The tokenizer is built from fold 0's 'model_name'.
# NOTE(review): assumes all folds share the same base model - confirm.
tokenizer_lf = LongformerTokenizerFast.from_pretrained(param_lf_0['model_name'])

In [None]:
# The five imported Longformer param objects, iterated in this order.
params_lf = [param_lf_0, param_lf_1, param_lf_2, param_lf_3, param_lf_4]

# Collects one `get_words_probabilities` result per entry of `params_lf`.
word_probs_lf = []

for e, param_lf in enumerate(params_lf):
    # Build the collate function and dataset for this fold, then keep only the
    # second dataloader returned for param_lf['fold_idx'].
    collate_fn = Collate(tokenizer_lf, purpose='train')
    dataset_lf = LongformerDataset(data_lf, param_lf, purpose='train')
    _, val_dataloader_lf = dataset_lf.get_dataloaders(collate_fn, param_lf['fold_idx'])

    # Load this fold's model and run it over that dataloader.
    model_lf = load_longformer(param_lf)
    fold_probs_lf = model_lf.get_words_probabilities(val_dataloader_lf)
    word_probs_lf.append(fold_probs_lf)

    print(f'{e}. finished!')

In [None]:
# Merge the per-fold mappings into one dict and pickle it.
# Every entry of `params_lf` must have produced a result; otherwise the saved
# file would be missing part of the data.
if len(word_probs_lf) != len(params_lf):
    raise ValueError(
        f'expected {len(params_lf)} fold results, got {len(word_probs_lf)}'
    )

# Merge however many folds were run instead of indexing folds 0..4 by hand.
# On duplicate keys the value from the later fold wins.
z = {}
for fold_probs in word_probs_lf:
    z.update(fold_probs)

with open('word_probs_lf.pickle', 'wb') as handle:
    pickle.dump(z, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 2. RoBERTa

In [None]:
# Read the preprocessed RoBERTa table from disk.
RL_DATA_PATH = '/DATA/backe/feedback/data/roberta_preprocessed.csv'
data_rl = pd.read_csv(RL_DATA_PATH)

# These four columns are evaluated from their CSV text back into Python objects.
# NOTE(review): `eval` executes whatever text a cell contains - only run this on
# trusted files. If the cells are plain literals, `ast.literal_eval` would be safer.
for _col in ('input_ids', 'attention_mask', 'token_to_word', 'target'):
    data_rl[_col] = data_rl[_col].apply(eval)

In [None]:
# One directory per RoBERTa fold.
RF_PATH_0 = '/DATA/backe/feedback/roberta/roberta-0fold/'
RF_PATH_1 = '/DATA/backe/feedback/roberta/roberta-1fold/'
RF_PATH_2 = '/DATA/backe/feedback/roberta/roberta-2fold/'
RF_PATH_3 = '/DATA/backe/feedback/roberta/roberta-3fold/'
RF_PATH_4 = '/DATA/backe/feedback/roberta/roberta-4fold/'

# Put every fold directory on the import path so the modules below can be found.
sys.path.extend([RF_PATH_0, RF_PATH_1, RF_PATH_2, RF_PATH_3, RF_PATH_4])

from param_roberta_0 import param as param_rl_0
from param_roberta_1 import param as param_rl_1
from param_roberta_2 import param as param_rl_2
from param_roberta_3 import param as param_rl_3
from param_roberta_4 import param as param_rl_4

# Store each fold's directory on its param object under 'kaggle_path'.
for _fold_param, _fold_dir in zip(
    (param_rl_0, param_rl_1, param_rl_2, param_rl_3, param_rl_4),
    (RF_PATH_0, RF_PATH_1, RF_PATH_2, RF_PATH_3, RF_PATH_4),
):
    _fold_param['kaggle_path'] = _fold_dir

# NOTE(review): `dataset_roberta` / `model_roberta` resolve to the first
# sys.path entry that contains them - presumably every fold directory ships an
# identical copy; verify.
from transformers import RobertaConfig, RobertaTokenizerFast
from dataset_roberta import RobertaDataset
from model_roberta import load_roberta

# The tokenizer is built from fold 0's 'model_name'.
# NOTE(review): assumes all folds share the same base model - confirm.
tokenizer_rl = RobertaTokenizerFast.from_pretrained(param_rl_0['model_name'])

In [None]:
# The five imported RoBERTa param objects, iterated in this order.
params_rl = [param_rl_0, param_rl_1, param_rl_2, param_rl_3, param_rl_4]

# Collects one `get_words_probabilities` result per entry of `params_rl`.
word_probs_rl = []

for e, param_rl in enumerate(params_rl):
    # Build the dataset for this fold and keep only the second dataloader
    # returned for param_rl['fold_idx'].
    dataset_rl = RobertaDataset(data_rl, tokenizer_rl, param_rl)
    _, val_dataloader_rl = dataset_rl.get_dataloaders(param_rl['fold_idx'])

    # Load this fold's model and run it over that dataloader.
    model_rl = load_roberta(param_rl)
    fold_probs_rl = model_rl.get_words_probabilities(val_dataloader_rl)
    word_probs_rl.append(fold_probs_rl)

    print(f'{e}. finished!')

In [None]:
# Merge the per-fold mappings into one dict and pickle it.
# Every entry of `params_rl` must have produced a result; otherwise the saved
# file would be missing part of the data.
if len(word_probs_rl) != len(params_rl):
    raise ValueError(
        f'expected {len(params_rl)} fold results, got {len(word_probs_rl)}'
    )

# Merge however many folds were run instead of indexing folds 0..4 by hand.
# On duplicate keys the value from the later fold wins.
z = {}
for fold_probs in word_probs_rl:
    z.update(fold_probs)

with open('word_probs_rl.pickle', 'wb') as handle:
    pickle.dump(z, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 3. DeBERTa

In [2]:
# Read the preprocessed DeBERTa table from disk.
DB_DATA_PATH = '/DATA/backe/feedback/data/deberta_preprocessed.csv'
data_db = pd.read_csv(DB_DATA_PATH)

# These four columns are evaluated from their CSV text back into Python objects.
# NOTE(review): `eval` executes whatever text a cell contains - only run this on
# trusted files. If the cells are plain literals, `ast.literal_eval` would be safer.
for _col in ('input_ids', 'attention_mask', 'token_to_word', 'target'):
    data_db[_col] = data_db[_col].apply(eval)

In [3]:
# One directory per DeBERTa fold.
DB_PATH_0 = '/DATA/backe/feedback/deberta/deberta-0fold/'
DB_PATH_1 = '/DATA/backe/feedback/deberta/deberta-1fold/'
DB_PATH_2 = '/DATA/backe/feedback/deberta/deberta-2fold/'
DB_PATH_3 = '/DATA/backe/feedback/deberta/deberta-3fold/'
DB_PATH_4 = '/DATA/backe/feedback/deberta/deberta-4fold/'

# Put every fold directory on the import path so the modules below can be found.
sys.path.extend([DB_PATH_0, DB_PATH_1, DB_PATH_2, DB_PATH_3, DB_PATH_4])

from param_deberta_0 import param as param_db_0
from param_deberta_1 import param as param_db_1
from param_deberta_2 import param as param_db_2
from param_deberta_3 import param as param_db_3
from param_deberta_4 import param as param_db_4

# Store each fold's directory on its param object under 'kaggle_path'.
for _fold_param, _fold_dir in zip(
    (param_db_0, param_db_1, param_db_2, param_db_3, param_db_4),
    (DB_PATH_0, DB_PATH_1, DB_PATH_2, DB_PATH_3, DB_PATH_4),
):
    _fold_param['kaggle_path'] = _fold_dir

# NOTE(review): `dataset_deberta` / `model_deberta` resolve to the first
# sys.path entry that contains them - presumably every fold directory ships an
# identical copy; verify.
from transformers import DebertaConfig, DebertaTokenizerFast
from dataset_deberta import DebertaDataset
from model_deberta import load_deberta

# The tokenizer is built from fold 0's 'model_name'.
# NOTE(review): assumes all folds share the same base model - confirm.
tokenizer_db = DebertaTokenizerFast.from_pretrained(param_db_0['model_name'])

In [4]:
# The five imported DeBERTa param objects, iterated in this order.
params_db = [param_db_0, param_db_1, param_db_2, param_db_3, param_db_4]

# Collects one `get_words_probabilities` result per entry of `params_db`.
word_probs_db = []

for e, param_db in enumerate(params_db):
    # Build the dataset for this fold and keep only the second dataloader
    # returned for param_db['fold_idx'].
    dataset_db = DebertaDataset(data_db, tokenizer_db, param_db)
    _, val_dataloader_db = dataset_db.get_dataloaders(param_db['fold_idx'])

    # Load this fold's model and run it over that dataloader.
    model_db = load_deberta(param_db)
    fold_probs_db = model_db.get_words_probabilities(val_dataloader_db)
    word_probs_db.append(fold_probs_db)

    print(f'{e}. finished!')

0. finished!
1. finished!
2. finished!
3. finished!
4. finished!


In [5]:
# Merge the per-fold mappings into one dict and pickle it.
# Every entry of `params_db` must have produced a result; otherwise the saved
# file would be missing part of the data.
if len(word_probs_db) != len(params_db):
    raise ValueError(
        f'expected {len(params_db)} fold results, got {len(word_probs_db)}'
    )

# Merge however many folds were run instead of indexing folds 0..4 by hand.
# On duplicate keys the value from the later fold wins.
z = {}
for fold_probs in word_probs_db:
    z.update(fold_probs)

with open('word_probs_db.pickle', 'wb') as handle:
    pickle.dump(z, handle, protocol=pickle.HIGHEST_PROTOCOL)