In [1]:
# ========================================
# library
# ========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import mean_squared_error
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
import transformers
from transformers import LongformerTokenizer, LongformerModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
import logging
from ast import literal_eval
import sys
from contextlib import contextmanager
import time
import random
from tqdm import tqdm
import os

2022-03-28 22:09:52.959807: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# ==================
# Constant
# ==================
# Input locations, all relative to the notebook's working directory.
TRAIN_PATH = "../data/train.csv"               # table read with pandas in the "Main" cell
DATA_DIR = "../data/longformer-large-4096/"    # prefix of the cached .npy arrays
DATA_PATH = "../data/train/"                   # prefix of the per-id .txt files

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# ==================
# Output / experiment paths
# ==================
# Directory that receives every .npy artifact written by the final cell.
OUTPUT_DIR = "../output/team_share"
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Experiment numbers whose per-fold out-of-fold predictions are loaded below.
ex_pred1 = "019"
ex_pred2 = "046"
ex_pred3 = "048"
ex_pred4 = "051"
ex_pred5 = "064"
ex_pred6 = "067"
# Each experiment's files live under ../output/exp/ex<number>.
pred1_path = f"../output/exp/ex{ex_pred1}"
pred2_path = f"../output/exp/ex{ex_pred2}"
pred3_path = f"../output/exp/ex{ex_pred3}"
pred4_path = f"../output/exp/ex{ex_pred4}"
pred5_path = f"../output/exp/ex{ex_pred5}"
pred6_path = f"../output/exp/ex{ex_pred6}"

In [4]:
# ===============
# Configs
# ===============
# --- cross-validation split (consumed by KFold further down) ---
SEED = 0          # random_state of the splitter
N_SPLITS = 5      # number of folds
SHUFFLE = True    # shuffle ids before splitting

# --- tokenisation ---
max_len = 2048    # truncation / padding length; also part of the cached .npy file names
MODEL_PATH = "allenai/longformer-large-4096"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# NOTE(review): not referenced by any other visible cell.
LABEL_ALL_SUBTOKENS = True

In [5]:
# ===============
# Functions
# ===============
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode with autotuning (``benchmark``) turned off.

    Args:
        seed: value handed to every seeding call.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every generator: stdlib, NumPy, torch (CPU) and torch (CUDA).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [6]:
class TestDataset(Dataset):
    """Dataset that reads one text file per id and tokenizes it to a fixed length.

    Args:
        ids: indexable collection of ids; item ``i`` is read from
            ``f'{DATA_PATH}{ids[i]}.txt'``.
        max_len: length each sample is truncated / padded to by the tokenizer.
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``
            and ``attention_mask``.
    """

    def __init__(self, ids, max_len, tokenizer):
        self.ids = ids
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return ``{'token': LongTensor, 'mask': LongTensor}`` for one document."""
        name = f'{DATA_PATH}{self.ids[index]}.txt'
        # Context manager closes the handle right away; a bare open().read()
        # leaves one open file per sample until garbage collection.
        with open(name, 'r') as fh:
            txt = fh.read()
        # NOTE(review): offsets are requested but never read in this class.
        tokens = self.tokenizer.encode_plus(txt, max_length=self.max_len, padding='max_length',
                                   truncation=True, return_offsets_mapping=True)
        return {
          'token': torch.tensor(tokens['input_ids'], dtype=torch.long),
          'mask': torch.tensor(tokens['attention_mask'], dtype=torch.long),
           }

    def __len__(self):
        """Number of ids in the dataset."""
        return len(self.ids)

In [7]:
# Integer class index -> label name (index = position in the list).
target_map_rev = dict(enumerate([
    'Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
    'Counterclaim', 'Rebuttal', 'blank',
]))

def collate(d, train=True):
    """Trim a padded batch to the longest attended sequence it contains.

    Args:
        d: batch dict holding ``'token'`` and ``'mask'`` (plus ``'y'`` when
            ``train`` is true); each is sliced as ``[:, :length]``.
        train: when true the ``'y'`` entry is trimmed and returned as well.

    Returns:
        dict with the trimmed entries and ``'max_len'``, the largest per-row
        sum of ``d['mask']`` as a Python int.
    """
    longest = int(d["mask"].sum(axis=1).max())
    wanted = ("token", "mask", "y") if train else ("token", "mask")
    batch = {key: d[key][:, :longest] for key in wanted}
    batch["max_len"] = longest
    return batch

In [8]:
# ================================
# Main
# ================================
# Read the training table, then keep each distinct value of its `id` column once.
train = pd.read_csv(TRAIN_PATH)
IDS = train["id"].unique()
# Separate NumPy copy of the ids; this is what the fold splitter indexes into.
id_array = np.array(IDS)

In [9]:
# Cached arrays stored under DATA_DIR, one file per kind, keyed by max_len.
# NOTE(review): none of these three arrays is referenced by the other visible cells.
targets, train_tokens, train_attention = (
    np.load(DATA_DIR + file_name)
    for file_name in (
        f"targets_{max_len}.npy",
        f"tokens_{max_len}.npy",
        f"attention_{max_len}.npy",
    )
)

In [10]:
# ================================
# Padded length and id of every validation sample, fold by fold
# ================================
# No model is trained or run here: for each fold's validation ids the texts
# are tokenized, batched, and the batch-wide trimmed length from collate() is
# recorded once per sample.
#
# Pieces are collected in lists and concatenated once at the end instead of
# re-allocating the growing arrays on every batch.
pred_len_parts = []
pred_id_parts = []
kf = KFold(n_splits=N_SPLITS, shuffle=SHUFFLE, random_state=SEED)
for fold, (train_idx, valid_idx) in enumerate(kf.split(id_array)):
    print(f"fold{fold}:start")
    x_val_id = id_array[valid_idx]
    pred_dataset = TestDataset(x_val_id, max_len, tokenizer)
    pred_loader = DataLoader(pred_dataset,
                             batch_size=8,
                             shuffle=False,
                             pin_memory=True, drop_last=False)
    with torch.no_grad():
        for d in tqdm(pred_loader, total=len(pred_loader)):
            d = collate(d, train=False)
            ids = d['token']
            # Every sample of the batch gets the same (batch-max) length.
            # float64 keeps the dtype of the saved pred_len.npy unchanged.
            pred_len_parts.append(np.full(len(ids), d["max_len"], dtype=np.float64))
    pred_id_parts.append(x_val_id)

# Fold order is preserved, so row k of pred_len belongs to pred_id[k].
pred_len = np.concatenate(pred_len_parts, axis=0)
pred_id = np.concatenate(pred_id_parts, axis=0)

fold0:start


100%|██████████| 390/390 [00:10<00:00, 37.86it/s]


fold1:start


100%|██████████| 390/390 [00:06<00:00, 62.67it/s]


fold2:start


100%|██████████| 390/390 [00:06<00:00, 61.62it/s]


fold3:start


100%|██████████| 390/390 [00:06<00:00, 61.94it/s]


fold4:start


100%|██████████| 390/390 [00:06<00:00, 61.82it/s]


In [11]:
# pred1
# Load one out-of-fold array per fold for experiment ex_pred1 and join them
# with a single concatenate. The fold count comes from N_SPLITS so it cannot
# drift from the KFold configuration used for pred_len / pred_id.
oof_parts = [
    np.load(pred1_path + f"/ex{ex_pred1}_oof_npy_{i}.npy")
    for i in range(N_SPLITS)
]
oof_pred1 = np.concatenate(oof_parts, axis=0)
# Fold number of every row of oof_pred1 (row count = len() of each fold's array).
# float64 keeps the dtype of the saved fold.npy unchanged.
fold = np.concatenate(
    [np.full(len(part), i, dtype=np.float64) for i, part in enumerate(oof_parts)],
    axis=0,
)

In [12]:
# pred2
# Load one out-of-fold array per fold for experiment ex_pred2 and join them
# with a single concatenate; the fold count follows N_SPLITS instead of a
# hard-coded 5.
oof_parts = [
    np.load(pred2_path + f"/ex{ex_pred2}_oof_npy_{i}.npy")
    for i in range(N_SPLITS)
]
oof_pred2 = np.concatenate(oof_parts, axis=0)

In [13]:
# pred3
# Load one out-of-fold array per fold for experiment ex_pred3 and join them
# with a single concatenate; the fold count follows N_SPLITS instead of a
# hard-coded 5.
oof_parts = [
    np.load(pred3_path + f"/ex{ex_pred3}_oof_npy_{i}.npy")
    for i in range(N_SPLITS)
]
oof_pred3 = np.concatenate(oof_parts, axis=0)

In [14]:
# pred4
# Load one out-of-fold array per fold for experiment ex_pred4 and join them
# with a single concatenate; the fold count follows N_SPLITS instead of a
# hard-coded 5.
oof_parts = [
    np.load(pred4_path + f"/ex{ex_pred4}_oof_npy_{i}.npy")
    for i in range(N_SPLITS)
]
oof_pred4 = np.concatenate(oof_parts, axis=0)

In [15]:
# pred5
# Load one out-of-fold array per fold for experiment ex_pred5 and join them
# with a single concatenate; the fold count follows N_SPLITS instead of a
# hard-coded 5.
oof_parts = [
    np.load(pred5_path + f"/ex{ex_pred5}_oof_npy_{i}.npy")
    for i in range(N_SPLITS)
]
oof_pred5 = np.concatenate(oof_parts, axis=0)

In [16]:
# pred6
# Load one out-of-fold array per fold for experiment ex_pred6 and join them
# with a single concatenate; the fold count follows N_SPLITS instead of a
# hard-coded 5.
oof_parts = [
    np.load(pred6_path + f"/ex{ex_pred6}_oof_npy_{i}.npy")
    for i in range(N_SPLITS)
]
oof_pred6 = np.concatenate(oof_parts, axis=0)

In [17]:
# Write every collected array into OUTPUT_DIR: (file name, array) pairs,
# saved in the order listed.
artifacts = (
    ("/ex019_longformer_large_2048.npy", oof_pred1),
    ("/ex046_roberta_large_512.npy", oof_pred2),
    ("/ex048_bart_large_512.npy", oof_pred3),
    ("/ex051_funnel_large_512.npy", oof_pred4),
    ("/ex064_distilbart_cnn_12_6_512.npy", oof_pred5),
    ("/ex067_deberta_large_1024.npy", oof_pred6),
    ("/pred_len.npy", pred_len),
    ("/pred_id.npy", pred_id),
    ("/fold.npy", fold),
)
for file_name, array in artifacts:
    np.save(OUTPUT_DIR + file_name, array)