In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #disable all tensorflow logging output
os.environ["CUDA_VISIBLE_DEVICES"]="0" #0,1,2,3 for four gpu

import gc
import pickle

import numpy as np
import pandas as pd

import tensorflow as tf
from transformers import *

from datasets import list_datasets, load_dataset

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# USE MULTIPLE GPUS
if os.environ["CUDA_VISIBLE_DEVICES"].count(',') == 0:
    strategy = tf.distribute.get_strategy()
    print('single strategy')
else:
    strategy = tf.distribute.MirroredStrategy()
    print('multiple strategy')

Num GPUs Available:  1
single strategy


In [3]:
train = pd.read_csv('input/feedback-prize-2021/train.csv')
print('df shape', train.shape)
print('discourse types: ', train['discourse_type'].unique())
print('mean len: ', train['discourse_end'].mean())
train.head()

df shape (144293, 8)
discourse types:  ['Lead' 'Position' 'Evidence' 'Claim' 'Concluding Statement'
 'Counterclaim' 'Rebuttal']
mean len:  1200.791202622442


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [4]:
# tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
# # tokens = tokenizer.encode_plus("rejects", max_length=16, padding='max_length',
# #                                 truncation=True, return_offsets_mapping=True)
# tokens = tokenizer.encode_plus("This is a test")
# ids = tokens['input_ids']
# mask = tokens['attention_mask']
# print(ids)
# print(mask)

# tokenizer.convert_ids_to_tokens([29])

In [4]:
def load_conll2003(set_='train'):
    dataset = load_dataset("conll2003", revision="master")
    tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
    seq_len = 128
    
    # saved vars
    saved = {
        'ids': np.ones((len(dataset[set_]['tokens']), seq_len), dtype='int32'),
        'attention': np.zeros((len(dataset[set_]['tokens']), seq_len), dtype='int32'),
        'pos_tags': np.zeros((len(dataset[set_]['tokens']), seq_len, 48), dtype='int32'),
        'chunk_tags': np.zeros((len(dataset[set_]['tokens']), seq_len, 24), dtype='int32'),
        'ner_tags': np.zeros((len(dataset[set_]['tokens']), seq_len, 10), dtype='int32'),
        }
    
    for i in range(len(dataset[set_]['tokens'])):
        print(i)
        idx = 0
        for j in range(len(dataset[set_]['tokens'][i])):
            tokens = tokenizer.encode_plus(dataset[set_]['tokens'][i][j], max_length=16, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
            
            # labels
            pt = dataset[set_]['pos_tags'][i][j]
            ct = dataset[set_]['chunk_tags'][i][j]
            nt = dataset[set_]['ner_tags'][i][j]
            
            # assign samples
            if idx >= seq_len:
                break
            for k in range(len(tokens['input_ids'][1:])):
                if tokens['input_ids'][k] in [1, 2] or idx >= seq_len:
                    break
                saved['ids'][i][idx] = tokens['input_ids'][k]
                saved['attention'][i][idx] = 1
                saved['pos_tags'][i][idx][pt] = 1
                saved['chunk_tags'][i][idx][ct] = 1
                saved['ner_tags'][i][idx][nt] = 1
                idx += 1
    
    # set other label
    saved['pos_tags'][:, :, -1] = 1 - np.max(saved['pos_tags'], axis=-1)
    saved['chunk_tags'][:, :, -1] = 1 - np.max(saved['chunk_tags'], axis=-1)
    saved['ner_tags'][:, :, -1] = 1 - np.max(saved['ner_tags'], axis=-1)
    
    return saved

def integrate_conll2003():
    # need 15594 samples
    print('tokenizing train set...')
    train_saved = load_conll2003(set_='train')
    print('tokenizing validation set...')
    val_saved = load_conll2003(set_='validation')
    
    # concatenate
    num_add = 15594 - len(train_saved['ids'])
    print('Num to concatenate {} ...'.format(num_add))
    train_saved['ids'] = np.concatenate((train_saved['ids'], val_saved['ids'][:num_add, :]), axis=0)
    train_saved['attention'] = np.concatenate((train_saved['attention'], val_saved['attention'][:num_add, :]), axis=0)
    train_saved['pos_tags'] = np.concatenate((train_saved['pos_tags'], val_saved['pos_tags'][:num_add, :, :]), axis=0)
    train_saved['chunk_tags'] = np.concatenate((train_saved['chunk_tags'], val_saved['chunk_tags'][:num_add, :, :]), axis=0)
    train_saved['ner_tags'] = np.concatenate((train_saved['ner_tags'], val_saved['ner_tags'][:num_add, :, :]), axis=0)
    
    # save
    print('Saving...')
    with open('tokenized_data_longformer_conll2003.pkl', 'wb') as f:
        pickle.dump(train_saved, f)
        

def load_wiki_data(path='../input/dbpedia-classes/DBPEDIA_train.csv', MODEL_NAME="bert-base-cased", MAX_LEN=1024):
    # load csv file
    df = pd.read_csv(path)
    
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # saved vars
    saved = {
        'ids': np.zeros((len(df['text']), MAX_LEN), dtype='int32'),
        'attention': np.zeros((len(df['text']), MAX_LEN), dtype='int32'),
        'labels_l1': None,
        'labels_l2': None,
        'labels_l3': None
        }
    
    # label index
    label_to_ind = {
        'l1': dict(),
        'l2': dict(),
        'l3': dict()
    }
    
    for level in ['l1', 'l2', 'l3']:
        ind = 0
        for label in wiki_train[level].unique(): # l1, l2, l3
            label_to_ind[level][label] = ind
            ind += 1
        saved['labels_{}'.format(level)] = np.zeros((len(df['text']), len(label_to_ind[level])), dtype='int32')

    # tokenize
    for i in range(len(df['text'])):
        tokens = tokenizer.encode_plus(df['text'][i], max_length=MAX_LEN, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
        saved['ids'][i, :] = tokens['input_ids']
        saved['attention'][i, :] = tokens['attention_mask']
        for level in ['l1', 'l2', 'l3']:
            saved['labels_{}'.format(level)][i, label_to_ind[level][df[level][i]]] = 1
    
    # save
    with open('tokenized_test_data_longformer_wiki.pkl', 'wb') as f:
        pickle.dump(saved, f)
    
# wiki_train = pd.read_csv('../input/dbpedia-classes/DBPEDIA_train.csv')
# print('df shape', wiki_train.shape)
# print([k for k in wiki_train])
# print('discourse types: ', wiki_train['l1'].unique())
# print('mean len: ', np.mean([len(i.split()) for i in wiki_train['text']]))
# for level in ["l1", "l2", "l3"]:
#     print("{} len {}".format(level, len(wiki_train[level].unique())))
# wiki_train.head()

# MAX_LEN = 512
# load_wiki_data(path='../input/dbpedia-classes/DBPEDIA_test.csv', MODEL_NAME='../input/feedbacksaved/LongFormer', MAX_LEN=MAX_LEN)

# integrate_conll2003()

In [7]:
ends = [text.split()[-1][-1] for text in train['discourse_text']]
end_count = dict()
for end in ends:
    if end_count.get(end) == None:
        end_count[end] = 1
    else:
        end_count[end] += 1
# for end in end_count:
#     print('{}: {}'.format(end, end_count[end]))
for t in sorted(end_count.items(), key=lambda x: x[1], reverse=True):
    print(t)

('.', 98659)
('s', 8154)
('e', 6753)
(',', 5586)
('t', 2990)
('?', 2942)
('n', 2400)
('y', 1965)
('d', 1935)
('"', 1767)
('g', 1599)
('l', 1478)
('!', 1279)
('r', 1266)
('m', 1190)
('o', 728)
('k', 624)
('h', 559)
('a', 394)
('p', 293)
('u', 252)
(')', 251)
('w', 215)
('c', 180)
(';', 152)
('f', 144)
("'", 85)
('b', 67)
('1', 47)
(':', 40)
('2', 27)
('A', 21)
('0', 17)
('C', 17)
('i', 16)
('B', 16)
('E', 16)
('S', 12)
('5', 12)
('x', 11)
('3', 10)
('¨', 10)
('\x94', 10)
('-', 10)
('6', 10)
('8', 9)
('9', 8)
('7', 7)
('D', 7)
('4', 6)
('I', 6)
(']', 6)
('R', 5)
('%', 4)
('\\', 4)
('O', 3)
('/', 2)
('*', 2)
('z', 2)
('(', 2)
('Y', 1)
('~', 1)
('®', 1)
('G', 1)
('N', 1)
('+', 1)
('T', 1)
('H', 1)
('L', 1)
('K', 1)
('v', 1)


In [8]:
labels = [l for l in train['discourse_type']]
label_count = dict()
for label in labels:
    if label_count.get(label) == None:
        label_count[label] = 1
    else:
        label_count[label] += 1

'''
Claim, Counterclaim, Rebuttal ~ 60k

Lead, Position, Concluding Statement ~38k

Evidence ~ 45k
'''

for t in sorted(label_count.items(), key=lambda x: x[1], reverse=True):
    print(t)

('Claim', 50208)
('Evidence', 45702)
('Position', 15419)
('Concluding Statement', 13505)
('Lead', 9305)
('Counterclaim', 5817)
('Rebuttal', 4337)


In [69]:
# functions for loading and train/val data

def load_train_data(MODEL_NAME="bert-base-cased", MAX_LEN=1024):
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # load csv file
    df = pd.read_csv('input/feedback-prize-2021/train.csv')
    IDS = df.id.unique()
    train_ids = np.zeros((len(IDS), MAX_LEN), dtype='int32')
    train_attention = np.zeros((len(IDS), MAX_LEN), dtype='int32')
    
    # init labels
    label_to_ind = {
        'Lead_b': 0,
        'Lead_i': 1,
        'Position_b': 2,
        'Position_i': 3,
        'Evidence_b': 4,
        'Evidence_i': 5,
        'Claim_b': 6,
        'Claim_i': 7,
        'Concluding Statement_b': 8,
        'Concluding Statement_i': 9,
        'Counterclaim_b': 10,
        'Counterclaim_i': 11,
        'Rebuttal_b': 12,
        'Rebuttal_i': 13,
        'other': 14
    }    
    train_labels = np.zeros((len(IDS), MAX_LEN, len(label_to_ind)), dtype='int32')
    
    # form samples
    for i in range(len(IDS)):
        if i % 1000 == 0:
            print(i)
        # read txt file
        filename = 'input/feedback-prize-2021/train/{}.txt'.format(IDS[i])
        txt = open(filename, 'r').read()
        
        # tokenize
        tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
        train_ids[i, :] = tokens['input_ids']
        train_attention[i, :] = tokens['attention_mask']
        offsets = tokens['offset_mapping']
        
        # extract labels for each token
        curr_df = df.loc[df.id==IDS[i]]
        offset_ind = 0
        for index,row in curr_df.iterrows():
            label = row.discourse_type + '_b'
            
            w_start = row.discourse_start
            w_end = row.discourse_end
            
            if offset_ind >= len(offsets):
                break
            
            # set labels
            t_start = offsets[offset_ind][0]
            while w_end > t_start:
                # exit condition
                if offset_ind >= len(offsets):
                    break
                
                # get current token index
                t_start = offsets[offset_ind][0]
                t_end = offsets[offset_ind][1]
                
                # set label if within range
                if t_end <= w_end:
                    train_labels[i, offset_ind, label_to_ind[label]] = 1
                    label = row.discourse_type + '_i'
                
                # update global var(s)
                offset_ind += 1
    train_labels[:, :, 14] = 1 - np.max(train_labels, axis=-1)
    return train_ids, train_attention, train_labels

def load_test_data(MODEL_NAME="bert-base-cased", MAX_LEN=1024, INDEX = range(5)):
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    df = pd.read_csv('input/feedback-prize-2021/train.csv')
    IDS = df.id.unique()
    IDS = IDS[INDEX]
    train_ids = np.zeros((len(IDS), MAX_LEN), dtype='int32')
    train_attention = np.zeros((len(IDS), MAX_LEN), dtype='int32')

    label_to_ind = {
        'Lead_b': 0,
        'Lead_i': 1,
        'Position_b': 2,
        'Position_i': 3,
        'Evidence_b': 4,
        'Evidence_i': 5,
        'Claim_b': 6,
        'Claim_i': 7,
        'Concluding Statement_b': 8,
        'Concluding Statement_i': 9,
        'Counterclaim_b': 10,
        'Counterclaim_i': 11,
        'Rebuttal_b': 12,
        'Rebuttal_i': 13,
        'other': 14
    }
    train_labels = np.zeros((len(IDS), MAX_LEN, len(label_to_ind)), dtype='int32')
    # form samples
    for i in range(len(IDS)):
        if i % 1000 == 0:
            print(i)
        # read txt file
        filename = 'input/feedback-prize-2021/train/{}.txt'.format(IDS[i])
        txt = open(filename, 'r').read()
        
        # tokenize
        tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
        train_ids[i, :] = tokens['input_ids']
        train_attention[i, :] = tokens['attention_mask']
        offsets = tokens['offset_mapping']

        # extract labels for each token
        curr_df = df.loc[df.id==IDS[i]]
        offset_ind = 0
        for index,row in curr_df.iterrows():
            label = row.discourse_type + '_b'

            w_start = row.discourse_start
            w_end = row.discourse_end

            if offset_ind >= len(offsets):
                break

            # set labels
            t_start = offsets[offset_ind][0]
            while w_end > t_start:
                # exit condition
                if offset_ind >= len(offsets):
                    break

                # get current token index
                t_start = offsets[offset_ind][0]
                t_end = offsets[offset_ind][1]

                # set label if within range
                if t_end <= w_end:
                    train_labels[i, offset_ind, label_to_ind[label]] = 1
                    label = row.discourse_type + '_i'

                # update global var(s)
                offset_ind += 1
    train_labels[:, :, 14] = 1 - np.max(train_labels, axis=-1)
    
    return train_ids, train_attention, IDS, train_labels

In [87]:
def download_save_model(MODEL_NAME="allenai/longformer-base-4096"):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)
    backbone.trainable = True
    
    # save the model
    os.mkdir('model')
    backbone.save_pretrained('model')
    config.save_pretrained('model')
    tokenizer.save_pretrained('model')

# connection port
def build_model(MODEL_NAME="allenai/longformer-base-4096", MAX_LEN=1024, LR=1e-4):
#     model = LongFormer(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR) # baseline
    model = LongFormerMultitask(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR) # baseline
#   model = LongFormerMultitaskWiki(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR) # baseline
    return model

# define models
def LongFormerMultitask(MODEL_NAME="allenai/longformer-base-4096", MAX_LEN=1024, LR=1e-4):
    # construct input
    input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
    mask = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')
    
    # pretrained/finetuned model (Transformers)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)
    backbone.trainable = True
    
    # downstream output layer(s)
    out = backbone(input_ids, attention_mask=mask)[0]
#     out = tf.keras.layers.Dense(256, activation='relu')(out)

    # multitask configuration
    tasks = ["main_task"]
    out_size = [15]
    tasks_weight = [1.0]

    tasks = ["main_task", "coarse_class", "binary_class"]
    out_size = np.array([15, 7, 3])
    tasks_weight = [1.0, 0.6, 0.4]
#     tasks_weight = np.exp(out_size/5) / np.exp(out_size/5).sum()
    print('task weigts', tasks_weight)

    # construct multihead output
    outputs = list()
    loss = dict()
    loss_weights = dict()
    mets = dict()

    for i in range(len(tasks)):
        subout = tf.keras.layers.Dense(256, activation='relu')(out)
        outputs.append(tf.keras.layers.Dense(out_size[i], activation='softmax', dtype='float32', name=tasks[i])(subout))
        loss[tasks[i]] = tf.keras.losses.CategoricalCrossentropy()
        loss_weights[tasks[i]] = tasks_weight[i]
        mets[tasks[i]] = tf.keras.metrics.CategoricalAccuracy()
    
    # integration
    model = tf.keras.Model(inputs=[input_ids,mask], outputs=outputs)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = loss,
                  metrics = mets,
                  loss_weights = loss_weights
                 )
    
    return model

def LongFormerMultitaskWiki(MODEL_NAME="allenai/longformer-base-4096", MAX_LEN=1024, LR=1e-4):
    # construct input
    input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
    mask = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')
    
    wiki_input_ids = tf.keras.layers.Input(shape=(512,), name='wiki_input_ids', dtype='int32')
    wiki_mask = tf.keras.layers.Input(shape=(512,), name='wiki_attention_mask', dtype='int32')
    
    # pretrained/finetuned model (Transformers)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)
    backbone.trainable = True
    
    # downstream output layer(s)
    out = backbone(input_ids, attention_mask=mask)[0]
    wiki_out = backbone(wiki_input_ids, attention_mask=wiki_mask)[0]
    
    # config multitasks
    tasks = {
        "main_task": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(15, activation="softmax"),
                ],
                name="main_task"
            )(out),
            "task_weight": 1.0,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "coarse_class": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(7, activation="softmax"),
                ],
                name="coarse_class"
            )(out),
            "task_weight": 0.6,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "binary_class": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(3, activation="softmax"),
                ],
                name="binary_class"
            )(out),
            "task_weight": 0.4,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "wiki_l3": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.GlobalAveragePooling1D(),
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(219, activation="softmax"),
                ],
                name="wiki_l3"
            )(wiki_out),
            "task_weight": 1.0,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "wiki_l2": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.GlobalAveragePooling1D(),
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(70, activation="softmax"),
                ],
                name="wiki_l2"
            )(wiki_out),
            "task_weight": 1e-1,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "wiki_l1": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.GlobalAveragePooling1D(),
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(9, activation="softmax"),
                ],
                name="wiki_l1"
            )(wiki_out),
            "task_weight": 1e-2,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
    }

    # construct multihead output
    outputs = [tasks[task]["out"] for task in tasks]
    loss = dict()
    loss_weights = dict()
    mets = dict()

    for task in tasks:
        loss[task] = tasks[task]["loss"]
        loss_weights[task] = tasks[task]["task_weight"]
        mets[task] = tasks[task]["met"]
    
    # integration
    model = tf.keras.Model(inputs=[input_ids, mask, wiki_input_ids, wiki_mask], outputs=outputs)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = loss,
                  metrics = mets,
                  loss_weights = loss_weights
                 )
    
    return model

def coarse_class(labels):
    N, L, C = labels.shape

    # 0, 1 are Argument;, 2, 3 are Declaration; 4, 5 are Evidence; 6 is other
    ind_to_ind = {
        6: 0, 
        7: 1,
        10: 0, 
        11: 1, 
        12: 0, 
        13: 1,
        0: 2,
        1: 3,
        2: 2, 
        3: 3,
        8: 2, 
        9: 3, 
        4: 4, 
        5: 5,
        14: 6
    }
    coarse_labels = np.zeros((N, L, 7), dtype='int32')
    old_labels = labels.argmax(axis=2)
    for i in range(N):
        for j in range(L):
            coarse_labels[i, j][ind_to_ind[old_labels[i, j]]] = 1
    return coarse_labels

def binary_class(labels):
    N, L, C = labels.shape

    # 0 for begin, 1 for inside, 2 for other
    ind_to_ind = {14: 2}
    for i in range(14):
        ind_to_ind[i] = 0 if i % 2 == 0 else 1

    binary_labels = np.zeros((N, L, 3), dtype='int32')
    old_labels = labels.argmax(axis=2)
    for i in range(N):
        for j in range(L):
            binary_labels[i, j][ind_to_ind[old_labels[i, j]]] = 1
    return binary_labels

In [88]:
gc.collect()
# MODEL_NAME = 'allenai/longformer-base-4096'
MODEL_NAME = 'input/LongFormer'
MAX_LEN = 1024

LR=0.25e-4
BATCH_SIZE = 2
EPOCHS = 5

# # processing data
# ids, attention, labels = load_train_data(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN)

# with open('tokenized_data_longformer.pkl', 'wb') as f:
#     saved = {
#         'train_ids': train_ids,
#         'train_attention': train_attention,
#         'train_labels': train_labels
#     }
#     pickle.dump(saved, f)

# load saved data and build model
with open('input/tokenized_data_longformer.pkl', 'rb') as f:
    saved = pickle.load(f)
    ids = saved['train_ids'][:, :MAX_LEN]
    attention = saved['train_attention'][:, :MAX_LEN]
    labels = saved['train_labels'][:, :MAX_LEN, :]

print('input seq shape', ids.shape)
print('attention shape', attention.shape)
print('labels shape', labels.shape)

with open('input/tokenized_train_data_longformer_wiki.pkl', 'rb') as f:
    saved = pickle.load(f)
    wiki_ids = saved['ids']
    wiki_attention = saved['attention']
    wiki_labels_l1 = saved['labels_l1']
    wiki_labels_l2 = saved['labels_l2']
    wiki_labels_l3 = saved['labels_l3']

print('wiki input seq shape', wiki_ids.shape)
print('wiki attention shape', wiki_attention.shape)
print('wiki labels l1 shape', wiki_labels_l1.shape)
print('wiki labels l2 shape', wiki_labels_l2.shape)
print('wiki labels l3 shape', wiki_labels_l3.shape)

# construct model
with strategy.scope():
    model = build_model(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR)
print(model.summary())

input seq shape (15594, 1024)
attention shape (15594, 1024)
labels shape (15594, 1024, 15)


loading configuration file input/LongFormer\config.json
Model config LongformerConfig {
  "_name_or_path": "input/LongFormer",
  "architectures": [
    "LongformerModel"
  ],
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

load

wiki input seq shape (240942, 512)
wiki attention shape (240942, 512)
wiki labels l1 shape (240942, 9)
wiki labels l2 shape (240942, 70)
wiki labels l3 shape (240942, 219)


Input ids are automatically padded from 5 to 512 to be a multiple of `config.attention_window`: 512
All model checkpoint layers were used when initializing TFLongformerModel.

All the layers of TFLongformerModel were initialized from the model checkpoint at input/LongFormer.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


task weigts [1.0, 0.6, 0.4]
Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 1024)]       0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 1024)]       0           []                               
                                                                                                  
 tf_longformer_model_22 (TFLong  TFLongformerBaseMod  148659456  ['input_ids[0][0]',              
 formerModel)                   elOutputWithPooling               'attention_mask[0][0]']         
                                (last_hidden_state=                                               
                                (None, 1024, 768),             

In [89]:
# construct labels
print(labels)
coarse_labels = coarse_class(labels)
binary_labels = binary_class(labels)

# ============================== SPLIT_LINE ==================================

# TRAIN VALID SPLIT 80% 20%
train_size = 0.8

# split dataset
np.random.seed(42)
inds = [i for i in range(len(ids))]
np.random.shuffle(inds)
split_point = int(train_size * len(inds))
train_idx = inds[:split_point]
val_idx = inds[split_point:]
print('Train size',len(train_idx),', Valid size',len(val_idx))

[[[1 0 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 [[0 0 1 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 [[1 0 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 ...

 [[1 0 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[1 0 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[1 0 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
Train size 12475 , Valid size 3119


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

with open('input/tokenized_train_data_longformer_wiki.pkl', 'rb') as f:
    saved = pickle.load(f)
    wiki_ids = saved['ids']
    wiki_attention = saved['attention']
    wiki_labels_l1 = saved['labels_l1']
    wiki_labels_l2 = saved['labels_l2']
    wiki_labels_l3 = saved['labels_l3']
    
wiki_inds = [i for i in range(len(wiki_ids))]

# in order to train data simoutenously, the size of datasets need to be the same
print(wiki_ids.shape)
sss = StratifiedShuffleSplit(n_splits=1, test_size=15594 / 240942, random_state=42)
for train_index, test_index in sss.split(wiki_inds, wiki_labels_l1):
    wiki_ids = wiki_ids[test_index]
    wiki_attention = wiki_attention[test_index]
    wiki_labels_l1 = wiki_labels_l1[test_index]
    wiki_labels_l2 = wiki_labels_l2[test_index]
    wiki_labels_l3 = wiki_labels_l3[test_index]
print(wiki_ids.shape)

wiki_inds = [i for i in range(len(wiki_ids))]
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(wiki_inds, wiki_labels_l1):
    wiki_train_idx = train_index
    wiki_val_idx = test_index
print('Train size',len(wiki_train_idx),', Valid size',len(wiki_val_idx))

In [90]:
gc.collect()

# train_labels = [labels[train_idx,]]
# val_labels = [labels[val_idx,]]

train_labels = [labels[train_idx,], coarse_labels[train_idx,], binary_labels[train_idx,]]
val_labels = [labels[val_idx,], coarse_labels[val_idx,], binary_labels[val_idx,]]

# train_labels = [
#     labels[train_idx,],
#     coarse_labels[train_idx,],
#     binary_labels[train_idx,],
#     wiki_labels_l3[wiki_train_idx,],
#     wiki_labels_l2[wiki_train_idx,],
#     wiki_labels_l1[wiki_train_idx,]
# ]
# val_labels = [
#     labels[val_idx,],
#     coarse_labels[val_idx,],
#     binary_labels[val_idx,],
#     wiki_labels_l3[wiki_val_idx,],
#     wiki_labels_l2[wiki_val_idx,],
#     wiki_labels_l1[wiki_val_idx,]
# ]

print('start training...')
model.fit(x = [ids[train_idx,], attention[train_idx,]],
          y = train_labels,
          validation_data = ([ids[val_idx,], attention[val_idx,]],
                             val_labels),
          epochs = 5,
          batch_size = 2,
          verbose = 2)

# SAVE MODEL WEIGHTS
model.save_weights('saved_model2.h5')

start training...
Epoch 1/5
6238/6238 - 1790s - loss: 1.4268 - main_task_loss: 0.8294 - coarse_class_loss: 0.7245 - binary_class_loss: 0.4066 - main_task_categorical_accuracy: 0.6691 - coarse_class_categorical_accuracy: 0.6792 - binary_class_categorical_accuracy: 0.7591 - val_loss: 1.3178 - val_main_task_loss: 0.7576 - val_coarse_class_loss: 0.6711 - val_binary_class_loss: 0.3941 - val_main_task_categorical_accuracy: 0.6994 - val_coarse_class_categorical_accuracy: 0.7089 - val_binary_class_categorical_accuracy: 0.7808 - 1790s/epoch - 287ms/step
Epoch 2/5
6238/6238 - 2120s - loss: 1.3159 - main_task_loss: 0.7568 - coarse_class_loss: 0.6699 - binary_class_loss: 0.3928 - main_task_categorical_accuracy: 0.6931 - coarse_class_categorical_accuracy: 0.7024 - binary_class_categorical_accuracy: 0.7680 - val_loss: 1.2903 - val_main_task_loss: 0.7389 - val_coarse_class_loss: 0.6579 - val_binary_class_loss: 0.3916 - val_main_task_categorical_accuracy: 0.7065 - val_coarse_class_categorical_accuracy

<a href="/kaggle/working/saved_model.h5"> Download File </a>

In [None]:
def TransferConll(MODEL_NAME="allenai/longformer-base-4096", MAX_LEN=1024, LR=1e-4):
    # construct input
    conll_input_ids = tf.keras.layers.Input(shape=(128,), name='conll_input_ids', dtype='int32')
    conll_mask = tf.keras.layers.Input(shape=(128,), name='conll_attention_mask', dtype='int32')

    # pretrained/finetuned model (Transformers)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)
    backbone.trainable = True

    # downstream output layer(s)
    conll_out = backbone(conll_input_ids, attention_mask=conll_mask)[0]

    # config multitasks
    tasks = {
        "POS": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(48, activation="softmax"),
                ],
                name="POS"
            )(conll_out),
            "task_weight": 1.0,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "CHUNK": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(24, activation="softmax"),
                ],
                name="CHUNK"
            )(conll_out),
            "task_weight": 1e-1,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "NER": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(10, activation="softmax"),
                ],
                name="NER"
            )(conll_out),
            "task_weight": 1e-2,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
    }

    # construct multihead output
    outputs = [tasks[task]["out"] for task in tasks]
    loss = dict()
    loss_weights = dict()
    mets = dict()

    for task in tasks:
        loss[task] = tasks[task]["loss"]
        loss_weights[task] = tasks[task]["task_weight"]
        mets[task] = tasks[task]["met"]

    # integration
    model = tf.keras.Model(inputs=[conll_input_ids, conll_mask], outputs=outputs)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = loss,
                  metrics = mets,
                  loss_weights = loss_weights
                 )

    return model
#MODEL_NAME = 'allenai/longformer-base-4096'
MODEL_NAME = 'input/LongFormer'
MAX_LEN = 1024
LR=0.25e-4

# build and load model
with strategy.scope():
    model = TransferConll(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR)
model.load_weights('saved_model.h5')
print(model.summary())
gc.collect()

with open('input/tokenized_data_longformer_conll2003.pkl', 'rb') as f:
    saved = pickle.load(f)
    conll_ids = saved['ids']
    conll_attention = saved['attention']
    conll_pos_tags = saved['pos_tags']
    conll_chunks=_tags = saved['chunk_tags']
    conll_ner_tags = saved['ner_tags']

train_size = 0.8
np.random.seed(42)
inds = [i for i in range(len(conll_ids))]
np.random.shuffle(inds)
split_point = int(train_size * len(inds))
train_idx = inds[:split_point]
val_idx = inds[split_point:]
print('Train size',len(train_idx),', Valid size',len(val_idx))

train_labels = [
    conll_pos_tags[train_idx,],
    conll_chunks[train_idx,],
    conll_ner_tags[train_idx,]
]

val_labels = [
    conll_pos_tags[val_idx,],
    conll_chunks[val_idx,],
    conll_ner_tags[val_idx,]
]


print('start training...')
model.fit(x = [conll_ids[train_idx,], conll_attention[train_idx,]],
          y = train_labels,
          validation_data = ([conll_ids[val_idx,], conll_attention[val_idx,]],
                             val_labels),
          epochs = 5,
          batch_size = 2,
          verbose = 2)

# SAVE MODEL WEIGHTS
model.save_weights('saved_model.h5')






def LongFormerMultitask(MODEL_NAME="allenai/longformer-base-4096", MAX_LEN=1024, LR=1e-4):
    # construct input
    input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
    mask = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')


    # pretrained/finetuned model (Transformers)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, name='tf_longformer_model', config=config)
    print(backbone.summary)
    backbone.trainable = True

    # downstream output layer(s)
    out = backbone(input_ids, attention_mask=mask)[0]

    # config multitasks
    tasks = {
        "main_task": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(15, activation="softmax"),
                ],
                name="main_task"
            )(out),
            "task_weight": 1.0,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "coarse_class": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(7, activation="softmax"),
                ],
                name="coarse_class"
            )(out),
            "task_weight": 0.6,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        },
        "binary_class": {
            "out": tf.keras.Sequential(
                [
                    tf.keras.layers.Dense(256, activation="relu"),
                    tf.keras.layers.Dense(3, activation="softmax"),
                ],
                name="binary_class"
            )(out),
            "task_weight": 0.4,
            "loss": tf.keras.losses.CategoricalCrossentropy(),
            "met": tf.keras.metrics.CategoricalAccuracy()
        }
    }

    # construct multihead output
    outputs = [tasks[task]["out"] for task in tasks]
    loss = dict()
    loss_weights = dict()
    mets = dict()

    for task in tasks:
        loss[task] = tasks[task]["loss"]
        loss_weights[task] = tasks[task]["task_weight"]
        mets[task] = tasks[task]["met"]

    # integration
    model = tf.keras.Model(inputs=[input_ids, mask], outputs=outputs)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = loss,
                  metrics = mets,
                  loss_weights = loss_weights
                 )

    return model
MODEL_NAME = 'input/LongFormer'
MODEL_WEIGHT = 'saved_model.h5'
MAX_LEN = 1024
LR=0.25e-4
import h5py

with open('input/tokenized_data_longformer.pkl', 'rb') as f:
    saved = pickle.load(f)
    ids = saved['train_ids'][:, :MAX_LEN]
    attention = saved['train_attention'][:, :MAX_LEN]
    labels = saved['train_labels'][:, :MAX_LEN, :]

# construct labels
coarse_labels = coarse_class(labels)
binary_labels = binary_class(labels)

# ============================== SPLIT_LINE ==================================

# TRAIN VALID SPLIT 80% 20%
train_size = 0.8

# split dataset
np.random.seed(42)
inds = [i for i in range(len(ids))]
np.random.shuffle(inds)
split_point = int(train_size * len(inds))
train_idx = inds[:split_point]
val_idx = inds[split_point:]
print('Train size',len(train_idx),', Valid size',len(val_idx))

print('input seq shape', ids.shape)
print('attention shape', attention.shape)
print('labels shape', labels.shape)

# build and load model


with strategy.scope():
    model = LongFormerMultitask(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR)
model.load_weights(MODEL_WEIGHT, by_name=True)
print(model.summary())
gc.collect()
train_labels = [
    labels[train_idx,],
    coarse_labels[train_idx,],
    binary_labels[train_idx,],
]
val_labels = [
    labels[val_idx,],
    coarse_labels[val_idx,],
    binary_labels[val_idx,],
]

print('start training...')
model.fit(x = [ids[train_idx,], attention[train_idx,]],
          y = train_labels,
          validation_data = ([ids[val_idx,], attention[val_idx,]],
                             val_labels),
          epochs = 5,
          batch_size = 2,
          verbose = 2)

# SAVE MODEL WEIGHTS
model.save_weights('saved_model1.h5')

**Make Prediction on Test Set, and Postprocessing**  
The following codes are mainly for test-time running and post-processing

In [110]:
# =====================================================================
def get_preds(dataset='train', verbose=True, text_ids=None, preds=None):
    target_map_rev = {0: 'Lead', 1: 'Position', 2: 'Evidence', 3: 'Claim', 4: 'Concluding Statement', 5: 'Counterclaim', 6: 'Rebuttal', 7: 'blank'}
    
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    all_predictions = list()
    for id_num in range(len(preds)):
#         if (id_num % 100 == 0) & (verbose): print(id_num, ', ', end = '')

        # read and tokenize txt
        n = text_ids[id_num]
        name = f'input/feedback-prize-2021/{dataset}/{n}.txt'
        txt = open(name, 'r').read()
        tokens = tokenizer.encode_plus(txt, max_length = MAX_LEN, padding = 'max_length', truncation = True, return_offsets_mapping = True)
        off = tokens['offset_mapping']
        
        # find the start of each word
        w = list()
        blank = True
        for i in range(len(txt)):
            if (txt[i] != ' ') & (txt[i] != '\n') & (blank == True):
                w.append(i)
                blank = False
            elif (txt[i] == ' ') | (txt[i] == '\n'):
                blank = True
        w.append(1e6)
        
        # create token_to_word map
        word_map = -1 * np.ones(MAX_LEN, dtype = 'int32')
        w_i = 0
        for i in range(len(off)):
            if off[i][1] == 0:
                continue
            while off[i][0] >= w[w_i + 1]: 
                w_i += 1
            word_map[i] = int(w_i)
        
        # retrieve the segments and class
        pred = preds[id_num,] / 2.0
        i = 0
        while i < MAX_LEN:
            prediction = list()
            start = pred[i]
            if start in target_map_rev.keys():
                prediction.append(word_map[i])
                i += 1
                if i >= MAX_LEN: 
                    break
                while pred[i] == start + 0.5:
                    if not word_map[i] in prediction: 
                        prediction.append(word_map[i])
                    i += 1
                    if i >= MAX_LEN: 
                        break
            else: 
                i += 1
            prediction = [x for x in prediction if x != -1]
            if len(prediction) > 4:
                all_predictions.append((n, target_map_rev[int(start)], ' '.join([str(x) for x in prediction])))

    # MAKE DATAFRAME
    df = pd.DataFrame(all_predictions)
    df.columns = ['id', 'class', 'predictionstring']
    return df

In [111]:
# # MODEL_NAME = "bert-base-cased"
# MODEL_NAME = "../input/feedbacksaved/BERT" # load from pretrained.
# MAX_LEN = 512
# MODEL_WEIGHT = '../input/feedbacksaved/BERT_entire.h5'

#MODEL_NAME = 'allenai/longformer-base-4096'
MODEL_NAME = 'input/LongFormer'
MAX_LEN = 1024
# MODEL_WEIGHT = '../input/feedbacksaved/LongFormer_entire.h5'
MODEL_WEIGHT = "saved_model2.h5"

# build and load model
with strategy.scope():
    model = build_model(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN)
model.load_weights(MODEL_WEIGHT)
print(model.summary())
print('Model Loading Complete.')

# load test data
test_ids, test_attention, test_IDS, test_labels = load_test_data(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, INDEX=val_idx[:5])
print('Test Data Loading Complete.')

# make prediction
test_pred = model.predict([test_ids, test_attention], batch_size=4, verbose=2)[0].argmax(axis=-1)
print('Prediction Complete.')

loading configuration file input/LongFormer\config.json
Model config LongformerConfig {
  "_name_or_path": "input/LongFormer",
  "architectures": [
    "LongformerModel"
  ],
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "sep_token_id": 2,
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

load

task weigts [1.0, 0.6, 0.4]
Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 1024)]       0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 1024)]       0           []                               
                                                                                                  
 tf_longformer_model_25 (TFLong  TFLongformerBaseMod  148659456  ['input_ids[0][0]',              
 formerModel)                   elOutputWithPooling               'attention_mask[0][0]']         
                                (last_hidden_state=                                               
                                (None, 1024, 768),             

Didn't find file input/LongFormer\added_tokens.json. We won't load it.
loading file input/LongFormer\vocab.json
loading file input/LongFormer\merges.txt
loading file input/LongFormer\tokenizer.json
loading file None
loading file input/LongFormer\special_tokens_map.json
loading file input/LongFormer\tokenizer_config.json


None
Model Loading Complete.
0
Test Data Loading Complete.


Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x0000024C42BBA820>
Traceback (most recent call last):
  File "C:\Users\SeleneXX\AppData\Local\Programs\Python\Python39\lib\weakref.py", line 371, in remove
    self = selfref()
KeyboardInterrupt: 


KeyboardInterrupt: 

In [112]:
# def get_preds(dataset='train', verbose=True, text_ids=None, preds=None):
#     # target_map_rev = {0: 'Lead', 1: 'Position', 2: 'Evidence', 3: 'Claim', 4: 'Concluding Statement', 5: 'Counterclaim', 6: 'Rebuttal', 7: 'blank'}
#     target_map_rev = {
#         0: 'Lead_b',
#         1: 'Lead_i',
#         2: 'Position_b',
#         3: 'Position_i',
#         4: 'Evidence_b',
#         5: 'Evidence_i',
#         6: 'Claim_b',
#         7: 'Claim_i',
#         8: 'Concluding Statement_b',
#         9: 'Concluding Statement_i',
#         10: 'Counterclaim_b',
#         11: 'Counterclaim_i',
#         12: 'Rebuttal_b',
#         13: 'Rebuttal_i',
#         14: 'other'
#     }
#
#     # construct tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
#
#     all_predictions = list()
#     for id_num in range(len(preds)):
# #         if (id_num % 100 == 0) & (verbose): print(id_num, ', ', end = '')
#
#         # read and tokenize txt
#         n = text_ids[id_num]
#         name = f'input/feedback-prize-2021/{dataset}/{n}.txt'
#         txt = open(name, 'r').read()
#         tokens = tokenizer.encode_plus(txt, max_length = MAX_LEN, padding = 'max_length', truncation = True, return_offsets_mapping = True)
#         off = tokens['offset_mapping']
#
#         # find the start of each word
#         w = list()
#         blank = True
#         for i in range(len(txt)):
#             if (txt[i] != ' ') & (txt[i] != '\n') & (blank == True):
#                 w.append(i)
#                 blank = False
#             elif (txt[i] == ' ') | (txt[i] == '\n'):
#                 blank = True
#         w.append(1e6)
#
#         # create token_to_word map
#         word_map = -1 * np.ones(MAX_LEN, dtype = 'int32')
#         w_i = 0
#         for i in range(len(off)):
#             if off[i][1] == 0:
#                 continue
#             while off[i][0] >= w[w_i + 1]:
#                 w_i += 1
#             word_map[i] = int(w_i)
#
#         # retrieve the segments and class
#         pred = preds[id_num,]
#         i = 0
#         while i < MAX_LEN:
#             prediction = list()
#             start = pred[i]
#             if start in target_map_rev.keys():
#                 prediction.append(word_map[i])
#                 i += 1
#                 if i >= MAX_LEN:
#                     break
#                 while pred[i] == start: # + 0.5:
#                     if not word_map[i] in prediction:
#                         prediction.append(word_map[i])
#                     i += 1
#                     if i >= MAX_LEN:
#                         break
#             else:
#                 i += 1
#             prediction = [x for x in prediction if x != -1]
#             if len(prediction) >= 1:
#                 all_predictions.append((n, target_map_rev[int(start)], ' '.join([str(x) for x in prediction])))
#
#     # MAKE DATAFRAME
#     df = pd.DataFrame(all_predictions)
#     df.columns = ['id', 'class', 'predictionstring']
#     return df

test_res_int = get_preds(dataset='train', verbose=False, text_ids=test_IDS, preds=test_pred)
labels = test_labels.argmax(axis=-1)
# map_clip = {'Lead':9, 'Position':5, 'Evidence':14, 'Claim':3, 'Concluding Statement':11, 'Counterclaim':6, 'Rebuttal':4}
# def threshold(df):
#     df = df.copy()
#     for key, value in map_clip.items():
#     # if df.loc[df['class']==key,'len'] < value 
#         index = df.loc[df['class']==key].query(f'len<{value}').index
#         df.drop(index, inplace = True)
#     return df

# test_res_int['len'] = test_res_int['predictionstring'].apply(lambda x:len(x.split()))
# test_res_int = threshold(test_res_int)

# quick check
print(test_res_int.shape)
test_res_int
result = get_preds(dataset='train', verbose=False, text_ids=test_IDS, preds=labels)
result

Didn't find file input/LongFormer\added_tokens.json. We won't load it.
loading file input/LongFormer\vocab.json
loading file input/LongFormer\merges.txt
loading file input/LongFormer\tokenizer.json
loading file None
loading file input/LongFormer\special_tokens_map.json
loading file input/LongFormer\tokenizer_config.json
Didn't find file input/LongFormer\added_tokens.json. We won't load it.
loading file input/LongFormer\vocab.json
loading file input/LongFormer\merges.txt
loading file input/LongFormer\tokenizer.json
loading file None
loading file input/LongFormer\special_tokens_map.json
loading file input/LongFormer\tokenizer_config.json


(41, 3)


Unnamed: 0,id,class,predictionstring
0,1DA010DC8392,Lead,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
1,1DA010DC8392,Position,71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
2,1DA010DC8392,Claim,88 89 90 91 92 93
3,1DA010DC8392,Claim,95 96 97 98 99 100
4,1DA010DC8392,Claim,100 101 102 103 104 105 106 107 108 109 110 11...
5,1DA010DC8392,Evidence,114 115 116 117 118 119 120 121 122 123 124 12...
6,1DA010DC8392,Claim,180 181 182 183 184 185 186 187 188 189 190 19...
7,1DA010DC8392,Evidence,194 195 196 197 198 199 200 201 202 203 204 20...
8,1DA010DC8392,Evidence,257 258 259 260 261 262 263 264 265 266 267 26...
9,1DA010DC8392,Claim,311 312 313 314 315 316 317 318 319


In [113]:
# write to file
test_res_int.to_csv('predict.csv',index=False)
result.to_csv('result.csv',index=False)