## A PyTorch NER Baseline [LB: 0.545]
### Parts of this kernel are adapted from zzy's [Pytorch NER infer](https://www.kaggle.com/zzy990106/pytorch-ner-infer)

### Training: [Feedback Prize Train](https://www.kaggle.com/hjhgjghhg/feedback-prize-train/edit/run/82681035)

### Inference: this kernel

### Upvote if you find this kernel useful!🤗

In [None]:
import random
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler

## Config

In [None]:
# Run settings. In the code visible in this notebook only 'seed', 'max_len',
# 'valid_bs' and 'num_workers' are read; the remaining keys look like
# training-time options (presumably kept in sync with the training kernel).
config = dict(
    fold_num=5,
    seed=1234,
    # Other checkpoints previously tried here: 'roberta-base',
    # '../input/robertalarge', 'allenai/longformer-large-4096'.
    model='allenai/longformer-base-4096',
    max_len=1024,  # every batch is padded/truncated to this many tokens
    epochs=5,
    train_bs=6,
    valid_bs=6,  # batch size of the test DataLoader
    lr=3e-5,
    num_workers=0,
    weight_decay=1e-2,
    num_warmup_steps=1000,
    lr_scheduler_type='linear',
    gradient_accumulation_steps=1,
)

In [None]:
# Class-id -> tag table: id 0 is 'O' (outside any segment), followed by a
# B-(egin)/I-(nside) pair for each discourse type, in this fixed order.
labels = ['O'] + [
    f'{prefix}-{kind}'
    for kind in ('Lead', 'Position', 'Claim', 'Counterclaim', 'Rebuttal',
                 'Evidence', 'Concluding Statement')
    for prefix in ('B', 'I')
]
# Discourse type -> class id of its 'B-' tag.
labels2index = {
    tag[2:]: class_id
    for class_id, tag in enumerate(labels)
    if tag.startswith('B-')
}


## Set Seed

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random``, NumPy and PyTorch (CPU, current CUDA device and all CUDA
    devices), and puts cuDNN in deterministic mode with benchmarking off.

    Args:
        seed: Integer seed handed unchanged to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Fix all RNG seeds before any model or data loader is created.
set_seed(config['seed'])

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Tokenizer saved next to a fine-tuned checkpoint (previously loaded straight
# from config['model']). add_prefix_space=True is passed because collate_fn
# feeds pre-split word lists (is_split_into_words=True).
# NOTE(review): this path is under '../input/test-notebook/' while the model
# is loaded from '../input/feedback-prize-train/roberta_trained/' -- confirm
# both directories hold the same tokenizer/vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    '../input/test-notebook/roberta_trained',
    add_prefix_space=True,
)

In [None]:
class MyDataset(Dataset):
    """Dataset that serves rows of a DataFrame as plain dicts.

    Every item carries the row's ``text`` value under ``'text'``. When
    ``phase`` is ``'Train'`` the row's ``tagging`` value is added under
    ``'label'``; for any other phase only the text is returned.
    """

    def __init__(self, df, phase='Train'):
        """Keep a reference to ``df`` and remember the ``phase``."""
        self.df = df
        self.phase = phase

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample dict for the row at position ``idx``."""
        sample = {'text': self.df['text'].values[idx]}
        if self.phase == 'Train':
            sample['label'] = self.df['tagging'].values[idx]
        return sample


def collate_fn(data):
    """Tokenize a batch of samples into fixed-length model inputs.

    Args:
        data: List of sample dicts as produced by ``MyDataset``; each has a
            ``'text'`` entry (a list of words, since the tokenizer is called
            with ``is_split_into_words=True``) and optionally a ``'label'``.

    Returns:
        The tokenizer output as PyTorch tensors, padded/truncated to
        ``config['max_len']``. If the first sample has a ``'label'`` key, a
        ``'labels'`` LongTensor built from all samples' labels is added.
    """
    texts = [item['text'] for item in data]
    tokenized_inputs = tokenizer(
        texts,
        max_length=config['max_len'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt',
    )

    # Only the first sample is inspected: a batch is assumed to be uniformly
    # labelled or uniformly unlabelled.
    if 'label' in data[0]:
        # NOTE(review): torch.LongTensor needs every label list to have the
        # same length -- presumably labels are already padded to max_len by
        # the training pipeline; confirm.
        tokenized_inputs['labels'] = torch.LongTensor([item['label'] for item in data])

    return tokenized_inputs

## Load Model

In [None]:
# Load the fine-tuned token-classification checkpoint (15 output classes, one
# per entry of `labels`) directly from its saved directory and move it to the
# selected device. The earlier approach -- building the model from
# config['model'] and then loading pytorch_model.bin from the same directory
# via load_state_dict -- is no longer used.
model = AutoModelForTokenClassification.from_pretrained(
    '../input/feedback-prize-train/roberta_trained/',
    num_labels=15,
)
model = model.to(device)

## Load Test Data

In [None]:
# The sample submission is read for its column layout, which is reused when
# the final submission frame is built.
test_df = pd.read_csv('../input/feedback-prize-2021/sample_submission.csv')
# Notebook display of the first five rows.
test_df.head()

In [None]:
# Read every test essay into a DataFrame with columns:
#   id   -- file name without its '.txt' extension
#   text -- the essay as a list of whitespace-separated words
test_dir = '../input/feedback-prize-2021/test'
test_names, test_words = [], []
# Sorted so the row order (and therefore the prediction order) is the same on
# every run; os.listdir returns entries in arbitrary order.
for file_name in tqdm(sorted(os.listdir(test_dir))):
    essay_id, extension = os.path.splitext(file_name)
    if extension != '.txt':
        # Skip anything that is not an essay file (e.g. stray directories).
        continue
    with open(os.path.join(test_dir, file_name), 'r', encoding='utf-8') as essay_file:
        # str.split() with no argument splits on any run of whitespace,
        # newlines included, so no per-line newline replacement is needed.
        words = essay_file.read().split()
    test_names.append(essay_id)
    test_words.append(words)
test_texts = pd.DataFrame({'id': test_names, 'text': test_words})
test_texts

In [None]:
# Wrap the essays in a dataset without labels and batch them in file order
# (shuffle=False keeps predictions aligned with the rows of test_texts).
test_dataset = MyDataset(test_texts, phase='Test')
test_iter = DataLoader(
    test_dataset,
    batch_size=config['valid_bs'],
    shuffle=False,
    num_workers=config['num_workers'],
    collate_fn=collate_fn,
)

## Predict

In [None]:
# Run the model over the test batches and keep, for every token position of
# every essay, the id of the highest-scoring class.
model.eval()
y_pred = []

with torch.no_grad():
    for batch in tqdm(test_iter, total=len(test_iter), position=0, leave=True):
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}

        logits = model(
            input_ids=on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
        ).logits

        # argmax over the class axis, then one numpy row per essay is appended.
        best_class_ids = logits.argmax(-1).cpu().numpy()
        y_pred.extend(best_class_ids)

y_pred = np.array(y_pred)

In [None]:
# Notebook display: the first 200 predicted class ids of the first essay.
y_pred[0][:200]

In [None]:
# Turn per-position class ids into submission rows
# (essay id, discourse type, space-separated position indices).
#
# NOTE(review): the indices written out are positions in the model output
# (after dropping the first position), not indices into the whitespace-split
# word list. If the tokenizer splits words into several sub-tokens the two
# differ -- confirm against how labels were aligned during training.
# NOTE(review): collate_fn pads every essay to config['max_len'], so positions
# past the end of a short essay are scanned here as well -- confirm the model
# predicts 'O' there or mask them out.
final_preds = []

for essay_id, position_ids in tqdm(zip(test_texts.id.values, y_pred), total=len(test_texts)):
    # Drop the first and last positions (presumably the special start/end
    # tokens) and strip the B-/I- prefix so that consecutive positions of the
    # same discourse type compare equal.
    pred = [labels[class_id].replace('B-', '').replace('I-', '')
            for class_id in position_ids[1:-1]]

    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O':
            # Move to the next position and re-examine it. Without the
            # `continue`, the position following every 'O' was never looked
            # at as a span start and got skipped.
            j += 1
            continue

        # Extend the span while the discourse type stays the same.
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1

        # Keep only spans longer than 10 positions.
        if end - j > 10:
            final_preds.append((essay_id, cls, ' '.join(map(str, range(j, end)))))

        j = end

# Notebook display of the first row; slicing avoids an IndexError when no
# span was predicted at all.
final_preds[:1]

In [None]:
# Build the submission with the same columns as the sample submission.
# Passing the columns to the constructor (instead of assigning sub.columns
# afterwards) also works when final_preds is empty, in which case a
# header-only CSV is written rather than raising a length-mismatch error.
sub = pd.DataFrame(final_preds, columns=test_df.columns)
sub.to_csv('submission.csv', index=False)

In [None]:
# Notebook display of the submission table.
sub