In [None]:
#!pip install -Uqq datasets seqeval s3fs

In [None]:
# Print the GPU status for this session (shell command run through IPython's `!`).
!nvidia-smi

In [None]:
import re
import json
from glob import glob
import itertools

import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer, pipeline, EarlyStoppingCallback
#from transformers import AutoTokenizer as Tokenizer, AutoModelForTokenClassification as Model
#from transformers import RobertaTokenizerFast as Tokenizer, RobertaForTokenClassification as Model
from transformers import DistilBertTokenizerFast as Tokenizer, DistilBertForTokenClassification as Model
import torch
from tqdm import tqdm, notebook
from sklearn.metrics import fbeta_score

In [None]:
# Number of JSON files, taken from the front of the globbed file list, used to build the training set.
train_samples = 10000
# Number of files taken immediately after the training slice for validation.
# A falsy value (0) makes the dataset cell set `val_dataset = None`.
eval_samples = 200

In [None]:
def clean_text(txt):
    """Normalise text for label matching.

    The input is converted with ``str()``, lower-cased, and every run of
    characters other than ASCII letters, digits and '.' is collapsed into a
    single space. Leading and trailing whitespace is removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9.]+', ' ', lowered)
    return collapsed.strip()

def prep_labels(path):
    """Load the label vocabulary used to tag the training text.

    Reads the ``cleaned_label`` column of the CSV at ``path``, strips
    surrounding whitespace, keeps the unique values and appends a fixed list
    of short dataset acronyms.

    Returns:
        pandas.Series named ``'label'`` holding all labels in sorted order.
    """
    extra_short = ['agid', 'anss', 'blsa', 'charybdis', 'cord 19', 'nces', 'c cap', 'bbs', 'ricord', 'niagads']
    frame = pd.read_csv(path)
    from_csv = frame['cleaned_label'].str.strip().unique().tolist()
    ordered = sorted([*from_csv, *extra_short])
    return pd.Series(ordered, name='label')


def prep_data(fns, labels):
    """Convert publication JSON files into word-level training examples.

    Each file is expected to hold a list of section dicts with a ``'text'``
    entry. Section text is normalised with ``clean_text``; every occurrence of
    a label string in it (plain substring match, so also inside longer words)
    marks the words it touches as positive.

    Args:
        fns: iterable of JSON file paths.
        labels: iterable of label strings (already cleaned).

    Returns:
        dict with two parallel lists:
          * ``'tokens'`` - lists of words, at most 512 per example;
          * ``'labels'`` - int numpy arrays of the same length, 1 for words
            that overlap a label occurrence and 0 otherwise.
        Only windows containing at least one positive word are kept.
    """
    window, stride = 512, 500  # consecutive windows overlap by 12 words

    # Labels are literal strings, not regular expressions: escape them so
    # characters such as '.' cannot act as regex operators, and compile once
    # instead of once per section.
    label_patterns = [re.compile(re.escape(label)) for label in labels]

    # JSON to WORDS & LABELS
    data = {'tokens': [], 'labels': []}
    for fn in tqdm(fns):
        with open(fn, 'r') as f:
            obj = json.load(f)
        for section in obj:
            raw_text = section.get('text')
            if not raw_text:
                # Missing text would otherwise pass through str(None) in
                # clean_text and become the word 'none'.
                continue
            text = clean_text(raw_text)
            if not text:
                continue
            # Character mask: '0' for every non-space character, spaces kept
            # so the mask splits into the same words as the text.
            text_labels = list(re.sub(r'[^ ]', '0', text))
            for pattern in label_patterns:
                for m in pattern.finditer(text):
                    start, end = m.span()
                    text_labels[start:end] = re.sub(r'[^ ]', '1', m.group())
            # A word is positive when any of its characters was marked.
            text_labels = ['1' in e for e in ''.join(text_labels).split()]
            text_words = text.split(' ')
            assert len(text_words) == len(text_labels)
            while len(text_words) > 0:
                labels_np = np.array(text_labels[:window]).astype(int)
                # ONLY ADD TEXT WITH AT LEAST ONE LABEL
                if labels_np.sum() > 0:
                    data['tokens'].append(text_words[:window])
                    data['labels'].append(labels_np)
                text_words = text_words[stride:]
                text_labels = text_labels[stride:]
    return data

def tokenize_and_align_labels(data, tokenizer, label_all_tokens=True):
    """Tokenize pre-split words and spread the word labels over the sub-tokens.

    Args:
        data: dict with ``'tokens'`` (list of word lists) and ``'labels'``
            (one label sequence per word list, indexable by word position).
        tokenizer: callable returning an encoding that offers
            ``word_ids(batch_index=...)`` and supports item assignment.
        label_all_tokens: when True every sub-token of a word receives the
            word's label; when False only the first sub-token does and the
            remaining ones get -100.

    Returns:
        The tokenizer output with an added ``'labels'`` entry. Positions whose
        word id is None (special tokens) are labelled -100 so that the loss
        ignores them.
    """
    encoded = tokenizer(data['tokens'], truncation=True, is_split_into_words=True, padding=True)

    aligned = []
    for row, word_labels in enumerate(data['labels']):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Special token: no source word.
                tag = -100
            elif word == last_word and not label_all_tokens:
                # Continuation piece of a word that is only labelled once.
                tag = -100
            else:
                tag = word_labels[word]
            row_labels.append(tag)
            last_word = word
        aligned.append(row_labels)

    encoded['labels'] = aligned
    return encoded

class CRDataset(torch.utils.data.Dataset):
    """Token-classification dataset built from publication JSON files.

    Construction loads the label vocabulary, converts the JSON files into
    labelled word windows and tokenizes them. Each item is a dict of tensors
    with ``'input_ids'`` and ``'labels'``, plus ``'attention_mask'`` when the
    tokenizer produced one.
    """

    def __init__(self, json_path, labels_path, tokenizer):
        """
        Args:
            json_path: iterable of JSON file paths (passed to ``prep_data``).
            labels_path: CSV path holding the ``cleaned_label`` column.
            tokenizer: tokenizer used by ``tokenize_and_align_labels``.
        """
        cleaned_labels = prep_labels(labels_path)
        data = prep_data(json_path, cleaned_labels)
        tokenized_data = tokenize_and_align_labels(data, tokenizer)
        self.encodings = tokenized_data['input_ids']
        self.labels = tokenized_data['labels']
        # The sequences are padded; keep the mask so the model can tell padding
        # from real tokens. Without it every position is treated as real input.
        try:
            self.attention_mask = tokenized_data['attention_mask']
        except KeyError:
            self.attention_mask = None

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors."""
        item = {
            'input_ids': torch.tensor(self.encodings[idx]),
            'labels': torch.tensor(self.labels[idx])
        }
        mask = getattr(self, 'attention_mask', None)
        if mask is not None:
            item['attention_mask'] = torch.tensor(mask[idx])
        return item

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

In [None]:
# Directory of the pretrained checkpoint (Kaggle input dataset) used for both tokenizer and model.
pretrained_path = '/kaggle/input/hfdistilbertbaseuncasedtokenclassification'
# NOTE(review): padding / truncation / max_length / return_tensors are normally arguments of the
# tokenizer *call*, and add_prefix_space looks like a leftover from the RoBERTa variant commented
# out in the imports - confirm whether any of them has an effect when passed to from_pretrained.
cr_tokenizer = Tokenizer.from_pretrained(pretrained_path, padding="max_length", truncation=True, max_length=512, return_tensors="pt", add_prefix_space=True)
# Two output classes: 0 = ordinary token, 1 = token belonging to a dataset label (see prep_data).
cr_model = Model.from_pretrained(pretrained_path, num_labels=2)

In [None]:
# All training publications. NOTE(review): glob() does not guarantee any ordering, so which files
# land in the train/validation slices (and at fixed indices such as fns[50]) depends on the filesystem.
fns = glob('/kaggle/input/coleridgeinitiative-show-us-the-data/train/*.json')
# First `train_samples` files form the training set.
train_fns = fns[:train_samples]
train_dataset = CRDataset(train_fns, '/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv', cr_tokenizer)
if eval_samples:
    # The next `eval_samples` files form the validation set (empty if the glob returned too few files).
    eval_fns = fns[train_samples:train_samples+eval_samples]
    val_dataset = CRDataset(eval_fns, '/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv', cr_tokenizer)
else:
    # No validation requested.
    val_dataset = None

In [None]:
# Train
# Step-wise evaluation, best-checkpoint reloading and early stopping all need a
# validation set. The dataset cell sets `val_dataset = None` when
# `eval_samples` is falsy, so switch those features off in that case instead
# of failing inside the Trainer.
has_eval = val_dataset is not None

training_args = TrainingArguments(
    #overwrite_output_dir=True,
    output_dir='/kaggle/tmp/results', # output directory
    num_train_epochs=3,               # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='/kaggle/tmp/logs',   # directory for storing logs
    logging_steps=50,
    report_to='none',                 # no external experiment tracker
    evaluation_strategy='steps' if has_eval else 'no',
    load_best_model_at_end=has_eval
)

trainer = Trainer(
    model=cr_model,                      # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    # Stop after 10 evaluations without improvement (only meaningful with a validation set).
    callbacks=[EarlyStoppingCallback(10)] if has_eval else []
)

trainer.train()

In [None]:
# Optional: persist the fine-tuned weights. The model instance in this notebook is `cr_model`
# (no `model` name is defined in the visible cells).
#cr_model.save_pretrained('./model')

In [None]:
# Token-classification pipeline around the fine-tuned model. Use GPU 0 when CUDA is
# available and fall back to CPU (-1) otherwise, so the cell also runs without a GPU.
classifier = pipeline('ner', model=cr_model, tokenizer=cr_tokenizer, device=0 if torch.cuda.is_available() else -1)

In [None]:
def chunked_iterable(iterable, size):
    """Yield consecutive tuples of up to ``size`` items from ``iterable``.

    The final tuple may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    chunk = tuple(itertools.islice(source, size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(source, size))

def classify_sample(fn):
    """Predict dataset labels for one publication file.

    Loads the JSON file ``fn`` (a list of section dicts), joins the non-empty
    ``'text'`` entries with single spaces, and runs ``label_text`` on the result.

    Returns:
        The predicted labels joined with '|'.
    """
    with open(fn, 'r') as f:
        sections = json.load(f)
    texts = [section['text'] for section in sections if section and section.get('text')]
    return '|'.join(label_text(' '.join(texts)))
    
def label_text(text, chunk_size=512, min_score=0.8):
    """Extract predicted dataset mentions from free text.

    The text is split on whitespace into chunks of ``chunk_size`` words and
    each chunk is run through the module-level ``classifier`` pipeline.
    Consecutive tokens tagged ``LABEL_1`` are merged into one mention.

    A token contributes its word only when it is not '.' and its score
    exceeds ``min_score``. Other ``LABEL_1`` tokens are skipped without
    ending the mention. A mention ends at the first token with another tag,
    at a gap of more than one character before the next ``LABEL_1`` token,
    or at the end of the chunk.

    Args:
        text: text to analyse.
        chunk_size: number of whitespace-separated words sent to the
            classifier per call.
        min_score: minimum token score (exclusive) for a token to be kept.

    Returns:
        Sorted list of unique mention strings. Mentions ending in 'nces' are
        dropped (heuristic kept from the original implementation).
    """
    labels = []
    span = []  # words of the mention currently being collected

    def close_span():
        # Join the collected tokens and glue word pieces ('##xx') back onto
        # the preceding token.
        if span:
            labels.append(' '.join(span).replace(' ##', ''))
            span.clear()

    for chunk in chunked_iterable(text.split(), chunk_size):
        results = classifier(' '.join(chunk))
        for n, r in enumerate(results):
            if r['entity'] != 'LABEL_1':
                # Any other tag ends the mention. Without this a pending span
                # would be merged into the next mention further down the text.
                close_span()
                continue
            if r['word'] != '.' and r['score'] > min_score:
                span.append(r['word'].replace(' .', ''))
            if n + 1 < len(results):
                nxt = results[n + 1]
                # Tokens belong together when they touch (word pieces) or are
                # separated by a single character (a space).
                if nxt['entity'] == 'LABEL_1' and nxt['start'] - r['end'] not in (0, 1):
                    close_span()
        # A mention that runs to the end of the chunk is emitted in full.
        close_span()

    labels = [l for l in labels if not l.endswith('nces')]
    # Sorted so the output is deterministic across runs.
    return sorted(set(labels))

In [None]:
# Paths of the two publications spot-checked below: index 50 lies in the training slice and,
# with the default sample counts (10000 / 200), index 10004 lies in the validation slice.
fns[50], fns[10004]

In [None]:
# Ground-truth labels for the publication classified in the next cell (fns[50]).
# The Id is taken from the file name rather than hard-coded, so the comparison stays
# valid whatever order glob() returned the files in.
t = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
t[t['Id'] == fns[50].rsplit('/', 1)[1].split('.')[0]]['cleaned_label'].unique().tolist()

In [None]:
# Model predictions for a publication from the training slice.
classify_sample(fns[50])

In [None]:
# Ground-truth labels for the publication classified in the next cell (fns[10004]);
# the Id is derived from the file name so it always matches that file.
t[t['Id'] == fns[10004].rsplit('/', 1)[1].split('.')[0]]['cleaned_label'].unique().tolist()

In [None]:
# Model predictions for a publication outside the training slice.
classify_sample(fns[10004])

Submissions are evaluated on a Jaccard-based FBeta score between predicted texts and ground truth texts, with Beta = 0.5 (a micro F0.5 score). Multiple predictions are delimited with a pipe (|) character in the submission file.

The following is Python code for calculating the Jaccard score for a single prediction string against a single ground truth string. Note that the overall score for a sample uses Jaccard to compare multiple ground truth and prediction strings that are pipe-delimited - this code does not handle that process or the final micro F-beta calculation.

In [None]:
def jaccard(str1, str2):
    """Token-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the result is
    ``|A & B| / |A | B|`` over the two token sets. Returns 0.0 when neither
    string contains a token (instead of dividing by zero).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 0.0
    return float(len(c)) / union_size


def score(y_true, y_pred, beta=0.5):
    """Jaccard-based F-beta score for one publication.

    Args:
        y_true: pipe-delimited ground-truth labels.
        y_pred: pipe-delimited predicted labels.
        beta: F-beta weight; 0.5 is the competition setting.

    Matching follows the rules described in the notebook text. Predictions
    are processed in alphabetical order. Each ground truth is matched with
    its highest-scoring remaining prediction, ties going to the earlier one.
    A match with Jaccard >= 0.5 counts as a true positive and consumes the
    prediction; otherwise the ground truth is a false negative. Predictions
    left unmatched are false positives.

    Returns:
        ``(1 + beta^2) * TP / ((1 + beta^2) * TP + beta^2 * FN + FP)`` as a
        float, or 0.0 when there is nothing to score. Note that this is the
        score of a single sample; the overall micro score pools TP/FP/FN over
        all samples before applying the formula.
    """
    truths = sorted(s for s in y_true.split('|') if s.strip())
    preds = sorted(s for s in y_pred.split('|') if s.strip())

    tp, fn = 0, 0
    for truth in truths:
        if not preds:
            fn += 1
            continue
        sims = [jaccard(truth, pred) for pred in preds]
        # max() keeps the first maximum, i.e. the alphabetically earliest tie.
        best = max(range(len(sims)), key=sims.__getitem__)
        if sims[best] >= 0.5:
            preds.pop(best)
            tp += 1
        else:
            fn += 1
    fp = len(preds)

    beta_sq = beta ** 2
    denominator = (1 + beta_sq) * tp + beta_sq * fn + fp
    if denominator == 0:
        return 0.0
    return (1 + beta_sq) * tp / denominator

For each publication's set of predictions, a token-based Jaccard score is calculated for each potential prediction / ground truth pair. The prediction with the highest score for a given ground truth is matched with that ground truth.

Predicted strings for each publication are sorted alphabetically and processed in that order. Any scoring ties are resolved on the basis of that sort.
Any matched predictions where the Jaccard score meets or exceeds the threshold of 0.5 are counted as true positives (TP), the remainder as false positives (FP).
Any unmatched predictions are counted as false positives (FP).
Any ground truths with no nearest predictions are counted as false negatives (FN).
All TP, FP and FN across all samples are used to calculate a final micro F0.5 score. (Note that a micro F score does precisely this, creating one pool of TP, FP and FN that is used to calculate a score for the entire set of predictions.)

In [None]:
# Run inference over every test publication and collect one submission row per file.
test_fns = glob('/kaggle/input/coleridgeinitiative-show-us-the-data/test/*.json')
#test_fns = glob('/kaggle/input/coleridgeinitiative-show-us-the-data/train/*.json')
test_sub = []
for fn in tqdm(test_fns):
    # Publication Id = file name without directory, cut at the first '.'.
    test_id = fn.rsplit('/', 1)[1].split('.')[0]
    # Pipe-delimited predicted labels for this publication.
    labels = classify_sample(fn)
    test_sub.append({'Id': test_id, 'PredictionString': labels})
test_df = pd.DataFrame(test_sub)
test_df

In [None]:
# Write the predictions (columns Id, PredictionString) to the working directory, without the index column.
test_df.to_csv('submission.csv', index=False)

In [None]:
# List the CSV files in the working directory to confirm the submission was written.
!ls *.csv