In [1]:
! pip install datasets
! pip install evaluate
! pip install seqeval
! pip install transformers[torch]
! pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 2.2 MB/s eta 0:00:01


Collecting safetensors>=0.3.1
  Downloading safetensors-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 34.6 MB/s eta 0:00:01
Installing collected packages: safetensors, accelerate
Successfully installed accelerate-0.28.0 safetensors-0.4.2


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import pandas as pd
from datasets import Dataset
from tqdm import trange, tqdm
import evaluate
import numpy as np
import torch
import random

In [2]:
# Use the first CUDA GPU when PyTorch reports one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
# Show which device string was selected.
print(device)

cuda:0


In [4]:
# Label inventory of the token classifier: a label's position in this list is its
# integer class id.
# NOTE: the ids are NOT laid out as "B-X immediately followed by I-X"
# (B-URL_PERSONAL is 3 while I-URL_PERSONAL is 6, and B-EMAIL / B-ID_NUM /
# B-USERNAME have no I- counterpart at all), so a B-/I- partner cannot be derived
# from an id by arithmetic.
label_names = [
    'O',
    'B-NAME_STUDENT',
    'I-NAME_STUDENT',
    'B-URL_PERSONAL',
    'B-EMAIL',
    'B-ID_NUM',
    'I-URL_PERSONAL',
    'B-USERNAME',
    'B-PHONE_NUM',
    'I-PHONE_NUM',
    'B-STREET_ADDRESS',
    'I-STREET_ADDRESS',
]

# Forward (id -> name) and reverse (name -> id) lookups.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [5]:
# Pretrained checkpoint shared by the tokenizer and the classification model.
model_name = "allenai/longformer-base-4096"

# `add_prefix_space=True` is passed because the tokenizer is later called with
# `is_split_into_words=True` on pre-split tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Token-classification head sized and named from the label maps, then moved to
# the selected device.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing LongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN

In [6]:
def _begin_to_inside_ids(id_to_label):
    """Map the id of every ``B-X`` label to the id of its ``I-X`` partner, where one exists."""
    name_to_id = {name: idx for idx, name in id_to_label.items()}
    pairs = {}
    for idx, name in id_to_label.items():
        if name.startswith("B-"):
            inside_id = name_to_id.get("I-" + name[2:])
            if inside_id is not None:
                pairs[idx] = inside_id
    return pairs


def align_labels_with_tokens(labels, word_ids, begin_to_inside=None, special_token_label=0):
    """Expand word-level label ids to one label id per sub-word token.

    Args:
        labels: label ids, one per original word.
        word_ids: for each produced token, the index of the word it came from,
            or ``None`` for tokens that belong to no word (special / padding).
        begin_to_inside: optional ``{B-id: I-id}`` mapping. When omitted it is
            derived from the notebook-level ``id2label`` by matching label names.
        special_token_label: label id given to tokens whose word id is ``None``.
            Defaults to 0 (the previous behaviour).

    Returns:
        A list with one label id per entry of ``word_ids``.

    The first token of a word keeps the word's label. Later tokens of the same
    word get the matching ``I-`` label when the word's label is a ``B-`` label
    that has one, and otherwise keep the word's label unchanged.

    The previous ``label % 2 == 1 -> label + 1`` rule only works when every
    ``B-X`` id is odd and immediately followed by ``I-X``. With this notebook's
    label order it turned B-URL_PERSONAL (3) into B-EMAIL (4), I-PHONE_NUM (9)
    into B-STREET_ADDRESS (10), and I-STREET_ADDRESS (11) into the non-existent
    id 12.
    """
    if begin_to_inside is None:
        begin_to_inside = _begin_to_inside_ids(id2label)

    new_labels = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token that does not belong to any word.
            new_labels.append(special_token_label)
        elif word_id != previous_word:
            # First token of a new word: keep the word's own label.
            new_labels.append(labels[word_id])
        else:
            # Continuation token of the same word: B-X becomes I-X if I-X exists.
            label = labels[word_id]
            new_labels.append(begin_to_inside.get(label, label))
        previous_word = word_id
    return new_labels

In [7]:
def tokenize_and_align(example):
    """Tokenize a batch of pre-split documents and attach per-token labels.

    ``example`` is indexed with the keys ``"tokens"`` (a list of word lists) and
    ``"tags"`` (the matching list of label-id lists). The words are encoded with
    the notebook-level ``tokenizer`` (truncated at 4096 tokens, ``padding=True``),
    and each document's labels are spread over its tokens with
    ``align_labels_with_tokens``.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=4096,
        padding=True,
    )
    # One aligned label list per document, in the same order as the batch.
    encoded["labels"] = [
        align_labels_with_tokens(doc_tags, encoded.word_ids(row))
        for row, doc_tags in enumerate(example["tags"])
    ]
    return encoded

In [8]:
def tag2num(tags):
    """Translate label strings into their integer ids via ``label2id``.

    Raises ``KeyError`` for a tag that is not a key of ``label2id``.
    """
    return list(map(label2id.__getitem__, tags))

In [9]:
# Build the train / eval datasets: one row per document, holding the document's
# token list and the matching list of label ids.

# Seed the shuffle so the 80/20 document split is the same on every run.
SEED = 42
random.seed(SEED)

file_name = 'train.csv'
# NOTE(review): read_csv's default NA parsing turns literal tokens such as "NA" or
# "null" into NaN, and dropna() then removes those rows - confirm this is intended.
df = pd.read_csv(file_name).dropna().drop(columns=['Unnamed: 0'])

# Split at document level so tokens of one document never straddle both splits.
doc_ids = list(df["doc_id"].unique())
random.shuffle(doc_ids)
size = len(doc_ids)*4//5
train_doc_ids = doc_ids[:size]
eval_doc_ids = doc_ids[size:]
length = 4096  # not used by the code in this cell; documents are kept whole here

# Group once instead of filtering the whole frame twice per document.
# groupby keeps the original row order inside each group, so token order is preserved.
tokens_by_doc = df.groupby("doc_id", sort=False)["token"].agg(list).to_dict()
labels_by_doc = df.groupby("doc_id", sort=False)["label"].agg(list).to_dict()


def _docs_to_frame(split_doc_ids):
    """Return a DataFrame with one (tokens, tags) row per document id, in the given order."""
    rows = [
        {"tokens": tokens_by_doc[doc_id], "tags": tag2num(labels_by_doc[doc_id])}
        for doc_id in tqdm(split_doc_ids)
    ]
    return pd.DataFrame(rows, columns=["tokens", "tags"])


total_df = _docs_to_frame(train_doc_ids)
# preserve_index=False keeps the pandas index out of the dataset, so there is no
# '__index_level_0__' column to strip afterwards.
train_dataset = Dataset.from_pandas(total_df, preserve_index=False)

total_eval_df = _docs_to_frame(eval_doc_ids)
eval_dataset = Dataset.from_pandas(total_eval_df, preserve_index=False)

100%|██████████| 2885/2885 [00:11<00:00, 245.25it/s]
100%|██████████| 722/722 [00:02<00:00, 243.44it/s]


In [10]:
# Tokenize both splits batch-wise and drop the raw columns once they are encoded.
# The datasets are reassigned in place and `tokenize_and_align` reads the
# 'tokens' / 'tags' columns removed here, so this cell is meant to run once on
# freshly built datasets.
_map_options = dict(batched=True, remove_columns=['tokens', 'tags'])
train_dataset = train_dataset.map(tokenize_and_align, **_map_options)
eval_dataset = eval_dataset.map(tokenize_and_align, **_map_options)

Map:   0%|          | 0/2885 [00:00<?, ? examples/s]

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

In [11]:
# Collator handed to the Trainer; built around the same tokenizer used for preprocessing.
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

In [12]:
# Load the seqeval metric.
# NOTE(review): `metric` is not referenced by any later cell visible in this notebook
# (the Trainer gets no compute_metrics; scoring is done with scikit-learn instead).
metric = evaluate.load("seqeval")

In [13]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    # schedule
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # optimisation
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Wire the model, the two tokenized splits and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run fine-tuning with the settings in `training_args` (3 epochs, evaluating and saving each epoch).
trainer.train()

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Run the trained model over the evaluation split and keep only the raw predictions.
# Presumably one score vector per token per document; the next cell takes an
# argmax over axis 1 of each document's entry - TODO confirm shape.
results = trainer.predict(eval_dataset).predictions

In [None]:
# Flatten predictions and gold labels of every evaluation document into two
# parallel lists (one entry per token position) for scikit-learn.

# Read the label column once: indexing `eval_dataset['labels']` inside the loop
# re-materialises the entire column on every iteration.
gold_label_rows = eval_dataset['labels']

correct_labels = []
predict_labels = []
for token_scores, gold_labels in zip(tqdm(results), gold_label_rows):
    # Highest-scoring class id for each token position of this document.
    predict_labels.extend(np.argmax(token_scores, axis = 1))
    correct_labels.extend(gold_labels)

In [None]:
from sklearn.metrics import f1_score, recall_score

# Token-level F1 then recall, first macro-averaged, then support-weighted.
for averaging in ("macro", "weighted"):
    print(f1_score(correct_labels, predict_labels, average=averaging))
    print(recall_score(correct_labels, predict_labels, average=averaging))

In [None]:
from collections import Counter

# Distinct class ids that occur among the predictions (printed) ...
print(Counter(predict_labels).keys())
# ... and among the gold labels (shown as the cell's output value).
Counter(correct_labels).keys()

In [None]:
# Persist the fine-tuned model through the Trainer. No path is passed here;
# presumably it is written to the Trainer's configured output_dir ("./results") - TODO confirm.
trainer.save_model()

In [None]:
import json

# Store both flattened label lists as JSON so the metrics can be recomputed
# without re-running prediction. Every value is cast to a plain int first.
for path, values in (
    ("./correct_labels", correct_labels),
    ("./predict_labels", predict_labels),
):
    with open(path, "w") as fp:
        json.dump([int(value) for value in values], fp)

In [None]:
import json
from sklearn.metrics import f1_score, recall_score


def _read_json(path):
    """Load and return the JSON document stored at ``path``."""
    with open(path, "r") as fp:
        return json.load(fp)


# Restore the flattened gold / predicted label ids written by the dump cell.
correct_labels = _read_json("./correct_labels")
predict_labels = _read_json("./predict_labels")

# Token-level F1 then recall, first macro-averaged, then support-weighted.
for averaging in ("macro", "weighted"):
    print(f1_score(correct_labels, predict_labels, average=averaging))
    print(recall_score(correct_labels, predict_labels, average=averaging))

In [None]:
from sklearn.metrics import confusion_matrix

# Label maps are rebuilt here so this cell also works right after reloading the
# label lists from disk, without the setup cells at the top.
label_names = ['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL',
       'B-EMAIL', 'B-ID_NUM', 'I-URL_PERSONAL', 'B-USERNAME',
       'B-PHONE_NUM', 'I-PHONE_NUM', 'B-STREET_ADDRESS',
       'I-STREET_ADDRESS']
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# Class ids that occur in the gold labels OR the predictions, in ascending order.
# - sorted(): iteration order of a set is not guaranteed, and the row/column
#   order of the matrix follows this list.
# - union with the predictions: confusion_matrix only counts samples whose gold and
#   predicted labels are both in `labels`, so predictions of a class absent from
#   the gold labels would otherwise vanish from the matrix.
train_labels = sorted(set(correct_labels) | set(predict_labels))

# Names of the matrix rows/columns, in matrix order. Kept under its own name so
# the full `label_names` list is not overwritten with a subset.
present_label_names = [id2label[label] for label in train_labels]
print(present_label_names)

confusion_matrix(correct_labels, predict_labels, labels = train_labels)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Display scikit-learn's multilabel confusion matrix for the flattened gold / predicted label ids.
multilabel_confusion_matrix(correct_labels, predict_labels)

In [None]:
import numpy as np
# Imported here as well so the cell does not depend on an earlier cell having run.
from sklearn.metrics import confusion_matrix

label_names = ['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL',
       'B-EMAIL', 'B-ID_NUM', 'I-URL_PERSONAL', 'B-USERNAME',
       'B-PHONE_NUM', 'I-PHONE_NUM', 'B-STREET_ADDRESS',
       'I-STREET_ADDRESS']
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# Class ids present in the gold labels or the predictions, ascending.
# Including predicted-only classes matters: confusion_matrix ignores samples
# whose predicted label is not listed, which would undercount false negatives
# (and therefore overstate recall) for the gold class of those samples.
train_labels = sorted(set(correct_labels) | set(predict_labels))

# Rows are gold classes, columns are predicted classes, both in `train_labels` order.
cm = confusion_matrix(correct_labels, predict_labels, labels = train_labels)

num_classes = cm.shape[0]

# Per-class counts for all classes at once.
true_positives = np.diag(cm)
false_positives = cm.sum(axis=0) - true_positives   # column total minus the diagonal
false_negatives = cm.sum(axis=1) - true_positives   # row total minus the diagonal

# The small epsilon keeps the divisions defined for classes with no
# predictions or no gold samples (their score comes out as 0).
eps = 1e-10
precisions = true_positives / (true_positives + false_positives + eps)
recalls = true_positives / (true_positives + false_negatives + eps)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + eps)

for class_id, f1, recall in zip(train_labels, f1_scores, recalls):
    print('----')
    print(f'label: {id2label[class_id]}')
    print(f'F1 score: {f1}')
    print(f'Recall score: {recall}')