In [1]:
# Notebook configuration.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Pretrained checkpoint name; in this notebook it is only referenced by the commented-out download code.
model_checkpoint = "xlnet-base-cased"
# Per-device batch size passed to TrainingArguments for both training and evaluation.
batch_size = 16

In [2]:
from datasets import load_dataset, load_metric, load_from_disk

In [3]:
# One-time download of the "conll2003" dataset and snapshot to disk (kept for reference):
# datasets = load_dataset("conll2003")

# datasets.save_to_disk(f'./data_tc.pt')
# Load the locally saved copy instead of downloading.
# NOTE(review): the save path above ('./data_tc.pt') differs from the load path below — confirm the snapshot was moved.
datasets = load_from_disk(f'./data/tc/data_tc.pt')

In [4]:
# Label names for the selected task, read from the training split's feature metadata.
label_list = datasets["train"].features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

To get a sense of what the data looks like, the following function will show some examples picked randomly in the dataset (automatically decoding the labels in passing).

In [5]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

In [6]:
from transformers import AutoTokenizer

# One-time download and local save of the fast tokenizer (kept for reference):
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# tokenizer.save_pretrained('./xlnet-base-cased-tokenizer-tc')
# Load the tokenizer from a local directory.
# NOTE(review): the save path above lacks the './tokenizers/' prefix used here — confirm the directory was moved.
tokenizer = AutoTokenizer.from_pretrained('./tokenizers/xlnet-base-cased-tokenizer-tc', use_fast=True)

In [7]:
import transformers
# The label alignment further down calls word_ids() on the tokenizer output;
# check up front that a fast tokenizer was loaded.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

Note that transformers are often pretrained with subword tokenizers, meaning that even if your inputs have been split into words already, each of those words could be split again by the tokenizer. Let's look at an example of that:

In [8]:
# Worked example (disabled): tokenize one training example and align its labels to the sub-word tokens.
# example = datasets["train"][4]
# tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# word_ids = tokenized_input.word_ids()
# aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
# When True, every sub-word token of a word receives the word's label; when False, only the
# first token does and the remaining ones are set to -100 (see tokenize_and_align_labels).
label_all_tokens = True

In [9]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and expand word labels to token labels.

    Args:
        examples: Batch mapping with a "tokens" column (one word list per
            sentence) and a f"{task}_tags" column (one label id per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        list per sentence, aligned with the produced tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, padding=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples[f"{task}_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Tokens without a source word (special tokens) get -100 so the
                # loss function ignores them.
                token_label = -100
            elif word == last_word and not label_all_tokens:
                # Continuation piece of a word while only first pieces are labelled.
                token_label = -100
            else:
                # First piece of a word, or any piece when label_all_tokens is set.
                token_label = word_labels[word]
            row_labels.append(token_label)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [10]:
# Apply tokenization and label alignment to the loaded dataset, handing examples to the function in batches.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Loading cached processed dataset at data/tc/data_tc.pt/train/cache-628866849c2668e4.arrow
Loading cached processed dataset at data/tc/data_tc.pt/validation/cache-999e0527f3b67c85.arrow
Loading cached processed dataset at data/tc/data_tc.pt/test/cache-94f068bcc6aca27b.arrow


## Fine-tuning the model

In [11]:
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# One-time creation and save of the untrained token-classification model (kept for reference):
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# torch.save(model, f'./model/model_tc_not_trained.pt')

# NOTE(review): torch.load on a whole-model file unpickles arbitrary objects — only load files you produced yourself.
model = torch.load(f'./model/model_tc_not_trained.pt')

In [12]:
# Training configuration for the baseline fine-tuning run: 3 epochs with evaluation_strategy "epoch".
# f"test-{task}" is the first positional argument (presumably the output directory — confirm
# against the installed transformers version).
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [13]:
from transformers import DataCollatorForTokenClassification

# Collator passed to every Trainer created in this notebook.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [14]:

""" seqeval metric. """

from seqeval.metrics import accuracy_score, classification_report

# Imported under an alias on purpose: a plain `import datasets` would rebind the
# notebook-level variable `datasets` (the dataset object loaded from disk earlier)
# to the library module, breaking any later `datasets.map(...)` / `datasets["train"]`.
import datasets as hf_datasets


_CITATION = """\
@inproceedings{ramshaw-marcus-1995-text,
    title = "Text Chunking using Transformation-Based Learning",
    author = "Ramshaw, Lance  and
      Marcus, Mitch",
    booktitle = "Third Workshop on Very Large Corpora",
    year = "1995",
    url = "https://www.aclweb.org/anthology/W95-0107",
}
@misc{seqeval,
  title={{seqeval}: A Python framework for sequence labeling evaluation},
  url={https://github.com/chakki-works/seqeval},
  note={Software available from https://github.com/chakki-works/seqeval},
  author={Hiroki Nakayama},
  year={2018},
}
"""

_DESCRIPTION = """\
seqeval is a Python framework for sequence labeling evaluation.
seqeval can evaluate the performance of chunking tasks such as named-entity recognition, part-of-speech tagging, semantic role labeling and so on.

This is well-tested by using the Perl script conlleval, which can be used for
measuring the performance of a system that has processed the CoNLL-2000 shared task data.

seqeval supports following formats:
IOB1
IOB2
IOE1
IOE2
IOBES

See the [README.md] file at https://github.com/chakki-works/seqeval for more information.
"""

_KWARGS_DESCRIPTION = """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
Returns:
    'scores': dict. Summary of the scores for overall and per type
        Overall:
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': F1 score, also known as balanced F-score or F-measure,
        Per type:
            'precision': precision,
            'recall': recall,
            'f1': F1 score, also known as balanced F-score or F-measure
Examples:

    >>> predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    >>> references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    >>> seqeval = datasets.load_metric("seqeval")
    >>> results = seqeval.compute(predictions=predictions, references=references)
    >>> print(list(results.keys()))
    ['MISC', 'PER', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']
    >>> print(results["overall_f1"])
    1.0
    >>> print(results["PER"]["f1"])
    1.0
"""


# Metric wrapper around seqeval's classification_report / accuracy_score.
# (Documented with a comment rather than a class docstring so the text assembled
# by add_start_docstrings stays exactly as before.)
@hf_datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Seqeval(hf_datasets.Metric):
    def _info(self):
        """Describe the metric: texts, URLs and the expected input features.

        Both "predictions" and "references" are declared as sequences of
        string labels.
        """
        return hf_datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/chakki-works/seqeval",
            inputs_description=_KWARGS_DESCRIPTION,
            features=hf_datasets.Features(
                {
                    "predictions": hf_datasets.Sequence(hf_datasets.Value("string", id="label"), id="sequence"),
                    "references": hf_datasets.Sequence(hf_datasets.Value("string", id="label"), id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/chakki-works/seqeval"],
            reference_urls=["https://github.com/chakki-works/seqeval"],
        )

    def _compute(self, predictions, references, suffix=False):
        """Compute per-type and overall scores with seqeval.

        Args:
            predictions: Predicted label sequences.
            references: Reference label sequences.
            suffix: Forwarded unchanged to seqeval's classification_report.

        Returns:
            dict with one entry per remaining report key (each a dict with
            "precision", "recall", "f1", "number") followed by
            "overall_precision", "overall_recall", "overall_f1" (taken from the
            report's "micro avg" row) and "overall_accuracy".
        """
        report = classification_report(y_true=references, y_pred=predictions, suffix=suffix, output_dict=True)
        # Drop the aggregate rows that are not reported, and keep the micro
        # average aside so only per-type rows remain in `report`.
        report.pop("macro avg")
        report.pop("weighted avg")
        overall_score = report.pop("micro avg")

        scores = {
            type_name: {
                "precision": score["precision"],
                "recall": score["recall"],
                "f1": score["f1-score"],
                "number": score["support"],
            }
            for type_name, score in report.items()
        }
        scores["overall_precision"] = overall_score["precision"]
        scores["overall_recall"] = overall_score["recall"]
        scores["overall_f1"] = overall_score["f1-score"]
        scores["overall_accuracy"] = accuracy_score(y_true=references, y_pred=predictions)

        return scores

In [15]:
# Instantiate the metric class defined in this notebook directly.
metric = Seqeval()

This metric takes lists of labels (one list per sentence) for the predictions and the references:

In [16]:
import numpy as np

def compute_metrics(p):
    """Convert raw model outputs into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2; ``labels`` holds label ids, with -100 marking positions
            to skip.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose reference label is not the ignore index.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [17]:
from models.CNN_test.layers import Linear
from torch.utils.data import DataLoader
import time

def copy_lin_layer(old_layer, in_f, out_f):
    """Create a project ``Linear`` layer that takes over ``old_layer``'s parameters.

    Args:
        old_layer: Layer whose ``weight``, ``bias``, ``in_features`` and
            ``out_features`` are assigned to the new layer.
        in_f: ``in_features`` used to construct the new layer.
        out_f: ``out_features`` used to construct the new layer.

    Returns:
        The new ``Linear`` instance. Its weight and bias are the same objects
        as ``old_layer``'s (assigned, not copied).
    """
    replacement = Linear(in_features=in_f, out_features=out_f, bias=True)

    # Same assignment order as before: parameters first, then the size attributes,
    # which end up holding old_layer's values rather than in_f / out_f.
    for attr in ("weight", "bias", "in_features", "out_features"):
        setattr(replacement, attr, getattr(old_layer, attr))

    return replacement


def set_quantize(model, quantize: bool = False, bitness: int = 4,
                 quantize_type: str = None,
                 trainable: bool = False, use_qloss: bool = False):
    """Forward quantization settings to both feed-forward layers of every block.

    For each element of ``model.transformer.layer`` this calls
    ``ff.layer_1.set_quantize(...)`` and then ``ff.layer_2.set_quantize(...)``
    with the five settings as positional arguments, in signature order.

    Args:
        model: Object exposing ``transformer.layer`` (an iterable of blocks).
        quantize: Passed through as the first argument.
        bitness: Passed through as the second argument.
        quantize_type: Passed through as the third argument.
        trainable: Passed through as the fourth argument.
        use_qloss: Passed through as the fifth argument.
    """
    settings = (quantize, bitness, quantize_type, trainable, use_qloss)
    for block in model.transformer.layer:
        for sublayer_name in ("layer_1", "layer_2"):
            getattr(block.ff, sublayer_name).set_quantize(*settings)


def get_i_qsin_loss(model):
    """Add up ``quant.q_i_sum`` of both feed-forward layers over all blocks.

    Args:
        model: Object exposing ``transformer.layer`` (an iterable of blocks,
            each with ``ff.layer_1`` and ``ff.layer_2``).

    Returns:
        The running total, starting from the integer 0 (so 0 is returned when
        there are no blocks). Presumably the input-side qsin penalty — confirm
        against the project's quantizer.
    """
    return sum(
        block.ff.layer_1.quant.q_i_sum + block.ff.layer_2.quant.q_i_sum
        for block in model.transformer.layer
    )


def get_w_qsin_loss(model):
    """Add up ``quant.q_w_sum`` of both feed-forward layers over all blocks.

    Args:
        model: Object exposing ``transformer.layer`` (an iterable of blocks,
            each with ``ff.layer_1`` and ``ff.layer_2``).

    Returns:
        The running total, starting from the integer 0 (so 0 is returned when
        there are no blocks). Presumably the weight-side qsin penalty — confirm
        against the project's quantizer.
    """
    return sum(
        block.ff.layer_1.quant.q_w_sum + block.ff.layer_2.quant.q_w_sum
        for block in model.transformer.layer
    )

In [18]:
# Replace both feed-forward projections of every transformer layer with the project's Linear
# class, carrying over the existing weight and bias via copy_lin_layer.
# NOTE(review): copy_lin_layer overwrites in_features/out_features with the old layer's values,
# so 768/3072 only size the parameters the new layer is constructed with before they are replaced.
for i in model.transformer.layer:
    i.ff.layer_1 = copy_lin_layer(i.ff.layer_1, 768, 3072)
    i.ff.layer_2 = copy_lin_layer(i.ff.layer_2, 3072, 768)

In [19]:
# Baseline fine-tuning run (no set_quantize call has been made on the model yet),
# trained on the train split and evaluated on the validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,0.2065,0.073167,0.907677,0.927323,0.917395,0.981184,16.2395,200.129
2,0.0525,0.05797,0.937997,0.947776,0.942861,0.986629,14.4942,224.227
3,0.0281,0.05688,0.940161,0.951549,0.945821,0.987608,14.475,224.525


TrainOutput(global_step=2634, training_loss=0.07682400901116651, metrics={'train_runtime': 725.7539, 'train_samples_per_second': 3.629, 'total_flos': 5106177648551250.0, 'epoch': 3.0})

We can now finetune our model by just calling the `train` method:

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [20]:
# Evaluate again on the eval_dataset given to the Trainer (the validation split).
trainer.evaluate()

{'eval_loss': 0.05687996745109558,
 'eval_precision': 0.9401608789484011,
 'eval_recall': 0.9515488482922955,
 'eval_f1': 0.9458205862034934,
 'eval_accuracy': 0.9876076917775174,
 'eval_runtime': 14.0366,
 'eval_samples_per_second': 231.537,
 'epoch': 3.0}

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [19]:
# Full metric output (including per-type entries) on the validation split, using the same
# -100 filtering as compute_metrics.
# NOTE(review): the saved output of this cell is "NameError: name 'trainer' is not defined",
# i.e. it was last executed in a kernel without a Trainer — re-run the notebook top to bottom.
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
# Pick the highest-scoring class index along axis 2.
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

NameError: name 'trainer' is not defined

Don't forget to [upload your model](https://huggingface.co/transformers/model_sharing.html) on the [🤗 Model Hub](https://huggingface.co/models). You can then use it only to generate results like the one shown in the first picture of this notebook!

In [22]:

# Persist the whole model object (not just a state_dict) after fine-tuning.
torch.save(model, f'./model/model_tc_no_stat.pt')


In [20]:

# Reload the fine-tuned model from disk.
# NOTE(review): torch.load unpickles arbitrary objects — only load files you produced yourself.
model = torch.load(f'./model/model_tc_no_stat.pt')

# Bit width used by the quantization cells that follow.
num_bits = 4

# NOTE(review): hard-coded to CUDA; this cell cannot run on a CPU-only machine.
device = torch.device('cuda')
model.to(device)

train_enc = tokenized_datasets['train']
# set_format is called on the very object stored in tokenized_datasets['train'],
# so the format change is visible through that name as well.
train_enc.set_format(type='torch', columns= ['attention_mask', 'input_ids', 'labels'])
# NOTE(review): the batch size is the literal 16 rather than the batch_size constant.
train_loader = DataLoader(train_enc, batch_size=16, shuffle=False)

# Put the feed-forward layers into "static_train" mode (presumably the pass that collects
# statistics for static quantization — confirm in the project's Linear implementation).
set_quantize(model, True, num_bits, "static_train")
model.eval()

# zip(range(1), ...) ends after one item, so exactly one batch is pushed through the model.
# NOTE(review): the forward pass is not wrapped in torch.no_grad() and its output is discarded.
for epoch, batch in zip(range(1), train_loader):
    print("!")
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    model(input_ids, attention_mask=attention_mask, labels=labels)


# Back to training mode and reset the quantization settings to set_quantize's defaults
# before saving the checkpoint that the later cells load.
model.train()
set_quantize(model)

torch.save(model, f'./model/model_tc_stat_{num_bits}.pt')

!


In [21]:
# Load the dataset from disk again and re-tokenize it, giving this section fresh
# `datasets` and `tokenized_datasets` objects.
datasets = load_from_disk(f'./data/tc/data_tc.pt')
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

def continue_processing(num_bits, quant_type):
    """Evaluate the saved ``model_tc_stat_{num_bits}`` checkpoint with quantization on.

    Loads the checkpoint from disk, enables quantization with the given bit
    width and mode on its feed-forward layers, and prints the metrics of one
    evaluation pass over the validation split. No training is performed.

    Args:
        num_bits: Bit width; also selects which checkpoint file is loaded.
        quant_type: Quantization mode string forwarded to ``set_quantize``.
    """
    print(f"\n{num_bits} bit, {quant_type}:\n")

    quantized_model = torch.load( f'./model/model_tc_stat_{num_bits}.pt')
    set_quantize(quantized_model, True, num_bits, quant_type)

    # The Trainer is only used as an evaluation harness here.
    trainer_kwargs = dict(
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    evaluator = Trainer(quantized_model, args, **trainer_kwargs)

    eval_metrics = evaluator.evaluate()
    print(eval_metrics)


# Evaluate the same calibrated checkpoint under both quantization modes, without any further training.
continue_processing(num_bits, "dynamic")
continue_processing(num_bits, "static")



Loading cached processed dataset at data/tc/data_tc.pt/train/cache-628866849c2668e4.arrow
Loading cached processed dataset at data/tc/data_tc.pt/validation/cache-999e0527f3b67c85.arrow
Loading cached processed dataset at data/tc/data_tc.pt/test/cache-94f068bcc6aca27b.arrow



4 bit, dynamic:



  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.790109634399414, 'eval_precision': 0.25, 'eval_recall': 0.0004964257347100874, 'eval_f1': 0.0009908838684106222, 'eval_accuracy': 0.79709146047281, 'eval_runtime': 18.0929, 'eval_samples_per_second': 179.628}

4 bit, static:



{'eval_loss': 2.2840611934661865, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.79696739954511, 'eval_runtime': 16.0287, 'eval_samples_per_second': 202.762}


In [22]:
# Rebind the global `args` for the quantization-aware runs: same settings as before except
# num_train_epochs=10. Every Trainer created after this cell picks up this object.
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
)


def qsin_run(quant_bit, input_loss_weight=0.0000001, weight_loss_weight=0.001):
    """Train the global ``model`` with trainable static quantization plus the qsin penalty.

    The function works on module-level state: it trains the global ``model``
    in place, tokenizes the global ``datasets`` and uses the global ``args``,
    ``data_collator``, ``tokenizer`` and ``compute_metrics``. It prints the
    input/weight qsin sums before and after training, the final evaluation
    metrics and the elapsed wall-clock time.

    Args:
        quant_bit: Bit width forwarded to ``set_quantize``.
        input_loss_weight: Coefficient applied to ``get_i_qsin_loss(model)``
            before it is added to the task loss. Defaults to the value that
            used to be hard-coded.
        weight_loss_weight: Coefficient applied to ``get_w_qsin_loss(model)``
            before it is added to the task loss. Defaults to the value that
            used to be hard-coded.

    Returns:
        None.
    """

    print(f"\n\nModel qsin {quant_bit} bit train:\n\n")
    model.train()

    set_quantize(model, True, quant_bit, "static", trainable=True, use_qloss=True)

    class qsinTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            """
            How the loss is computed by Trainer. By default, all models return the loss in the first element.

            This override adds the weighted input and weight qsin sums of ``model`` to that loss.
            """
            if self.label_smoother is not None and "labels" in inputs:
                labels = inputs.pop("labels")
            else:
                labels = None
            outputs = model(**inputs)

            if self.args.past_index >= 0:
                self._past = outputs[self.args.past_index]

            if labels is not None:
                loss = self.label_smoother(outputs, labels)
            else:
                # We don't use .loss here since the model may return tuples instead of ModelOutput.
                loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

            # Penalty terms are read from the layers after the forward pass above;
            # the coefficients come from qsin_run's keyword arguments.
            qsin_loss = get_i_qsin_loss(model) * input_loss_weight
            qsin_loss = qsin_loss + get_w_qsin_loss(model) * weight_loss_weight

            sum_loss = qsin_loss + loss

            return (sum_loss, outputs) if return_outputs else sum_loss

    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

    trainer = qsinTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    start_time = time.time()

    # Penalty sums before training.
    print(get_i_qsin_loss(model))
    print(get_w_qsin_loss(model))

    trainer.train()

    # Penalty sums after training.
    print(get_i_qsin_loss(model))
    print(get_w_qsin_loss(model))

    print(trainer.evaluate())

    print("--- %s seconds ---" % (time.time() - start_time))

# Run the penalty-regularized quantization-aware training at the configured bit width.
qsin_run(num_bits)

Loading cached processed dataset at data/tc/data_tc.pt/train/cache-628866849c2668e4.arrow
Loading cached processed dataset at data/tc/data_tc.pt/validation/cache-999e0527f3b67c85.arrow
Loading cached processed dataset at data/tc/data_tc.pt/test/cache-94f068bcc6aca27b.arrow




Model qsin 4 bit train:


0
0


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,1084.6802,587.390442,0.479539,0.314138,0.379604,0.876036,27.8606,116.652
2,396.1536,213.480545,0.753066,0.694897,0.722813,0.939706,27.946,116.296
3,113.8641,65.96492,0.818546,0.830818,0.824637,0.96059,28.1328,115.524
4,28.4381,19.50243,0.859502,0.873411,0.866401,0.971177,29.2155,111.242
5,14.5888,7.056528,0.864609,0.890191,0.877214,0.973672,29.6924,109.456
6,5.2987,4.096546,0.878715,0.909948,0.894059,0.976677,31.2192,104.103
7,3.5204,3.346557,0.898816,0.911934,0.905327,0.978469,28.8916,112.489
8,3.0057,3.030596,0.891327,0.91531,0.903159,0.978055,27.3651,118.764
9,2.86,2.83218,0.896224,0.9166,0.906298,0.97913,27.3977,118.623
10,2.7624,2.811432,0.89887,0.916005,0.907356,0.979047,32.9444,98.651


tensor(6814.3491, device='cuda:0')
tensor(2702.1340, device='cuda:0')


{'eval_loss': 2.811432123184204, 'eval_precision': 0.8988698363211224, 'eval_recall': 0.9160047656870532, 'eval_f1': 0.9073564122738002, 'eval_accuracy': 0.9790474877662141, 'eval_runtime': 28.2468, 'eval_samples_per_second': 115.057, 'epoch': 10.0}
--- 6643.0355043411255 seconds ---


In [23]:
# Comparison run: start again from the calibrated checkpoint and train with trainable static
# quantization, but with use_qloss=False and the stock Trainer (no added qsin penalty).
# NOTE(review): torch.load unpickles arbitrary objects — only load files you produced yourself.
model = torch.load( f'./model/model_tc_stat_{num_bits}.pt')
datasets = load_from_disk(f'./data/tc/data_tc.pt')
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

set_quantize(model, True, num_bits, "static", trainable=True, use_qloss=False)


trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Loading cached processed dataset at data/tc/data_tc.pt/train/cache-628866849c2668e4.arrow
Loading cached processed dataset at data/tc/data_tc.pt/validation/cache-999e0527f3b67c85.arrow
Loading cached processed dataset at data/tc/data_tc.pt/test/cache-94f068bcc6aca27b.arrow


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,0.457,0.176864,0.761418,0.799444,0.779968,0.949645,19.4999,166.667
2,0.1239,0.117535,0.862246,0.865071,0.863657,0.968461,15.6716,207.381
3,0.0766,0.117472,0.864109,0.883241,0.87357,0.971769,15.6425,207.767
4,0.0556,0.099873,0.872156,0.898133,0.884954,0.974113,15.5885,208.487
5,0.0428,0.099091,0.881405,0.906871,0.893956,0.976139,15.9159,204.199
6,0.036,0.104597,0.888092,0.909253,0.898548,0.97771,17.4768,185.961
7,0.0296,0.106678,0.896339,0.909154,0.902701,0.978482,23.2965,139.506
8,0.0234,0.10777,0.893768,0.915508,0.904507,0.978855,16.6215,195.53
9,0.0172,0.109883,0.898173,0.917792,0.907877,0.979833,16.9659,191.56
10,0.0158,0.115419,0.896278,0.91799,0.907004,0.979544,16.4045,198.117


TrainOutput(global_step=8780, training_loss=0.07535297745724202, metrics={'train_runtime': 2842.4731, 'train_samples_per_second': 3.089, 'total_flos': 1.70118190879137e+16, 'epoch': 10.0})