In [1]:
import numpy as np
import pandas as pd
import evaluate
from sklearn.metrics import classification_report
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

In [2]:
weights_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(weights_name)

In [3]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss; CP set the weights here:
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [4]:
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

In [5]:
df = pd.read_csv("../data/aann_classifier_data.csv")
df = df.sample(frac=1.0, random_state=1234)

In [6]:
full_dist = df.label.value_counts().to_frame()
full_dist['per'] = full_dist / full_dist.sum()
full_dist

Unnamed: 0,label,per
0,3559,0.777753
1,1017,0.222247


In [7]:
train_count = int(df.shape[0] * 0.80)

train_df = df.iloc[: train_count]
test_df = df.iloc[train_count: ]

In [8]:
test_dist = test_df.label.value_counts().to_frame()
test_dist['per'] = test_dist / test_dist.sum()
test_dist

Unnamed: 0,label,per
0,710,0.775109
1,206,0.224891


In [9]:
train = Dataset.from_dict(train_df.to_dict(orient='list'))
dataset = DatasetDict({"train": train})

In [10]:
dataset_tokenized = dataset.map(tokenize_function, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(weights_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
trainer = CustomTrainer(model=model, train_dataset=dataset_tokenized['train'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mkmisra[0m ([33mperplexity_probe[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
500,0.1263
1000,0.0192


TrainOutput(global_step=1374, training_loss=0.05623289383134467, metrics={'train_runtime': 189.5661, 'train_samples_per_second': 57.922, 'train_steps_per_second': 7.248, 'total_flos': 2888959387852800.0, 'train_loss': 0.05623289383134467, 'epoch': 3.0})

In [15]:
def model_predict(examples, tokenizer, model):
    enc = tokenizer.batch_encode_plus(
        examples,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt')
    logits = model(**enc).logits
    preds = logits.softmax(-1).argmax(-1).numpy()
    return preds

In [16]:
model.to('cpu')
model_predict(["The family spent five beautiful yet exhausting fortnights there."], tokenizer, model)

array([0])

In [17]:
preds = []
test_exs = list(test_df.sentence.values)
batch_size = 20
for i in range(0, len(test_exs), batch_size):
    preds += list(model_predict(test_exs[i: i+batch_size], tokenizer, model))

In [18]:
print(classification_report(test_df.label.values, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       710
           1       1.00      0.99      0.99       206

    accuracy                           1.00       916
   macro avg       1.00      0.99      0.99       916
weighted avg       1.00      1.00      1.00       916



In [19]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
trainer.save_model("../models/aann-detector")
trainer.tokenizer = tokenizer
trainer.hub_model_id = "kanishka/aann-detector"
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1699771293.greaterark.2280195.0:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

events.out.tfevents.1699769235.greaterark.2275542.0:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

'https://huggingface.co/kanishka/aann-detector/tree/main/'

In [43]:
model_predict(["He said, 'they spent a mere five pounds there'."], tokenizer, model)

array([1])

In [70]:
model = AutoModelForSequenceClassification.from_pretrained("../models/aann-detector/")
tokenizer.save_pretrained("../models/aann-detector/")
tokenizer = AutoTokenizer.from_pretrained("../models/aann-detector/")

In [71]:
model.push_to_hub("kanishka/aann-detector")
tokenizer.push_to_hub("kanishka/aann-detector")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kanishka/aann-detector/commit/80553c083c634d63bd79e3ec2d5b26ab1b57d525', commit_message='Upload tokenizer', commit_description='', oid='80553c083c634d63bd79e3ec2d5b26ab1b57d525', pr_url=None, pr_revision=None, pr_num=None)

In [67]:
trainer.push_to_hub("push model to hub")

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-65507256-6b920635234a5f4325c3ba4c;ffa65f07-7bc7-4a0c-8ed1-bdc61dcd8d55)

Repository Not Found for url: https://huggingface.co/api/models/kanishka/aann-detector/preupload/main.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Note: Creating a commit assumes that the repo already exists on the Huggingface Hub. Please use `create_repo` if it's not the case.

In [58]:
trainer.hub_model_id

'kanishka/aann-detector'

In [63]:
trainer.tokenizer = tokenizer

In [None]:
trainer