In [1]:
!pip install transformers
!pip install transformers[sentencepiece]
!pip install xformers
!pip install datasets
!pip install evaluate
!pip install huggingface_hub

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.9 MB/s[0m eta [36m0:00:0

In [2]:
from datasets import load_dataset  # Hugging Face library for working with datasets
from transformers import AutoTokenizer, AutoConfig  # resolve the tokenizer / config of a pretrained checkpoint automatically
from transformers import DataCollatorWithPadding  # pads tokenized inputs to a common length
from transformers import TFAutoModelForSequenceClassification  # model with a sequence-classification head
from tensorflow.keras.losses import SparseCategoricalCrossentropy  # loss function
from tensorflow.keras.optimizers.schedules import PolynomialDecay  # lowers the learning rate as training progresses
from tensorflow.keras.optimizers import Adam  # optimizer
import tensorflow as tf
import evaluate  # evaluation of results
import numpy as np
from transformers import PushToHubCallback

In [3]:
from huggingface_hub import notebook_login

# Shows the Hugging Face login widget in the notebook.
# NOTE(review): presumably required so the PushToHubCallback used during
# training can upload to the Hub -- confirm against the Hub setup.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Pretrained checkpoint that is fine-tuned in this notebook.
checkpoint = "bert-base-multilingual-cased"

# Support-call dataset, downloaded from the Hugging Face Hub via `datasets`.
raw_datasets = load_dataset(path="SIA86/TechnicalSupportCalls")

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
# Note kept from the original author: when switching to a GPT-style checkpoint,
# register a padding token first:
#   tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(example):
    """Tokenize the ``"text"`` field with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding argument is passed here.

    Args:
        example: Mapping with a ``"text"`` entry. The notebook applies this
            function through ``Dataset.map(..., batched=True)``.

    Returns:
        The value returned by ``tokenizer`` for the given text(s).
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

def _copy_label_to_labels(examples):
    """Return a ``labels`` column holding the same values as ``label``."""
    return {"labels": examples["label"]}


# Tokenize every split, then add a `labels` column that mirrors `label`.
# The original `label` column is kept as well; renaming it with
# `rename_column("label", "labels")` is the alternative if it is not needed.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).map(
    _copy_label_to_labels, batched=True
)



In [None]:
from collections import defaultdict

# Class names in id order, taken from the dataset's `label` feature.
label_names = raw_datasets["train"].features["label"].names

# id -> name and name -> id mappings that are handed to the model config.
# Plain dicts are used on purpose: with a defaultdict(int) an unknown key
# silently resolves to 0 and is inserted into the mapping (growing it), whereas
# a plain dict raises KeyError. Ids are integers rather than strings, so
# lookups with a predicted class index hit the intended entry.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}
label2id

In [6]:
# Display the splits, their columns and row counts after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 921
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 231
    })
})

In [None]:
# Spot-check the token ids produced for one training example (index 15).
tokenized_datasets['train'][15]['input_ids']

In [12]:
from keras.api._v2.keras import callbacks
# Collator that pads the tokenized sequences of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Columns fed to the model as inputs, and the batch size shared by both splits.
model_input_columns = ["attention_mask", "input_ids", "token_type_ids"]
batch_size = 8


def _to_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched ``tf.data.Dataset``.

    Args:
        split: Name of the split in ``tokenized_datasets`` ("train" / "test").
        shuffle: Whether the examples are shuffled.

    Returns:
        The dataset produced by ``Dataset.to_tf_dataset`` for that split, with
        the ``labels`` column as the target.
    """
    return tokenized_datasets[split].to_tf_dataset(
        columns=model_input_columns,
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=batch_size,
    )


# Training data is shuffled so the model does not see the examples in the same
# fixed (possibly label-ordered) sequence every epoch; the evaluation split
# keeps its order so results stay comparable between runs.
tf_train_dataset = _to_tf_dataset("train", shuffle=True)
tf_test_dataset = _to_tf_dataset("test", shuffle=False)

# Keras callback that pushes the model and tokenizer to the Hugging Face Hub,
# using the local directory "bert-cased-text-classification" and the "epoch"
# save strategy.
callback = PushToHubCallback(
    output_dir="bert-cased-text-classification",
    save_strategy="epoch",
    tokenizer=tokenizer,
)


num_epochs = 20

# Total number of training steps: batches per epoch times the number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

# Learning-rate schedule that goes from 5e-5 down to 0 over the whole run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0,
    decay_steps=num_train_steps,
)

# Adam driven by the decaying schedule instead of a constant learning rate.
opt = Adam(learning_rate=lr_scheduler)

# Load the checkpoint's config with the label mappings attached, then build the
# sequence-classification model from it.
config = AutoConfig.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

# The loss is set up with from_logits=True, i.e. it is fed raw logits.
loss = SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
model.fit(
    tf_train_dataset,
    validation_data=tf_test_dataset,
    epochs=num_epochs,
    callbacks=[callback],
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
/content/bert-cased-text-classification is already a clone of https://huggingface.co/SIA86/bert-cased-text-classification. Make sure you pull the latest changes with `repo.git_pull()`.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7d4675718070>

In [17]:
import getpass
import os

import requests

API_URL = "https://api-inference.huggingface.co/models/SIA86/bert-cased-text-classification"

# SECURITY: the access token must never be stored in the notebook. It is read
# from the HF_TOKEN environment variable, with an interactive prompt as the
# fallback. The token that used to be hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")
headers = {"Authorization": f"Bearer {hf_token}"}

# Request payload (the variable keeps its original name `output`).
output = {
    "inputs": "Не могу отправить письмо по электронной почте.",
}

# A timeout keeps the cell from hanging indefinitely if the API does not answer.
response = requests.post(API_URL, headers=headers, json=output, timeout=60)


