In [1]:
!pip install transformers datasets evaluate transformers[torch]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

data_path = "drive/MyDrive/diploma"
max_length = 512

class ModelNames:
    distilbert = "distilbert/distilbert-base-uncased"
    albert = "albert/albert-base-v2"

model_name = ModelNames.albert

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

tokenizer = AutoTokenizer.from_pretrained(model_name)
accuracy = evaluate.load("accuracy")

def prepare_dataset():
    X_train = pd.read_csv(f"{data_path}/X_train.csv")
    y_train = pd.read_csv(f"{data_path}/y_train.csv")
    X_test = pd.read_csv(f"{data_path}/X_test.csv")
    y_test = pd.read_csv(f"{data_path}/y_test.csv")

    train = X_train.copy()
    test = X_test.copy()

    train["text"] = train["title"] + "\n" + train["description"]
    test["text"] = test["title"] + "\n" + test["description"]

    train = train.drop(columns=["title", "description"])
    test = test.drop(columns=["title", "description"])

    train["label"] = y_train
    test["label"] = y_test

    train = Dataset.from_pandas(train)
    test = Dataset.from_pandas(test)

    train = train.map(preprocess_function, batched=True)
    test = test.map(preprocess_function, batched=True)

    # train = train.select(range(16))
    # test = test.select(range(16))

    return train, test

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


train, test = prepare_dataset()

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir=model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.458484,0.77663
2,0.555500,0.576822,0.720049
3,0.555500,0.641423,0.706519
4,0.335700,0.778295,0.732841
5,0.335700,0.913727,0.743419


TrainOutput(global_step=1465, training_loss=0.35466582002086444, metrics={'train_runtime': 2953.7519, 'train_samples_per_second': 7.936, 'train_steps_per_second': 0.496, 'total_flos': 537304009132800.0, 'train_loss': 0.35466582002086444, 'epoch': 5.0})

In [7]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1714439520.00b7989a9f0d.367.1:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DenysZakharkevych/albert-base-v2/commit/8ff9eaf6734ce45e4eac5a1d3543eee823f3aa9c', commit_message='End of training', commit_description='', oid='8ff9eaf6734ce45e4eac5a1d3543eee823f3aa9c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=model_name)
text = train[1]['text']
classifier(text)

[{'label': 'NEGATIVE', 'score': 0.5195567607879639}]