In [2]:
!pip install datasets
!pip install transformers
!pip install evaluate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")

dataset['train'][0]

{'label': 4,
 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}

In [22]:
from transformers import AutoTokenizer
model_checkpoint = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_data = dataset.map(tokenize_function, batched = True)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [42]:
train_dataset = tokenized_data['train'].shuffle().select(range(1000)) #smaller amt of data since it takes a lot of processing power
test_dataset = tokenized_data['test'].shuffle().select(range(1000))

In [54]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 5)
TrainingArgs = TrainingArguments(
    output_dir='/content/drive/results',          # output directory
    do_predict= True,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    warmup_steps=15,                # number of warmup steps for learning rate
    save_steps=30,
    save_total_limit=10,
    load_best_model_at_end= True,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=30,
    eval_strategy = 'steps')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
! pip install -U accelerate
! pip install -U transformers

In [55]:
import numpy as np
import evaluate

metric = evaluate.load('accuracy')
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  pred = np.argmax(logits, axis = -1)
  return metric.compute(predictions = pred, references = labels)
trainer = Trainer(
    model = model,
    args = TrainingArgs,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics)

In [56]:
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
30,1.6966,1.64818,0.205
60,1.6111,1.549993,0.312
90,1.5634,1.425337,0.391
120,1.4389,1.286613,0.429
150,1.3507,1.229219,0.466
180,1.3072,1.22512,0.435
210,1.2346,1.349812,0.412
240,1.3884,1.169273,0.515
270,1.0748,1.225482,0.474
300,1.02,1.269117,0.475


TrainOutput(global_step=750, training_loss=1.0391238606770834, metrics={'train_runtime': 1355.7746, 'train_samples_per_second': 2.213, 'train_steps_per_second': 0.553, 'total_flos': 789354427392000.0, 'train_loss': 1.0391238606770834, 'epoch': 3.0})

In [57]:
print(test_dataset)
trainer.evaluate(test_dataset)

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})


{'eval_loss': 1.0462288856506348,
 'eval_accuracy': 0.565,
 'eval_runtime': 30.6513,
 'eval_samples_per_second': 32.625,
 'eval_steps_per_second': 16.313,
 'epoch': 3.0}

In [58]:
trainer.save_model("./my_model")

In [60]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [61]:
trainer.push_to_hub('Kirkos27/ModelFor_yelp_review_full')

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Kirkos27/results/commit/78ad6fd240068b0caab38fd0c35c6a23991a87e7', commit_message='Kirkos27/ModelFor_yelp_review_full', commit_description='', oid='78ad6fd240068b0caab38fd0c35c6a23991a87e7', pr_url=None, pr_revision=None, pr_num=None)