In [1]:
import os
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding

In [3]:
pip install --upgrade transformers



In [4]:
pip install accelerate -U



In [5]:
pip install transformers[torch]



In [6]:
pip install evaluate



In [7]:
pip -q install datasets

In [8]:
from datasets import load_dataset

dataset = load_dataset("SetFit/imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/34.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/33.7M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [84]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 25000
    })
})

In [85]:
dataset['train'][25]

{'text': "My girlfriend once brought around The Zombie Chronicles for us to watch as a joke. Little did we realize the joke was on her for paying £1 for it. While watching this film I started to come up with things I would rather be doing than watching The Zombie Chronicles. These included:<br /><br />1) Drinking bleach 2) Rubbing sand in my eyes 3) Writing a letter to Brad Sykes and Garrett Clancy 4) Re-enacting the American civil war 5) Tax returns 6) GCSE Maths 7) Sex with an old lady.<br /><br />Garrett Clancy, aka Sgt. Ben Draper wrote this? The guy couldn't even dig a hole properly. The best ting he did was kick a door down (the best part of the film). This was the worst film I have ever seen, and I've seen White Noise: The Light. Never has a film had so many mistakes in it. My girlfriend left it here, so now I live with the shame of owning this piece of crap.<br /><br />News just in: Owen Wilson watched this film and tried to kill himself. Fact.<br /><br />DO NOT WATCH",
 'label

In [86]:
dataset['train'][0].keys()

dict_keys(['text', 'label', 'label_text'])

In [87]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [88]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

In [89]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [90]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [91]:
tokenized_datasets['train'][0].keys()

dict_keys(['text', 'label', 'label_text', 'input_ids', 'attention_mask'])

In [92]:
to_remove = list(tokenized_datasets['train'][0].keys())

to_remove.remove('input_ids')
to_remove.remove('attention_mask')
to_remove.remove('label')

tokenized_datasets = tokenized_datasets.remove_columns(to_remove)

In [93]:
tokenized_datasets['train'].features

{'label': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [94]:
#tokenized_datasets['train'][0]

In [95]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
)

test_dataloader = DataLoader(
    tokenized_datasets["test"],shuffle=True, batch_size=16, collate_fn=data_collator
)

In [96]:
for batch in tqdm(train_dataloader):
    break
{k: v.shape for k, v in batch.items()}

  0%|          | 0/1563 [00:00<?, ?it/s]


{'input_ids': torch.Size([16, 512]),
 'attention_mask': torch.Size([16, 512]),
 'labels': torch.Size([16])}

In [97]:
from transformers import AutoModelForSequenceClassification

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, ignore_mismatched_sizes = True, id2label=id2label, label2id=label2id)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [98]:
train_data_subset = tokenized_datasets["train"].shuffle(seed=42).select(range(500))
test_data_subset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [99]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [100]:
import numpy as np
import evaluate

metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=predictions, references=labels)

In [142]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="best_model",
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon = 1e-08,
    disable_tqdm = False,
    do_eval = True,
    evaluation_strategy = "epoch",
    learning_rate = 2e-05,
    lr_scheduler_type = "linear",
    num_train_epochs = 5,
    optim = "adamw_hf",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    save_strategy = "epoch",
    seed = 42,
    weight_decay = 0.01,
    warmup_steps = 0,
    report_to = "none",
    save_steps = 32,
    eval_steps = 32,
    logging_steps = 32,
    load_best_model_at_end=True,
)

In [143]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data_subset,
    eval_dataset=test_data_subset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)

In [144]:
stats = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0432,0.724505,0.85
2,0.0144,0.837322,0.85
3,0.0085,0.85481,0.85
4,0.0114,1.045905,0.83
5,0.0056,0.999216,0.83


In [120]:
from pprint import pprint
pprint(trainer.evaluate())

{'epoch': 2.0,
 'eval_accuracy': 0.86,
 'eval_loss': 0.32624104619026184,
 'eval_runtime': 1.7853,
 'eval_samples_per_second': 56.014,
 'eval_steps_per_second': 3.921}


In [147]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="/content/best_model/best_model/checkpoint-160", tokenizer=tokenizer)

In [151]:
text_1 = "This is awesome film. I like it."

In [152]:
result = classifier(text_1)

In [153]:
print(result)

[{'label': 'POSITIVE', 'score': 0.9997853636741638}]
