Se instala la librería transformers

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Se importan librerías necesarias

In [None]:
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

Se configuran ajustes generales como el dispositivo (CPU o GPU), el número de épocas, etc.

In [None]:
# Reproducibility settings: one seed for PyTorch's global RNG, plus the cuDNN
# determinism flag.
RANDOM_SEED = 123
torch.backends.cudnn.deterministic = True
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f78c754bc30>

In [None]:
# Number of training epochs.
NUM_EPOCHS = 1

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cpu


Se descarga el Dataset de Reseñas de películas de IMDB y se guarda en un *dataframe*

In [None]:
# Download the gzipped movie-review CSV and decompress it into the working directory.
url = ("https://github.com/rasbt/"
       "machine-learning-book/raw/"
       "main/ch08/movie_data.csv.gz")
filename = url.split("/")[-1]

# Fetch first and fail loudly on HTTP errors, so an error page is never written
# to disk and later mistaken for the archive. The timeout avoids hanging forever.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(filename, "wb") as f:
    f.write(r.content)

# Decompress the file that was just written (name derived from the URL).
with gzip.open(filename, 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
# Load the decompressed CSV into a DataFrame and preview its first three rows.
csv_path = 'movie_data.csv'
df = pd.read_csv(csv_path)
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


Se divide el Dataset en conjuntos de entrenamiento, validación y test, y se tokenizan.

In [None]:
# Contiguous row ranges: the first 5000 rows train, the next 5000 validate,
# and everything from row 10000 onward is used for testing.
train_rows = df.iloc[:5000]
valid_rows = df.iloc[5000:10000]
test_rows = df.iloc[10000:]

# Pull the review text and the sentiment label out of each split as arrays.
train_texts, train_labels = train_rows['review'].values, train_rows['sentiment'].values
valid_texts, valid_labels = valid_rows['review'].values, valid_rows['sentiment'].values
test_texts, test_labels = test_rows['review'].values, test_rows['sentiment'].values

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize each split in order (train, valid, test), with truncation and
# padding enabled for all three.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (train_texts, valid_texts, test_texts)
)

In [None]:
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to an indexable collection that
            holds one entry per example.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every feature in ``encodings`` is converted to a tensor under its own
        key, and the label is added under the key ``'labels'``.
        """
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap each split's encodings and labels in a dataset object.
train_dataset, valid_dataset, test_dataset = (
    IMDbDataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

# Batches of 16 for every split; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=16, shuffle=False)

Se carga el modelo preentrenado "distilbert-base-uncased"

In [None]:
from transformers import Trainer, TrainingArguments

# Load the pretrained DistilBERT checkpoint with a sequence-classification head,
# move it to the selected device and put it in training mode.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

# Explicit Adam optimizer handed to the Trainer below.
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

training_args = TrainingArguments(
    output_dir='./results',
    # Take the epoch count from the configuration cell so both stay in sync.
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # (optimizer, learning-rate scheduler); no scheduler is supplied here.
    optimizers=(optim, None),
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.22.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy from evaluation predictions.

    Accuracy is calculated directly with NumPy, so no separate metrics
    package has to be installed or loaded.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``. The logits
            are a NumPy array (not a PyTorch tensor) whose last axis holds the
            per-class scores.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}
# Trainer that evaluates on the test split and reports the metrics returned by
# compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    optimizers=(optim, None),  # (optimizer, learning-rate scheduler)
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Se entrena el modelo

In [None]:
# Time the fine-tuning run and report the elapsed wall-clock minutes.
start_time = time.time()
train_result = trainer.train()
print(f'Total training time: {(time.time() - start_time) / 60:.2f} min')
# Keep the training summary as the cell's displayed value.
train_result

***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 313


Se evalúa el modelo

In [None]:
# Run evaluation through the Trainer and print what it returns.
eval_results = trainer.evaluate()
print(eval_results)

Se guarda el modelo

In [None]:
# Save only the model's state dict (not the whole model object) to disk.
model_weights = model.state_dict()
torch.save(model_weights, 'best_model_state.bin')

NameError: ignored

In [None]:
# Encode a single example review as PyTorch tensors for the model.
review_text = "good"
encoded_review = tokenizer(
    review_text,
    max_length=200,
    add_special_tokens=True,
    return_token_type_ids=False,
    # Pad up to max_length and cut anything longer, so the encoded sequence is
    # always exactly max_length tokens (replaces the deprecated pad_to_max_length).
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [None]:
# Label names indexed by predicted class id.
class_names = ['negative', 'positive']
input_ids = encoded_review['input_ids'].to(DEVICE)
attention_mask = encoded_review['attention_mask'].to(DEVICE)

# Inference only: switch to eval mode so dropout is disabled, and skip autograd
# bookkeeping because no gradients are needed here.
model.eval()
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)

print(output)
# The predicted class is the index of the largest logit for the single input row.
prediction = output['logits'].argmax(dim=1)
print(prediction)
print(f'Review text: {review_text}')
# prediction is a one-element tensor; convert it to a plain int before indexing the list.
print(f'Sentiment  : {class_names[prediction.item()]}')

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1657, -0.3010]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([0])
Review text: good
Sentiment  : negative
