In [12]:
import gc
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import TFTrainer, TFTrainingArguments

In [13]:
# Root folder of the datasets, relative to the notebook's working directory.
DATA_DIR = '../data'

# The training file is gzip-compressed and holds one JSON record per line.
dataset_path = os.path.join(
    DATA_DIR,
    'computers_train',
    'computers_train_small.json.gz',
)
df = pd.read_json(
    dataset_path,
    lines=True,
    compression='gzip',
)

In [14]:
# Show the column labels of the loaded frame.
df.columns

Index(['id_left', 'title_left', 'description_left', 'brand_left', 'price_left',
       'specTableContent_left', 'keyValuePairs_left', 'category_left',
       'cluster_id_left', 'identifiers_left', 'id_right', 'title_right',
       'description_right', 'brand_right', 'price_right',
       'specTableContent_right', 'keyValuePairs_right', 'category_right',
       'cluster_id_right', 'identifiers_right', 'label', 'pair_id'],
      dtype='object')

In [15]:
# Build one input text per pair from the two titles, and take the label column
# as the target.
#
# Only the separator *between* the titles is written by hand. The tokenizer
# adds the leading [CLS] and the trailing [SEP] on its own (special tokens are
# added by default), so spelling them out here as well would put a duplicated
# [CLS] and a duplicated final [SEP] into every encoded sequence.
X = df['title_left'] + ' [SEP] ' + df['title_right']
y = df['label']

In [16]:
# Hold out 20% of the pairs for validation.
# - random_state pins the shuffle so the split (and every number derived from
#   it) is the same after a kernel restart.
# - stratify=y keeps the label ratio identical in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [17]:
# Load the pretrained tokenizer of the uncased DistilBERT checkpoint
# (fetched from the model hub on first use), with lower-casing switched on.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
# Encode the training and validation texts with the same tokenizer settings
# (truncation and padding both enabled); each split is encoded in its own call.
train_encodings, val_encodings = (
    tokenizer(texts.to_list(), truncation=True, padding=True)
    for texts in (X_train, X_val)
)

In [19]:
def _as_tf_dataset(encodings, labels):
    """Slice tokenizer output and its labels into a tf.data.Dataset of (features, label) pairs."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _as_tf_dataset(train_encodings, y_train)
val_dataset = _as_tf_dataset(val_encodings, y_val)

In [21]:
# Fine-tune DistilBERT as a two-class sequence classifier with TFTrainer.

# Release whatever an earlier execution of this cell left behind *before*
# building a new model. Without this, the previous `model`/`trainer` stay alive
# until the names are rebound further down, so old and new weights occupy the
# GPU at the same time. The recorded run of this cell failed with a
# ResourceExhaustedError while the embedding matrix was being initialised.
# NOTE(review): this cannot help when the GPU memory is held by another process
# or kernel -- check `nvidia-smi` if the OOM persists on a fresh kernel.
model = trainer = None
gc.collect()
tf.keras.backend.clear_session()

training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    # NOTE(review): if the whole run has fewer than 500 optimizer steps the
    # learning rate never leaves warm-up -- confirm against the training set size.
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# The model is instantiated inside the distribution-strategy scope provided by
# the training arguments.
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                                  num_labels=2,
                                                                  output_attentions=False,
                                                                  output_hidden_states=False)

trainer = TFTrainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

ResourceExhaustedError: OOM when allocating tensor with shape[30522,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:TruncatedNormal]