In [None]:
!pip install transformers
!pip install datasets
!pip install pandas
!pip install numpy
!pip install tensorflow-gpu

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [13]:
PATH_TO_LABEL_TOP_100 = '/content/drive/MyDrive/Colab Notebooks/label.csv'
PATH_TO_TEXT_NO_STOPWORDS = '/content/drive/MyDrive/Colab Notebooks/text_light_clean.csv'

nrows = None
idx = None

In [14]:
import pandas as pd
import numpy as np
import random

df_labels = pd.read_csv(PATH_TO_LABEL_TOP_100, dtype={'id': int, 'label': str, 'label_encoded': int}, sep=',')
df_texts = pd.read_csv(PATH_TO_TEXT_NO_STOPWORDS, sep=',')

df = pd.merge(df_texts, df_labels, on='id', how='left')
df = df.fillna('')

df = df[df['text'] != '']
df = df[df['label'] != '']

arr_texts = df.text.to_numpy()
arr_labels = df.label_encoded.to_numpy()

arr_labels_encoded_unique = np.unique(arr_labels)

if nrows is not None:
    idx = random.sample(range(0, len(arr_labels)), nrows)

if idx is not None:
    arr_labels = [arr_labels[i] for i in idx]
    arr_texts = [arr_texts[i] for i in idx]

arr_y = np.asarray(arr_labels)
arr_x = np.asarray(arr_texts)

In [15]:
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(arr_texts)
rng = np.random.RandomState(seed)
rng.shuffle(arr_y)

# Extract a training & validation split
validation_split = .1
num_validation_samples = int(validation_split * len(arr_x))
arr_x_train = arr_x[:-num_validation_samples]
arr_x_test = arr_x[-num_validation_samples:]
arr_y_train = arr_y[:-num_validation_samples]
arr_y_test = arr_y[-num_validation_samples:]

In [20]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, AutoConfig, BertModel, RobertaForSequenceClassification, TFRobertaForSequenceClassification, TFXLMRobertaForSequenceClassification
from transformers import create_optimizer

max_len = 512

# config = AutoConfig.from_pretrained("distilroberta-base")
config = AutoConfig.from_pretrained("xlm-roberta-base")
config.hidden_dropout_prob = 0.15
config.attention_probs_dropout_prob = 0.15
config.num_labels = 100

model = TFXLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", config=config)
# model = TFAutoModelForSequenceClassification.from_pretrained("distilroberta-base", config=config)
# model = TFXLMRobertaForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Saved/")

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

x_train = [e[:max_len] if isinstance(e, str) else '' for e in arr_x_train]
x_test = [e[:max_len] if isinstance(e, str) else '' for e in arr_x_test]

df = pd.DataFrame(np.stack((x_train, arr_y_train), axis=1), columns=['text', 'label'])
df.to_csv('/content/train.csv', index=False)

df = pd.DataFrame(np.stack((x_test, arr_y_test), axis=1), columns=['text', 'label'])
df.to_csv('/content/test.csv', index=False)

data_files = {"train": "/content/train.csv", "test": "/content/test.csv"}
dataset = load_dataset("csv", data_files=data_files)

def preprocess_function(data):
    return tokenizer(data['text'], truncation=False)

tokenized_imdb_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_imdb_test = dataset['test'].map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf", max_length=max_len)

batch_size = 32

tf_train_set = model.prepare_tf_dataset(
    tokenized_imdb_train,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_imdb_test,
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

num_epochs = 10
batches_per_epoch = len(tokenized_imdb_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=10000, num_train_steps=total_train_steps)
# optimizer = optimizer=tf.keras.optimizers.Adam(learning_rate=5e-6, epsilon=1e-8)

model.compile(optimizer=optimizer, metrics=['accuracy'])
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)

model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/XLM-RoBERTa', overwrite=True)

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

Some layers of TFXLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b04644a44095f468/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b04644a44095f468/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/28 [00:00<?, ?ba/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
