In [None]:
!pip install transformers
!pip install datasets
!pip install pandas
!pip install numpy
!pip install tensorflow-gpu

In [None]:
import tensorflow as tf
# Sanity check that the runtime exposes a GPU: the cell output is the device
# name (e.g. '/device:GPU:0'); TensorFlow documents an empty string as the
# result when no GPU is available.
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
# Input CSV locations.
LABELS = '/data/output_received_by.csv'
TEXT = '/data/output_heavy.csv'

# Optional sub-sampling controls used by the data-loading cell:
#   nrows -- when not None, that many row positions are drawn at random.
#   idx   -- when not None, an explicit collection of row positions to keep
#            (overwritten by the random draw whenever nrows is set).
# Leaving both as None keeps every row.
nrows = None
idx = None

In [None]:
import pandas as pd
import numpy as np
import random

# Read the label table and the text table from the paths declared in the
# configuration cell (LABELS / TEXT).
df_labels = pd.read_csv(LABELS, dtype={'id': int, 'label': str, 'label_encoded': int}, sep=',')
df_texts = pd.read_csv(TEXT, sep=',')

# Left join keeps every text row; texts without a matching label come back
# with NaN label columns, which are blanked so they can be filtered out below.
df = pd.merge(df_texts, df_labels, on='id', how='left')
df = df.fillna('')

# Keep only rows that have both a text and a label.
df = df[(df['text'] != '') & (df['label'] != '')]

arr_texts = df.text.to_numpy()
# Unmatched rows in the left join can turn `label_encoded` into a
# float/object column; after filtering, every remaining value is a real class
# id, so restore the integer dtype the model expects.
arr_labels = df.label_encoded.astype(int).to_numpy()

arr_labels_encoded_unique = np.unique(arr_labels)

# Optional random sub-sample. The request is clamped to the number of
# available rows because random.sample raises if asked for more.
if nrows is not None:
    idx = random.sample(range(len(arr_labels)), min(nrows, len(arr_labels)))

# Apply either the random draw above or a user-supplied list of positions.
if idx is not None:
    arr_labels = [arr_labels[i] for i in idx]
    arr_texts = [arr_texts[i] for i in idx]

# Final model inputs (x) and targets (y) as NumPy arrays.
arr_y = np.asarray(arr_labels)
arr_x = np.asarray(arr_texts)

In [None]:
seed = 1337

# Shuffle texts and labels with ONE shared permutation so that every text
# keeps its own label. (Shuffling `arr_texts` instead of `arr_x` left the
# texts unshuffled whenever sub-sampling had turned `arr_texts` into a list,
# while the labels were still shuffled.)
assert len(arr_x) == len(arr_y), "texts and labels must have the same length"
rng = np.random.RandomState(seed)
shuffled_order = rng.permutation(len(arr_x))
arr_x = arr_x[shuffled_order]
arr_y = arr_y[shuffled_order]

# Extract a training & validation split: the last `validation_split` fraction
# of the shuffled data is held out.
validation_split = .1
num_validation_samples = int(validation_split * len(arr_x))
# Slice with an explicit boundary: `arr[:-0]` is empty and `arr[-0:]` is the
# whole array, which would swap the two splits for very small datasets.
split_at = len(arr_x) - num_validation_samples
arr_x_train = arr_x[:split_at]
arr_x_test = arr_x[split_at:]
arr_y_train = arr_y[:split_at]
arr_y_test = arr_y[split_at:]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, AutoConfig, BertModel, RobertaForSequenceClassification, TFRobertaForSequenceClassification, TFXLMRobertaForSequenceClassification
from transformers import create_optimizer

# Upper bound on sequence length used throughout this cell.
max_len = 512

# Start from the pretrained configuration and adjust regularisation and the
# size of the classification head.
config = AutoConfig.from_pretrained("xlm-roberta-base")
config.hidden_dropout_prob = 0.15
config.attention_probs_dropout_prob = 0.15
config.num_labels = 100

# Instantiate the classifier. Every `model = ...` line used to be commented
# out, so the `model.prepare_tf_dataset(...)` calls further down failed with
# NameError on a fresh kernel.
# Alternatives that were tried previously:
#   - "distilroberta-base" via TFAutoModelForSequenceClassification (would
#     also need the matching config and tokenizer),
#   - resuming from "/content/drive/MyDrive/Colab Notebooks/finetuned/".
model = TFXLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", config=config)

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Cap every text at `max_len` characters; anything that is not a string is
# replaced by an empty string.
x_train = ['' if not isinstance(text, str) else text[:max_len] for text in arr_x_train]
x_test = ['' if not isinstance(text, str) else text[:max_len] for text in arr_x_test]

# Each split is written to its own CSV file and then loaded back through the
# `datasets` CSV loader.
data_files = {"train": "/content/train.csv", "test": "/content/test.csv"}

for split_name, split_texts, split_labels in (
    ("train", x_train, arr_y_train),
    ("test", x_test, arr_y_test),
):
    # Two columns side by side: the text and its encoded label.
    df = pd.DataFrame(np.stack((split_texts, split_labels), axis=1), columns=['text', 'label'])
    df.to_csv(data_files[split_name], index=False)

dataset = load_dataset("csv", data_files=data_files)

def preprocess_function(data):
    """Tokenize the ``text`` column of a batch.

    Args:
        data: A batch from the dataset, exposing a ``'text'`` list.

    Returns:
        The tokenizer output for the batch, with every sequence truncated to
        at most ``max_len`` tokens.
    """
    # Values read back from CSV may be None (empty or NA-like fields); the
    # tokenizer only accepts strings, so substitute an empty string.
    texts = [text if isinstance(text, str) else '' for text in data['text']]
    # Truncate at token level: a character cap alone does not guarantee that
    # the sequence (plus special tokens) fits the model's maximum length.
    return tokenizer(texts, truncation=True, max_length=max_len)

tokenized_imdb_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_imdb_test = dataset['test'].map(preprocess_function, batched=True)

# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf", max_length=max_len)

batch_size = 32

# Arguments common to the training and the validation pipeline.
shared_dataset_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)

# Training batches are shuffled; validation batches keep their order.
tf_train_set = model.prepare_tf_dataset(tokenized_imdb_train, shuffle=True, **shared_dataset_kwargs)
tf_validation_set = model.prepare_tf_dataset(tokenized_imdb_test, shuffle=False, **shared_dataset_kwargs)

num_epochs = 10
batches_per_epoch = len(tokenized_imdb_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

# Warm up for 10,000 steps when the run is long enough. On a short run (e.g.
# when sub-sampling) a warmup that is not shorter than the whole schedule
# would leave a non-positive decay span and the learning rate would never
# finish warming up, so fall back to 10% of the run.
num_warmup_steps = 10000
if num_warmup_steps >= total_train_steps:
    num_warmup_steps = total_train_steps // 10

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=total_train_steps,
)
# Alternative tried previously: plain Adam without a schedule,
# tf.keras.optimizers.Adam(learning_rate=5e-6, epsilon=1e-8)

# No `loss` is passed to compile(); NOTE(review): this presumably relies on the
# model's built-in loss -- confirm for the installed transformers version.
model.compile(optimizer=optimizer, metrics=['accuracy'])
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)

# Persist the fine-tuned weights and config.
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/XLM-RoBERTa', overwrite=True)