In [1]:
import os, re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from transformers import RobertaTokenizer, RobertaConfig, TFRobertaForSequenceClassification
from tokenizers import ByteLevelBPETokenizer
# Show up to 30 columns when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', 30)

# Get the Data

In [2]:
# Fetch the 20 Newsgroups 'train' and 'test' subsets; with return_X_y=True each
# call yields an (X, y) pair, unpacked here into the train / validation names.
(x_train, y_train), (x_valid, y_valid) = (
    fetch_20newsgroups(subset=subset_name, return_X_y=True)
    for subset_name in ('train', 'test')
)

# Build Tokenizer

In [3]:
# Dump each training document into its own file under text_files/ so the
# tokenizer trainer in the next cell can read the corpus from disk.
# makedirs(exist_ok=True) keeps the cell re-runnable: plain os.mkdir raises
# FileExistsError when the directory is left over from a previous run.
os.makedirs('text_files', exist_ok=True)
for e, text in enumerate(x_train):
    # Explicit UTF-8 so the bytes written do not depend on the platform's
    # default locale encoding.
    with open(f"text_files/train_{e+1:05}.txt", 'w', encoding='utf-8') as f:
        # Collapse every run of whitespace (including newlines) into one space.
        f.write(re.sub(r'\s+', ' ', text))

In [4]:
# Train a byte-level BPE tokenizer on the per-document text files written by
# the previous cell, then save its vocabulary and merges to tokenizer/.
corpus_files = [f"text_files/train_{e+1:05}.txt" for e in range(len(x_train))]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=corpus_files, vocab_size=30_522,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']
)
# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError
# whenever the directory already existed from an earlier run.
os.makedirs('tokenizer', exist_ok=True)
tokenizer.save_model('tokenizer')






['tokenizer/vocab.json', 'tokenizer/merges.txt']

# Tokenize Data

In [5]:
# Load the tokenizer files saved in the 'tokenizer' directory.
tokenizer = RobertaTokenizer.from_pretrained('tokenizer')


def tokenize(texts):
    """Encode `texts` with the module-level `tokenizer`.

    Sequences are padded to, and truncated at, 512 tokens and the result is
    requested as NumPy arrays (return_tensors='np').
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='np',
    )
    return tokenizer(texts, **encode_options)

In [6]:
# Encode the training texts first, then the validation texts, with the same settings.
x_train_tokenized, x_valid_tokenized = (
    tokenize(split_texts) for split_texts in (x_train, x_valid)
)

# Prepare Data Loaders

In [7]:
# Wrap the tokenized features and labels into batched tf.data pipelines.
# The training pipeline is shuffled before batching so each epoch sees the
# examples in a different order; the buffer spans the whole training set and
# the seed is fixed so the order is repeatable across runs.
train_data = (
    tf.data.Dataset.from_tensor_slices((dict(x_train_tokenized), y_train))
    .shuffle(buffer_size=len(y_train), seed=42)
    .batch(8)
)
# Validation order is left untouched: the inference cell compares predictions
# against y_valid position by position.
valid_data = tf.data.Dataset.from_tensor_slices((dict(x_valid_tokenized), y_valid)).batch(8)

2022-07-02 09:28:49.744900: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-02 09:28:49.745979: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-02 09:28:49.746648: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-02 09:28:49.750090: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

# Build Model

In [8]:
# Architecture settings gathered in one mapping and expanded into the config.
# vocab_size repeats the value passed to the tokenizer trainer above.
roberta_settings = {
    'vocab_size': 30_522,
    'max_position_embeddings': 514,
    'hidden_size': 768,
    'num_attention_heads': 12,
    'num_hidden_layers': 6,
    'type_vocab_size': 1,
    'num_labels': 20,
}
config = RobertaConfig(**roberta_settings)

# Built from the config alone (no from_pretrained call), so no checkpoint is loaded here.
model = TFRobertaForSequenceClassification(config)

# Train

In [9]:
# Build the optimizer and loss first so the compile call is plain wiring.
adam = tf.optimizers.Adam(2e-5)
# from_logits=True: the loss is told the model outputs are unnormalized logits.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=sparse_ce, metrics='accuracy')

# Train for 10 epochs, validating on valid_data, then save to 'news-classifier'.
model.fit(train_data, validation_data=valid_data, epochs=10)
model.save_pretrained('news-classifier')

Epoch 1/10


2022-07-02 09:29:06.195379: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Inference

In [10]:
# Reload the model saved by the training cell and predict on the validation batches.
model = TFRobertaForSequenceClassification.from_pretrained('news-classifier')
prediction_output = model.predict(valid_data, verbose=1)
logits = prediction_output.logits

# Softmax turns logits into per-class probabilities; the predicted class is
# the index of the largest probability in each row.
preds_proba = tf.nn.softmax(logits).numpy()
preds = np.argmax(preds_proba, axis=1)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at news-classifier.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.




# Evaluation

In [11]:
# Per-class metrics returned as four arrays, in this order.
precision, recall, fscore, support = precision_recall_fscore_support(y_valid, preds)

# Confusion matrix as a frame, with the per-class metrics appended as extra
# columns in the same order as before.
clf_report = pd.DataFrame(confusion_matrix(y_valid, preds)).assign(
    precision=precision,
    recall=recall,
    fscore=fscore,
    support=support,
)
clf_report

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,precision,recall,fscore,support
0,210,0,0,0,0,0,0,4,0,1,1,1,0,5,8,12,14,1,4,58,0.857143,0.658307,0.744681,319
1,0,233,20,14,16,20,4,8,1,2,0,3,48,12,5,1,0,1,1,0,0.737342,0.598972,0.660993,389
2,0,23,206,70,12,14,1,11,2,2,0,2,33,8,6,0,2,0,1,1,0.789272,0.522843,0.629008,394
3,0,9,13,282,43,1,2,8,1,1,0,1,28,2,1,0,0,0,0,0,0.603854,0.719388,0.656577,392
4,0,5,2,37,306,0,5,14,0,0,6,0,10,0,0,0,0,0,0,0,0.721698,0.794805,0.756489,385
5,0,23,18,2,2,319,1,5,0,0,0,1,11,7,5,0,1,0,0,0,0.883657,0.807595,0.843915,395
6,0,3,2,24,14,0,298,12,7,7,2,0,14,2,2,0,0,0,3,0,0.922601,0.764103,0.835905,390
7,0,0,0,4,2,0,4,360,7,2,2,2,6,2,2,0,3,0,0,0,0.672897,0.909091,0.773362,396
8,0,0,0,0,0,1,2,34,341,5,0,0,4,1,4,1,3,0,2,0,0.867684,0.856784,0.8622,398
9,0,1,0,1,0,0,2,7,3,358,12,0,5,1,6,0,0,0,1,0,0.84434,0.901763,0.872107,397
