## Reuters newswire classification with Bert Model

In [None]:
!pip install --upgrade keras-nlp
!pip install --upgrade keras

In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  

import keras_nlp
import tensorflow as tf 
import tensorflow_datasets as tfds

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

2024-08-13 13:36:16.027525: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 13:36:16.027640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 13:36:16.274416: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Load data

In [2]:
start_char = 1
oov_char = 2
index_from = 3

In [3]:
(x_train, y_train),  (x_test, y_test) = tf.keras.datasets.reuters.load_data(start_char=start_char, oov_char=oov_char, index_from=index_from)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
X = np.concatenate((x_train, x_test), axis=0)
Y = np.concatenate((y_train, y_test), axis=0)

In [5]:
X.shape

(11228,)

In [6]:
Y.shape

(11228,)

## Preprocessing data

In [7]:
df = pd.concat([pd.DataFrame({'X': X, 'Y': Y})], ignore_index=True)
df.head()

Unnamed: 0,X,Y
0,"[1, 27595, 28842, 8, 43, 10, 447, 5, 25, 207, ...",3
1,"[1, 3267, 699, 3434, 2295, 56, 16784, 7511, 9,...",4
2,"[1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32,...",3
3,"[1, 4, 686, 867, 558, 4, 37, 38, 309, 2276, 46...",4
4,"[1, 8295, 111, 8, 25, 166, 40, 638, 10, 436, 2...",4


In [8]:
word_index = tf.keras.datasets.reuters.get_word_index()

inverted_word_index = dict(
    (i + index_from, word) for (word, i) in word_index.items()
)

inverted_word_index[start_char] = ""
inverted_word_index[oov_char] = ""

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [9]:
encode = lambda text : [word_index[word] for word in text.split()]
decode = lambda inp : " ".join([inverted_word_index[idx] for idx in inp])

In [10]:
df["text"] = df["X"].apply(decode)

In [11]:
df.head()

Unnamed: 0,X,Y,text
0,"[1, 27595, 28842, 8, 43, 10, 447, 5, 25, 207, ...",3,mcgrath rentcorp said as a result of its dece...
1,"[1, 3267, 699, 3434, 2295, 56, 16784, 7511, 9,...",4,generale de banque sa lt genb br and lt helle...
2,"[1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32,...",3,shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs...
3,"[1, 4, 686, 867, 558, 4, 37, 38, 309, 2276, 46...",4,the farmers home administration the u s agric...
4,"[1, 8295, 111, 8, 25, 166, 40, 638, 10, 436, 2...",4,seton co said its board has received a propos...


In [12]:
X = df["text"].to_numpy()
Y = df["Y"].to_numpy()

In [14]:
X.shape

(11228,)

In [15]:
Y.shape

(11228,)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [17]:
train_tf_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(8)
test_tf_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(8)

## Load Bert model and train

In [None]:
classifier = keras_nlp.models.BertClassifier.from_preset(
        "bert_base_en_uncased",
        num_classes=2
    )
   
# Fit the model
classifier.fit(train_tf_dataset, validation_data=test_tf_dataset,  epochs=3)
    

Attaching 'model.safetensors' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.safetensors.index.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'task.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.safetensors' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.safetensors.index.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'metadata.json' from model 'keras/bert/keras/bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'met

I0000 00:00:1723556366.964376     130 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1723556367.039431     130 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m 490/1123[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m6:14[0m 592ms/step - loss: nan - sparse_categorical_accuracy: 0.0428

## Test trained model

In [None]:
classifier.predict([X[0], X[1]])

In [None]:
print(Y[0], Y[1])