In [1]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras_nlp
import keras
import keras.backend as K
import tensorflow as tf

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl

# Look up matplotlib's registered 'coolwarm' colormap for later plotting.
# NOTE(review): no cell visible in this notebook reads `cmap` — confirm it is still needed.
cmap = mpl.colormaps['coolwarm']

2024-05-02 10:22:09.119357: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-02 10:22:09.119450: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-02 10:22:09.251223: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Configuration

In [2]:
class CFG:
    """Single namespace holding the tunable settings read by the cells below."""

    # --- reproducibility / backbone -------------------------------------
    seed = 42                                # passed to keras.utils.set_random_seed
    preset = 'deberta_v3_extra_small_en'     # KerasNLP preset for preprocessor and classifier

    # --- input pipeline -------------------------------------------------
    sequence_length = 512                    # token length handed to the preprocessor
    batch_size = 32                          # examples per tf.data batch
    cache = False                            # dataset-caching switch

    # --- learning-rate schedule -----------------------------------------
    scheduler = 'exp'                        # schedule name (informational)
    lr_begin = 1e-5                          # optimizer's initial learning rate
    lr_max = 1e-4                            # rate targeted at the end of the ramp
    lr_num_ramp = 4                          # number of ramp-up epochs
    lr_decay_rate = 0.6                      # per-epoch multiplier after the ramp

    # --- training length ------------------------------------------------
    epochs = 15

In [3]:
# Seed Keras' random state from the configured seed so runs are repeatable.
keras.utils.set_random_seed(CFG.seed)
# Pin the global dtype policy to plain float32 (no mixed precision).
keras.mixed_precision.set_global_policy('float32')

### Load Data

In [4]:
# Root directory of the competition data; train.csv and test.csv are read from here.
BASE_PATH = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2'

# Full labelled training table (the `full_text` and `score` columns are used below).
df = pd.read_csv(f'{BASE_PATH}/train.csv')

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the essays for validation, stratified on `score` so every
# score level keeps the same proportion in both parts.
# `random_state` is pinned explicitly: without it the split depends on the
# global NumPy RNG state, so re-running this cell on its own would move essays
# between the training and validation sets.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['score'],
    random_state=CFG.seed,
)

In [6]:
# Class balance of the training split: number of essays per score value.
train_df.score.value_counts()

score
3    5024
2    3778
4    3141
1    1001
5     776
6     125
Name: count, dtype: int64

### Preprocessing

In [7]:
# Tokenizer + packer matching the chosen backbone preset; every input is
# brought to CFG.sequence_length tokens.
preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
    preset=CFG.preset,
    sequence_length=CFG.sequence_length,
)

# Sanity check: run one sentence through and list the produced features with
# their shapes.
sample_features = preprocessor("The quick fox jumped.")
for key, value in sample_features.items():
    print(f'{key}: {value.shape}')

Attaching 'tokenizer.json' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...


token_ids: (512,)
padding_mask: (512,)


### Model Aids

In [8]:
num_classes = 6

# Quadratic disagreement weights: W[i, j] = (j - i)^2 / (num_classes - 1)^2,
# i.e. 0 on the diagonal and 1 between the two most distant classes.
W = tf.constant(
    [
        [(col - row) ** 2 / (num_classes - 1) ** 2 for col in range(num_classes)]
        for row in range(num_classes)
    ],
    dtype=tf.float32,
)


def weighted_kappa(y_true, y_pred):
    """Quadratic weighted kappa evaluated on a single batch.

    Both arguments are reduced over axis 0 and combined with the
    (num_classes x num_classes) matrix ``W``, so each must have the batch on
    axis 0 and the classes on axis 1. In this notebook ``y_true`` is the
    one-hot target and ``y_pred`` the softmax output of the model.

    Returns:
        ``1 - sum(W * O) / sum(W * E)`` where ``O`` is the normalised
        class-by-class agreement matrix and ``E`` is the normalised outer
        product of the per-class totals of ``y_true`` and ``y_pred``.
    """
    # Observed agreement: contract the batch axis of both inputs.
    observed = tf.tensordot(y_true, y_pred, axes=([0], [0]))
    observed = observed / tf.reduce_sum(observed)

    # Expected agreement: outer product of the per-class column sums.
    true_totals = tf.reduce_sum(y_true, axis=0)
    pred_totals = tf.reduce_sum(y_pred, axis=0)
    expected = tf.tensordot(true_totals, pred_totals, axes=0)
    expected = expected / tf.reduce_sum(expected)

    observed_cost = tf.reduce_sum(W * observed)
    expected_cost = tf.reduce_sum(W * expected)
    return 1 - observed_cost / expected_cost

def anti_weighted_kappa(y_true, y_pred):
    """Loss counterpart of ``weighted_kappa``: one minus the kappa value.

    Lower is better. The value lies between 0 and 2: 0 means perfectly
    correct predictions and 2 means perfectly incorrect ones.
    """
    kappa = weighted_kappa(y_true, y_pred)
    return 1 - kappa

### Model Definition

In [9]:
# Backbone preset with a 6-way softmax head. `preprocessor=None` means the
# model is fed the already-tokenized features rather than raw strings.
classifier = keras_nlp.models.DebertaV3Classifier.from_preset(
    CFG.preset,
    preprocessor=None,
    num_classes=6,
    activation="softmax",
)

# Wrap the classifier in a plain functional model.
# Despite its name, `logit` holds softmax probabilities (see `activation` above).
inp = classifier.input
logit = classifier(inp)
model = keras.Model(inp, logit)

# Optimise the kappa-based loss directly; report kappa and accuracy alongside.
model.compile(
    optimizer=keras.optimizers.RMSprop(learning_rate=CFG.lr_begin),
    loss=anti_weighted_kappa,
    metrics=[weighted_kappa, keras.metrics.CategoricalAccuracy()],
)

model.summary()

Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...


In [10]:
# Smoke test: tokenize one sentence as a batch of size 1 and run it through the
# model; the cell output is the (1, 6) tensor of class probabilities.
tmp = preprocessor(["hello my friend!"])
model(tmp)

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[0.16138707, 0.19296643, 0.16298348, 0.15415876, 0.19217007,
        0.13633417]], dtype=float32)>

### Data Stream

In [11]:
def build_ds(texts, scores, drop_batch_remainder=False, shuffle=True):
    """Tokenize essays and pair them with one-hot labels in a batched tf.data pipeline.

    Args:
        texts: list of raw essay strings; tokenized up front with the global
            `preprocessor`.
        scores: integer class indices aligned with `texts` (the callers pass
            `score - 1`).
        drop_batch_remainder: when True, drop the final batch if it is smaller
            than `CFG.batch_size`.
        shuffle: when True (the default, matching the previous always-shuffle
            behaviour) examples are reshuffled on every pass over the dataset.
            Pass False for data whose order should stay fixed.

    Returns:
        A `tf.data.Dataset` yielding `(features, one_hot_labels)` batches.
    """
    features = preprocessor(texts)
    ds = tf.data.Dataset.from_tensor_slices((features, scores))

    # Caching has to happen BEFORE shuffling: a cache placed after `shuffle`
    # records the first epoch's order and replays it on every later epoch.
    # The step honours the CFG.cache switch; the slices above already live in
    # memory, so leaving it off loses nothing.
    if CFG.cache:
        ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(256)

    ds = ds.batch(CFG.batch_size, drop_remainder=drop_batch_remainder)

    def one_hot_score(inp, score):
        # The depth must match the size of the kappa weight matrix W.
        return (inp, tf.one_hot(score, num_classes))

    ds = ds.map(one_hot_score, num_parallel_calls=tf.data.AUTOTUNE)

    # Prefetch goes last so batch preparation overlaps with the training step.
    return ds.prefetch(tf.data.AUTOTUNE)

In [12]:
# Scores in the CSV start at 1; subtracting 1 turns them into class indices
# starting at 0. Only the training set drops its final partial batch.
train_ds = build_ds(
    train_df['full_text'].tolist(),
    train_df['score'].to_numpy() - 1,
    drop_batch_remainder=True,
)

val_ds = build_ds(
    valid_df['full_text'].tolist(),
    valid_df['score'].to_numpy() - 1,
    drop_batch_remainder=False,
)

### Training

In [13]:
def rate_scheduler(epoch, lr):
    """Return the learning rate for `epoch`, given the current rate `lr`.

    For the first `CFG.lr_num_ramp` epochs the rate moves towards `CFG.lr_max`
    by an equal share of the remaining gap, landing on `lr_max` in the last
    ramp epoch. From then on it is multiplied by `CFG.lr_decay_rate` each epoch.

    The schedule is relative to the incoming `lr`, not an absolute function of
    `epoch`, so it relies on being called once per epoch, in order.
    """
    warmup_epochs = CFG.lr_num_ramp
    if epoch >= warmup_epochs:
        # Past the ramp: geometric decay.
        return lr * CFG.lr_decay_rate

    # Inside the ramp: close 1/remaining of the distance to lr_max.
    remaining = warmup_epochs - epoch
    return lr + (CFG.lr_max - lr) / remaining

In [14]:
# Per-epoch snapshot: the epoch number is substituted into the file name.
checkpoint = keras.callbacks.ModelCheckpoint(
    "/kaggle/working/v1-{epoch}.weights.h5",
    monitor="val_loss",
    save_weights_only=True,
)

# Best-so-far snapshot: only written when val_loss improves.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    "/kaggle/working/v1.weights.h5",
    monitor="val_loss",
    save_weights_only=True,
    save_best_only=True,
)

# Applies rate_scheduler at the start of every epoch.
rate_control = keras.callbacks.LearningRateScheduler(rate_scheduler)

In [15]:
# Train for CFG.epochs epochs, validating after each one; `history` keeps the
# per-epoch metrics returned by fit().
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=CFG.epochs,
    callbacks=[checkpoint, rate_control, best_checkpoint],
)

Epoch 1/15


I0000 00:00:1714645484.103892      69 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1714645484.206700      69 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m  1/432[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15:08:05[0m 126s/step - categorical_accuracy: 0.0625 - loss: 1.0044 - weighted_kappa: -0.0044

W0000 00:00:1714645484.918610      69 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712ms/step - categorical_accuracy: 0.2023 - loss: 0.7252 - weighted_kappa: 0.2748

W0000 00:00:1714645826.746074      68 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m471s[0m 799ms/step - categorical_accuracy: 0.2026 - loss: 0.7247 - weighted_kappa: 0.2753 - val_categorical_accuracy: 0.5442 - val_loss: 0.2770 - val_weighted_kappa: 0.7233 - learning_rate: 3.2500e-05
Epoch 2/15
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 775ms/step - categorical_accuracy: 0.5081 - loss: 0.2992 - weighted_kappa: 0.7008 - val_categorical_accuracy: 0.5529 - val_loss: 0.2390 - val_weighted_kappa: 0.7611 - learning_rate: 5.5000e-05
Epoch 3/15
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 771ms/step - categorical_accuracy: 0.5207 - loss: 0.2690 - weighted_kappa: 0.7310 - val_categorical_accuracy: 0.4688 - val_loss: 0.3263 - val_weighted_kappa: 0.6749 - learning_rate: 7.7500e-05
Epoch 4/15
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 771ms/step - categorical_accuracy: 0.5317 - loss: 0.2631 - weighted_kappa: 0.7369 - val_categorical_accuracy: 0.4942

### Evaluate

In [16]:
def load_model(path):
    """Rebuild the training-time architecture and restore its weights.

    Args:
        path: location of a weights file written by the checkpoint callbacks.

    Returns:
        An uncompiled `keras.Model` wrapping a 6-class softmax
        `DebertaV3Classifier`, with the weights from `path` loaded.
    """
    # The layer structure has to mirror the model that produced the weights
    # file: a preset classifier wrapped in a functional model.
    backbone = keras_nlp.models.DebertaV3Classifier.from_preset(
        CFG.preset,
        preprocessor=None,
        num_classes=6,
        activation="softmax",
    )
    model_input = backbone.input
    restored = keras.Model(model_input, backbone(model_input))
    restored.load_weights(path)
    return restored

In [17]:
# Load the best-val_loss weights from the exact path the checkpoint callback
# wrote to, instead of relying on the working directory being /kaggle/working.
test_model = load_model("/kaggle/working/v1.weights.h5")

Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/deberta_v3/keras/deberta_v3_extra_small_en/2' to your Kaggle notebook...


In [18]:
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')
test_inp = preprocessor(test_df.full_text.tolist())

# Run inference in mini-batches. Calling the model directly pushes the whole
# test set through in a single forward pass, which only fits in memory for a
# very small test file.
prob = test_model.predict(test_inp, batch_size=CFG.batch_size)

# Class indices start at 0; add 1 to map back to the original score scale.
predict = np.argmax(prob, axis=-1) + 1

In [19]:
# Submission table: one row per test essay with its predicted score.
out_df = test_df[['essay_id']].assign(score=predict)

In [20]:
# index=False keeps the file to exactly the `essay_id` and `score` columns;
# the default would also write the DataFrame index as an unnamed first column.
out_df.to_csv("submission.csv", index=False)

In [21]:
# Display the submission table as a final visual check.
out_df

Unnamed: 0,essay_id,score
0,000d118,2
1,000fe60,3
2,001ab80,5
