# Imports

In [1]:
import os, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print(f'TF version: {tf.__version__}')
import tensorflow_addons as tfa
from tensorflow.keras import layers

import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

import sys


TF version: 2.10.0
transformers version: 4.24.0


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append('./input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
def set_seed(seed=1225):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # Hash randomization of the running interpreter is fixed at startup, so this
    # value only takes effect in child processes started after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Optional switch for deterministic TensorFlow ops (left disabled):
    # os.environ['TF_DETERMINISTIC_OPS'] = '1'


set_seed(1225)

# Load DataFrame

In [4]:
# Read the training CSV into a DataFrame.
df = pd.read_csv('./input/feedback-prize-english-language-learning/train.csv')
# Show the first rows, then the column/dtype/non-null summary.
display(df.head())
print('\n---------DataFrame Summary---------')
df.info()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5



---------DataFrame Summary---------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
dtypes: float64(6), object(2)
memory usage: 244.6+ KB


# CV Split

In [5]:
N_FOLD = 5
TARGET_COLS = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

# Stratify on all target columns at once so every fold sees a similar label mix.
skf = MultilabelStratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=42)
fold_iter = skf.split(df, df[TARGET_COLS])
for fold_id, (_, holdout_rows) in enumerate(fold_iter):
    # Tag each row with the fold in which it serves as validation data.
    df.loc[holdout_rows, 'fold'] = fold_id
# The column is created with missing values first, so cast it back to int once filled.
df['fold'] = df['fold'].astype(int)
df['fold'].value_counts()

1    783
0    782
4    782
3    782
2    782
Name: fold, dtype: int64

In [6]:
# Write the frame, including the new 'fold' column, to CSV without the row index.
df.to_csv('./df_folds.csv', index=False)

# Model Config

In [7]:
# Token length used by the tokenizer call (padding='max_length', truncation=True) and the model inputs.
MAX_LENGTH = 512
# Used by model.predict at inference.
# NOTE(review): model.fit in the training loop hard-codes batch_size=1 instead of this constant — confirm intent.
BATCH_SIZE = 2
# Local path handed to the Hugging Face from_pretrained loaders (tokenizer, config, model).
DEBERTA_MODEL = "./input/deberta-v3-base"

To see why dropout should be disabled for regression tasks, check this [discussion](https://www.kaggle.com/competitions/commonlitreadabilityprize/discussion/260729).

In [8]:
# Load the pretrained tokenizer and keep a copy under ./tokenizer/.
tokenizer = transformers.AutoTokenizer.from_pretrained(DEBERTA_MODEL)
tokenizer.save_pretrained('./tokenizer/')

# Load the model config, asking for all hidden states and overriding both
# dropout probabilities to 0 at load time; save it next to the tokenizer.
cfg = transformers.AutoConfig.from_pretrained(
    DEBERTA_MODEL,
    output_hidden_states=True,
    hidden_dropout_prob=0,
    attention_probs_dropout_prob=0,
)
cfg.save_pretrained('./tokenizer/')



# Data Process Function

To use the Hugging Face DeBERTa model, we have to tokenize the input texts the way the pretrained DeBERTa model expects.

In [9]:
def deberta_encode(texts, tokenizer=tokenizer):
    """Tokenize texts into fixed-length model inputs.

    Every text is truncated or padded to MAX_LENGTH tokens, with special
    tokens added. All texts are tokenized in a single batched call.

    Args:
        texts: A pandas Series, NumPy array, list/tuple of strings, or a
            single string (treated as a batch of one).
        tokenizer: Hugging Face tokenizer to apply; defaults to the
            notebook-level `tokenizer`.

    Returns:
        A tuple `(input_ids, attention_mask)` of int32 NumPy arrays, each of
        shape `(number of texts, MAX_LENGTH)`.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        texts = [texts]
    elif hasattr(texts, 'tolist'):
        texts = texts.tolist()
    else:
        texts = list(texts)

    if not texts:
        # Keep the 2-D shape contract without calling the tokenizer on nothing.
        return (np.zeros((0, MAX_LENGTH), dtype="int32"),
                np.zeros((0, MAX_LENGTH), dtype="int32"))

    encoded = tokenizer(texts,
                        add_special_tokens=True,
                        max_length=MAX_LENGTH,
                        return_attention_mask=True,
                        return_tensors="np",
                        truncation=True,
                        padding='max_length')

    input_ids = np.asarray(encoded['input_ids'], dtype="int32")
    attention_mask = np.asarray(encoded['attention_mask'], dtype="int32")
    return input_ids, attention_mask

In [10]:
def get_dataset(df):
    """Turn a DataFrame into model inputs and regression targets.

    Args:
        df: DataFrame with a 'full_text' column and every column in TARGET_COLS.

    Returns:
        `(inputs, targets)`: `inputs` is whatever `deberta_encode` returns for
        the 'full_text' column, and `targets` is a float32 array of the
        TARGET_COLS values.
    """
    encoded_inputs = deberta_encode(df['full_text'])
    target_values = df[TARGET_COLS].to_numpy(dtype="float32")
    return encoded_inputs, target_values

# Model

## MeanPool

Instead of using the '[CLS]' token, the MeanPool method averages one layer of hidden states along the sequence axis while masking out padding tokens.

## WeightedLayerPool

WeightedLayerPool uses a set of trainable weights to average a set of hidden states from the transformer backbone. I use a Dense layer with a constraint to implement it.

In [11]:
class MeanPool(tf.keras.layers.Layer):
    """Average embeddings along axis 1, counting only unmasked positions."""

    def call(self, inputs, mask=None):
        """Compute the masked mean of `inputs` over axis 1.

        Args:
            inputs: Tensor to pool; axis 1 is the axis that gets averaged.
            mask: Optional tensor with one entry per position along axis 1
                (non-zero = keep). When None, every position is kept and a
                plain mean is returned.

        Returns:
            `inputs` averaged over axis 1 (that axis is removed).
        """
        if mask is None:
            # Without a mask every position counts, so this is a plain mean.
            return tf.reduce_mean(inputs, axis=1)

        # Add a trailing axis so the mask broadcasts across the feature axis.
        broadcast_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
        embedding_sum = tf.reduce_sum(inputs * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        # Avoid dividing by zero when a row has no unmasked positions.
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        return embedding_sum / mask_sum

WeightedLayerPool weight constraint: a softmax is applied so that sum(w) equals 1.

In [12]:
class WeightsSumOne(tf.keras.constraints.Constraint):
    """Weight constraint that passes the weights through a softmax over axis 0."""

    def __call__(self, w):
        """Return `w` after a softmax along axis 0, so values along that axis sum to 1."""
        normalized = tf.nn.softmax(w, axis=0)
        return normalized

## Model Design 

Take the hidden states of the last 4 layers of DeBERTa, apply MeanPool to each to gather information along the sequence axis, then apply WeightedLayerPool with a set of trainable weights to gather information along the depth axis of the model, and finally feed the result to a regression head.

## Last Layer Reinitialization

Reinitialization of the last transformer encoder block: GlorotUniform for Dense kernel, Zeros for Dense bias, Zeros for LayerNorm beta, Ones for LayerNorm gamma.

## Layer-wise Learning Rate Decay

Using MultiOptimizer to implement LLRD: an initial learning rate of 1e-5 with a layer-wise decay of 0.9 for the transformer encoder and embedding blocks, and 1e-4 for the rest of the model. All learning rates use ExponentialDecay schedulers with a decay rate of 0.3.

In [13]:
def get_model():
    """Build and compile the DeBERTa regression model.

    Steps, in order:
      1. Create int32 `input_ids` / `attention_masks` inputs of length MAX_LENGTH.
      2. Load the pretrained backbone from DEBERTA_MODEL using the notebook-level `cfg`.
      3. Re-initialize the Dense and LayerNormalization weights of the last
         REINIT_LAYERS encoder blocks.
      4. Mean-pool each of the last 4 hidden states with the attention mask,
         stack them, combine them with a bias-free Dense(1) constrained by
         WeightsSumOne, and feed the result to a linear Dense(6) head.
      5. Compile with a tfa MultiOptimizer: one Adam optimizer per backbone
         block with layer-wise decayed learning rates, plus one Adam optimizer
         for the last four layers of the Keras model.

    Returns:
        The compiled `tf.keras.Model` (loss 'mse', metric RootMeanSquaredError).
    """
    input_ids = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids"
    )

    attention_masks = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_masks"
    )

    deberta_model = transformers.TFAutoModel.from_pretrained(DEBERTA_MODEL, config=cfg)

    # Last-layer (or partial) re-initialization.
#     Uncomment next three lines to check deberta encoder block
#     print('DeBERTa Encoder Block:')
#     for layer in deberta_model.deberta.encoder.layer:
#         print(layer)

    # Number of encoder blocks, counted from the end, whose weights are re-initialized.
    REINIT_LAYERS = 1
    # NOTE: despite its name, `normal_initializer` is a GlorotUniform initializer.
    normal_initializer = tf.keras.initializers.GlorotUniform()
    zeros_initializer = tf.keras.initializers.Zeros()
    ones_initializer = tf.keras.initializers.Ones()

#     print(f'\nRe-initializing encoder block:')
    for encoder_block in deberta_model.deberta.encoder.layer[-REINIT_LAYERS:]:
#         print(f'{encoder_block}')
        for layer in encoder_block.submodules:
            # Dense: fresh GlorotUniform kernel, zero bias (when a bias exists).
            if isinstance(layer, tf.keras.layers.Dense):
                layer.kernel.assign(normal_initializer(shape=layer.kernel.shape, dtype=layer.kernel.dtype))
                if layer.bias is not None:
                    layer.bias.assign(zeros_initializer(shape=layer.bias.shape, dtype=layer.bias.dtype))

            # LayerNormalization: beta reset to zeros, gamma reset to ones.
            elif isinstance(layer, tf.keras.layers.LayerNormalization):
                layer.beta.assign(zeros_initializer(shape=layer.beta.shape, dtype=layer.beta.dtype))
                layer.gamma.assign(ones_initializer(shape=layer.gamma.shape, dtype=layer.gamma.dtype))

    # Call the backbone's main layer directly on the symbolic inputs.
    deberta_output = deberta_model.deberta(
        input_ids, attention_mask=attention_masks
    )
    # Relies on cfg having been loaded with output_hidden_states=True.
    hidden_states = deberta_output.hidden_states

    # WeightedLayerPool + MeanPool of the last 4 hidden states.
    # Each hidden state is mean-pooled over the sequence axis, then the four results
    # are stacked on a new last axis — presumably (batch, hidden, 4); TODO confirm.
    stack_meanpool = tf.stack(
        [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidden_states[-4:]], 
        axis=2)

    # Bias-free Dense(1) over the stacked axis acts as the trainable per-layer weighting.
    weighted_layer_pool = layers.Dense(1,
                                       use_bias=False,
                                       kernel_constraint=WeightsSumOne())(stack_meanpool)

    # Drop the size-1 axis produced by Dense(1).
    weighted_layer_pool = tf.squeeze(weighted_layer_pool, axis=-1)
    #x=layers.Dense(4096,activation='relu')(weighted_layer_pool)
    # Linear regression head with 6 outputs (TARGET_COLS in the CV Split cell lists six targets).
    output = layers.Dense(6, activation='linear')(weighted_layer_pool)

    #output = layers.Rescaling(scale=4.0, offset=1.0)(x)
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

    # Compile model with Layer-wise Learning Rate Decay.
    # After reverse(), index 0 is the last encoder block and the final entry is the embeddings,
    # so the learning rate shrinks by LLRDR for each step towards the embeddings.
    layer_list = [deberta_model.deberta.embeddings] + list(deberta_model.deberta.encoder.layer)
    layer_list.reverse()

    INIT_LR = 1e-5
    LLRDR = 0.9
    # NOTE(review): the hard-coded 1600 does not obviously match the formula in the trailing
    # comment for the batch size passed to model.fit — confirm the intended value.
    LR_SCH_DECAY_STEPS = 1600 # 2 * len(train_df) // BATCH_SIZE

    # One ExponentialDecay schedule per backbone block, starting at INIT_LR * LLRDR ** i.
    lr_schedules = [tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=INIT_LR * LLRDR ** i, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3) for i in range(len(layer_list))]
    # Separate schedule with a larger initial learning rate for the head.
    lr_schedule_head = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-4, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3)

    optimizers = [tf.keras.optimizers.Adam(learning_rate=lr_sch) for lr_sch in lr_schedules]

    # The head optimizer is attached to the last four layers of the Keras model —
    # presumably the stack op, the weighting Dense, the squeeze op and the output Dense; TODO confirm.
    # NOTE(review): only the embeddings and the entries of encoder.layer are listed here; verify whether
    # the backbone owns any variables outside those blocks and how MultiOptimizer treats them.
    optimizers_and_layers = [(tf.keras.optimizers.Adam(learning_rate=lr_schedule_head), model.layers[-4:])] +\
        list(zip(optimizers, layer_list))

#     Uncomment next three lines to check optimizers_and_layers
#     print('\nLayer-wise Learning Rate Decay Initial LR:')
#     for o,l in optimizers_and_layers:
#         print(f'{o._decayed_lr("float32").numpy()} for {l}')

    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

    model.compile(optimizer=optimizer,
                 loss='mse',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    return model

In [14]:
# Reset the global Keras state.
tf.keras.backend.clear_session()
# Uncomment the next two lines to build the model once and print its summary.
#model = get_model()
#model.summary()

# 5 Folds Training Loop

In [15]:
valid_rmses = []
for fold in range(N_FOLD):
    print(f'\n-----------FOLD {fold} ------------')

    # Rows tagged with the current fold validate; every other row trains.
    is_valid_row = df['fold'] == fold
    train_df = df[~is_valid_row].reset_index(drop=True)
    valid_df = df[is_valid_row].reset_index(drop=True)
    train_dataset = get_dataset(train_df)
    valid_dataset = get_dataset(valid_df)

    print('Data prepared.')
    print(f'Training data input_ids shape: {train_dataset[0][0].shape} dtype: {train_dataset[0][0].dtype}') 
    print(f'Training data attention_mask shape: {train_dataset[0][1].shape} dtype: {train_dataset[0][1].dtype}')
    print(f'Training data targets shape: {train_dataset[1].shape} dtype: {train_dataset[1].dtype}')
    print(f'Validation data input_ids shape: {valid_dataset[0][0].shape} dtype: {valid_dataset[0][0].dtype}')
    print(f'Validation data attention_mask shape: {valid_dataset[0][1].shape} dtype: {valid_dataset[0][1].dtype}')
    print(f'Validation data targets shape: {valid_dataset[1].shape} dtype: {valid_dataset[1].dtype}')

    # Build a fresh model for this fold on a clean Keras session.
    tf.keras.backend.clear_session()
    model = get_model()
    print('Model prepared.')

    print('Start training...')
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        f"best_model_fold{fold}.h5",
        monitor="val_loss",
        mode="min",
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
    # Stop once val_loss has not improved by at least 1e-5 for 3 epochs.
    early_stop_cb = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta=1e-5,
        patience=3,
        verbose=1,
    )
    callbacks = [checkpoint_cb, early_stop_cb]

    # NOTE(review): batch_size is hard-coded to 1 here, while the BATCH_SIZE
    # constant is what model.predict uses at inference — confirm this is intended.
    history = model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        validation_data=valid_dataset,
        epochs=10,
        batch_size=1,
        shuffle=True,
        callbacks=callbacks,
    )

    # Record the best (lowest) validation RMSE seen during this fold.
    fold_val_rmse = history.history['val_root_mean_squared_error']
    valid_rmses.append(np.min(fold_val_rmse))
    print('Training finished.')

    # Release the per-fold data and Keras state before the next fold.
    del train_dataset, valid_dataset, train_df, valid_df
    gc.collect()
    tf.keras.backend.clear_session()


-----------FOLD 0 ------------
Data prepared.
Training data input_ids shape: (3129, 512) dtype: int32
Training data attention_mask shape: (3129, 512) dtype: int32
Training data targets shape: (3129, 6) dtype: float32
Validation data input_ids shape: (782, 512) dtype: int32
Validation data attention_mask shape: (782, 512) dtype: int32
Validation data targets shape: (782, 6) dtype: float32




Model prepared.
Start training...
Epoch 1/10
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Epoch 1: val_loss improved from inf to 0.20902, saving model to best_model_fold0.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.20902 to 0.20666, saving model to best_model_fold0.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.20666 to 0.20625, saving model to best_model_fold0.h5
Epoch 4/10

In [None]:
# Report the best validation RMSE recorded for each fold, then their mean.
print(f'{len(valid_rmses)} Folds validation RMSE:\n{valid_rmses}')
print(f'Local CV Average score: {np.mean(valid_rmses)}')

# Inference



In [None]:
# Read the test CSV from the same local ./input tree that the training CSV,
# the DeBERTa weights and the iterstrat package are loaded from.
test_df = pd.read_csv('./input/feedback-prize-english-language-learning/test.csv')
test_df.head()

In [None]:
# Tokenize the test texts; the result is the (input_ids, attention_mask) pair returned by deberta_encode.
test_dataset = deberta_encode(test_df['full_text'])

# Single model 5 Folds ensemble prediction

In [None]:
fold_preds = []
for fold in range(N_FOLD):
    # Rebuild the network on a clean Keras session, then load this fold's saved weights.
    tf.keras.backend.clear_session()
    model = get_model()
    weights_path = f'best_model_fold{fold}.h5'
    model.load_weights(weights_path)

    print(f'\nFold {fold} inference...')
    fold_preds.append(model.predict(test_dataset, batch_size=BATCH_SIZE))
    gc.collect()

In [None]:
# Average the per-fold predictions element-wise, then clamp the result to [1, 5]
# (presumably the valid score range — confirm against the competition rules).
fold_mean = np.asarray(fold_preds).mean(axis=0)
preds = np.clip(fold_mean, 1, 5)

In [None]:
# Put the text ids side by side with one prediction column per target, then save as CSV.
id_frame = test_df[['text_id']]
pred_frame = pd.DataFrame(preds, columns=TARGET_COLS)
sub_df = pd.concat([id_frame, pred_frame], axis=1)
sub_df.to_csv('submission2.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub_df.head()