# DeBERTa 4-Way Ensemble

## Imports

In [1]:
import os, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print(f'TF version: {tf.__version__}')
import tensorflow_addons as tfa
from tensorflow.keras import layers

import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

TF version: 2.6.4
transformers version: 4.20.1


In [2]:
def set_seed(seed=25):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
#     os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(25)

In [3]:
N_FOLD = 5

## Get Data and Model

In [4]:
def get_dataset(df):
    inputs = deberta_encode(df['full_text'])
    targets = np.array(df[TARGET_COLS], dtype="float32")
    return inputs, targets

In [5]:
class MeanPool(tf.keras.layers.Layer):
    def call(self, inputs, mask=None):
        broadcast_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
        embedding_sum = tf.reduce_sum(inputs * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        return embedding_sum / mask_sum
        
class WeightsSumOne(tf.keras.constraints.Constraint):
    def __call__(self, w):
        return tf.nn.softmax(w, axis=0)

In [7]:
tokenizer = transformers.AutoTokenizer.from_pretrained("../input/debertav3base")
tokenizer.save_pretrained('./tokenizer/')

cfg = transformers.AutoConfig.from_pretrained("../input/debertav3base", output_hidden_states=True)
cfg.hidden_dropout_prob = 0
cfg.attention_probs_dropout_prob = 0
cfg.save_pretrained('./tokenizer/')

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [8]:
def deberta_encode(texts, tokenizer=tokenizer):
    input_ids = []
    attention_mask = []
    
    for text in texts.tolist():
        token = tokenizer(text, 
                          add_special_tokens=True, 
                          max_length=MAX_LENGTH, 
                          return_attention_mask=True, 
                          return_tensors="np", 
                          truncation=True, 
                          padding='max_length')
        input_ids.append(token['input_ids'][0])
        attention_mask.append(token['attention_mask'][0])
    
    return np.array(input_ids, dtype="int32"), np.array(attention_mask, dtype="int32")

In [9]:
test_df = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')

In [10]:
MAX_LENGTH=512
BATCH_SIZE=8
test_dataset = deberta_encode(test_df['full_text'])

## v3base-1

In [11]:
def get_model_v3_large():
    input_ids = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids"
    )
    
    attention_masks = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_masks"
    )
   
    deberta_model = transformers.TFAutoModel.from_pretrained("../input/debertav3base", config=cfg)
    
    #Last Layer Reinitialization or Partially Reinitialization
#     Uncommon next three lines to check deberta encoder block
#     print('DeBERTa Encoder Block:')
#     for layer in deberta_model.deberta.encoder.layer:
#         print(layer)
        
    REINIT_LAYERS = 1
    normal_initializer = tf.keras.initializers.GlorotUniform()
    zeros_initializer = tf.keras.initializers.Zeros()
    ones_initializer = tf.keras.initializers.Ones()

#     print(f'\nRe-initializing encoder block:')
    for encoder_block in deberta_model.deberta.encoder.layer[-REINIT_LAYERS:]:
#         print(f'{encoder_block}')
        for layer in encoder_block.submodules:
            if isinstance(layer, tf.keras.layers.Dense):
                layer.kernel.assign(normal_initializer(shape=layer.kernel.shape, dtype=layer.kernel.dtype))
                if layer.bias is not None:
                    layer.bias.assign(zeros_initializer(shape=layer.bias.shape, dtype=layer.bias.dtype))

            elif isinstance(layer, tf.keras.layers.LayerNormalization):
                layer.beta.assign(zeros_initializer(shape=layer.beta.shape, dtype=layer.beta.dtype))
                layer.gamma.assign(ones_initializer(shape=layer.gamma.shape, dtype=layer.gamma.dtype))

    deberta_output = deberta_model.deberta(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = deberta_output.hidden_states
    
    #WeightedLayerPool + MeanPool of the last 4 hidden states
    stack_meanpool = tf.stack(
        [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidden_states[-4:]], 
        axis=2)
    
    weighted_layer_pool = layers.Dense(
        1,
        use_bias=False,
        kernel_constraint=WeightsSumOne()
        )(stack_meanpool)
    
    weighted_layer_pool = tf.squeeze(weighted_layer_pool, axis=-1)
    output=layers.Dense(6,activation='linear')(weighted_layer_pool)
    #x = layers.Dense(6, activation='linear')(x)
    
    #output = layers.Rescaling(scale=4.0, offset=1.0)(x)
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)
    
    #Compile model with Layer-wise Learning Rate Decay
    layer_list = [deberta_model.deberta.embeddings] + list(deberta_model.deberta.encoder.layer)
    layer_list.reverse()
    
    INIT_LR = 1e-5
    LLRDR = 0.9
    LR_SCH_DECAY_STEPS = 1600 # 2 * len(train_df) // BATCH_SIZE
    
    lr_schedules = [tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=INIT_LR * LLRDR ** i, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3) for i in range(len(layer_list))]
        
    lr_schedule_head = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-4, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3)
    
    optimizers = [tf.keras.optimizers.Adam(learning_rate=lr_sch) for lr_sch in lr_schedules]
    
    optimizers_and_layers = [(tf.keras.optimizers.Adam(learning_rate=lr_schedule_head), model.layers[-4:])] +\
        list(zip(optimizers, layer_list))
    
#     Uncomment next three lines to check optimizers_and_layers
#     print('\nLayer-wise Learning Rate Decay Initial LR:')
#     for o,l in optimizers_and_layers:
#         print(f'{o._decayed_lr("float32").numpy()} for {l}')
        
    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
    
    model.compile(optimizer=optimizer,
                 loss='mse',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    return model

In [12]:
fold_preds = []
for fold in range(N_FOLD):
    tf.keras.backend.clear_session()
    model_v3_large = get_model_v3_large()
    model_v3_large.load_weights(f'../input/v3base1/best_model_fold{fold}.h5')
    print(f'\nFold {fold} inference...')
    pred = model_v3_large.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds.append(pred)
    gc.collect()

2022-11-29 02:12:09.786426: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 02:12:09.787551: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 02:12:09.788260: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 02:12:09.789119: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil


Fold 0 inference...


2022-11-29 02:12:45.472292: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


In [13]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(preds, columns=TARGET_COLS)], axis=1)
sub_df.to_csv('submission0.csv', index=False)
#0.4553

## v3base-2

In [14]:
fold_preds = []
for fold in range(N_FOLD):
    tf.keras.backend.clear_session()
    model_v3_large = get_model_v3_large()
    model_v3_large.load_weights(f'../input/v3base2/best_model_fold{fold}.h5')
    print(f'\nFold {fold} inference...')
    pred = model_v3_large.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds.append(pred)
    gc.collect()


Fold 0 inference...

Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


In [15]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(preds, columns=TARGET_COLS)], axis=1)
sub_df.to_csv('submission1.csv', index=False)
#0.4566

## base3

In [16]:
fold_preds = []
for fold in range(N_FOLD):
    tf.keras.backend.clear_session()
    model_v3_large = get_model_v3_large()
    model_v3_large.load_weights(f'../input/v3base3/v3base-3/best_model_fold{fold}.h5')
    print(f'\nFold {fold} inference...')
    pred = model_v3_large.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds.append(pred)
    gc.collect()
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(preds, columns=TARGET_COLS)], axis=1)
sub_df.to_csv('submission2.csv', index=False)
#0.454


Fold 0 inference...

Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


## base4

In [17]:
fold_preds = []
for fold in range(N_FOLD):
    tf.keras.backend.clear_session()
    model_v3_large = get_model_v3_large()
    model_v3_large.load_weights(f'../input/v3base4/v3base-4/best_model_fold{fold}.h5')
    print(f'\nFold {fold} inference...')
    pred = model_v3_large.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds.append(pred)
    gc.collect()
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(preds, columns=TARGET_COLS)], axis=1)
sub_df.to_csv('submission3.csv', index=False)
#0.458


Fold 0 inference...

Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


## Ensemble

In [18]:
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

sub1 = pd.read_csv(f'submission0.csv')[TARGET_COLS] * 1.1
sub2 = pd.read_csv(f'submission1.csv')[TARGET_COLS] * 0.9
sub3 = pd.read_csv(f'submission2.csv')[TARGET_COLS] * 1.5
sub4 = pd.read_csv(f'submission3.csv')[TARGET_COLS] * 1.0

ens = ((sub1 + sub2+sub3+sub4)
       /(1.1+0.9+1.3+0.9))

submission[TARGET_COLS] = ens
display(submission.head())
submission.to_csv('submission.csv', index=False)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.060087,2.916615,3.259661,3.132458,2.845059,2.849767
1,000BAD50D026,2.824525,2.615886,2.900809,2.541753,2.292425,2.748376
2,00367BB2546B,3.827956,3.622893,3.830468,3.764464,3.608222,3.540919
