In [1]:
import os, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print(f'TF version: {tf.__version__}')
import tensorflow_addons as tfa
from tensorflow.keras import layers

import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

import sys
sys.path.append('./input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

TF version: 2.10.0


  from .autonotebook import tqdm as notebook_tqdm


transformers version: 4.24.0


In [2]:
def set_seed(seed=25):
    """Seed the NumPy and TensorFlow global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to every generator (default 25).
    """
    # PYTHONHASHSEED is only written into this process's environment here.
    # The TF_DETERMINISTIC_OPS switch was left commented out by the author
    # and is not set.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (np.random.seed, tf.random.set_seed):
        seed_fn(seed)


set_seed(seed=25)

In [3]:
# Number of cross-validation folds; the inference loops build one model and
# load one `best_model_fold{fold}.h5` checkpoint per fold.
N_FOLD = 5

In [4]:
def get_dataset(df):
    """Turn a dataframe into model inputs and regression targets.

    Args:
        df: DataFrame with a 'full_text' column and every column listed in
            the module-level TARGET_COLS.

    Returns:
        Tuple ``(inputs, targets)``: whatever ``deberta_encode`` returns for
        the 'full_text' column, and a float32 array of the TARGET_COLS values.
    """
    encoded_texts = deberta_encode(df['full_text'])
    target_frame = df[TARGET_COLS]
    return encoded_texts, np.array(target_frame, dtype="float32")

In [5]:
class MeanPool(tf.keras.layers.Layer):
    """Average embeddings over axis 1, ignoring positions whose mask is 0.

    Assumes `inputs` is (batch, seq_len, hidden) and `mask` is
    (batch, seq_len) — TODO confirm against callers.
    """

    def call(self, inputs, mask=None):
        """Return the (masked) mean of `inputs` along axis 1.

        Args:
            inputs: Tensor to pool.
            mask: Optional tensor marking valid positions (non-zero = keep).
                When None, every position is treated as valid.

        Returns:
            Tensor with axis 1 reduced away.
        """
        if mask is None:
            # The signature advertises an optional mask, but casting None
            # fails; with no mask the masked mean is just a plain mean.
            return tf.reduce_mean(inputs, axis=1)

        # Add a trailing axis so the mask broadcasts across the feature axis.
        broadcast_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
        embedding_sum = tf.reduce_sum(inputs * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        # Floor the denominator so an all-zero mask row does not divide by zero.
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        return embedding_sum / mask_sum
class WeightsSumOne(tf.keras.constraints.Constraint):
    """Constraint that maps a weight tensor through a softmax along axis 0."""

    def __call__(self, w):
        """Return softmax(w) over the first axis, so entries along it sum to 1."""
        normalized = tf.nn.softmax(w, axis=0)
        return normalized

# v3base-1

In [31]:
# Tokenizer for the local DeBERTa-v3-base checkpoint; a copy is written to ./tokenizer/.
tokenizer = transformers.AutoTokenizer.from_pretrained("./input/deberta-v3-base")
tokenizer.save_pretrained('./tokenizer/')

# Model config from the same checkpoint: request all hidden states and set
# both dropout probabilities to 0.
cfg = transformers.AutoConfig.from_pretrained("./input/deberta-v3-base", output_hidden_states=True)
cfg.update({"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0})
# The modified config is saved into the same directory as the tokenizer files.
cfg.save_pretrained('./tokenizer/')



In [32]:
def deberta_encode(texts, tokenizer=tokenizer):
    """Tokenize a collection of texts into fixed-length id and mask arrays.

    All texts go to the tokenizer in a single batched call instead of one
    call per text. Every row is padded/truncated to MAX_LENGTH tokens.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that
            yields the raw strings.
        tokenizer: Hugging Face tokenizer; defaults to the module-level
            ``tokenizer`` bound when this function is defined.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of int32 arrays, each of shape
        ``(len(texts), MAX_LENGTH)``.
    """
    text_list = texts.tolist()
    if not text_list:
        # Nothing to tokenize: return correctly shaped empty arrays rather
        # than handing the tokenizer an empty batch.
        return (np.zeros((0, MAX_LENGTH), dtype="int32"),
                np.zeros((0, MAX_LENGTH), dtype="int32"))

    # A list of strings is treated by the tokenizer as a batch of single sequences.
    encoded = tokenizer(text_list,
                        add_special_tokens=True,
                        max_length=MAX_LENGTH,
                        return_attention_mask=True,
                        return_tensors="np",
                        truncation=True,
                        padding='max_length')

    input_ids = np.asarray(encoded['input_ids'], dtype="int32")
    attention_mask = np.asarray(encoded['attention_mask'], dtype="int32")
    return input_ids, attention_mask

In [33]:
# Competition test set; later cells read its 'full_text' column for
# tokenization and its 'text_id' column for the submission files.
test_df = pd.read_csv('./input/feedback-prize-english-language-learning/test.csv')

In [34]:
# Token length every text is padded/truncated to, and the batch size passed to predict().
MAX_LENGTH = 512
BATCH_SIZE = 8

# (input_ids, attention_mask) pair covering every row of the test set.
test_texts = test_df['full_text']
test_dataset = deberta_encode(test_texts)

In [37]:
def get_model_v3_large():
    """Build and compile the DeBERTa-based regression model.

    NOTE: despite the ``v3_large`` name, the backbone is loaded from
    "./input/deberta-v3-base" using the module-level ``cfg``.

    Steps, in order:
      1. Two int32 inputs of length MAX_LENGTH: token ids and attention mask.
      2. The last REINIT_LAYERS encoder block(s) are re-initialised
         (Dense kernels: Glorot uniform, Dense biases: zeros,
         LayerNormalization: beta zeros / gamma ones).
      3. Each of the last four hidden states is pooled with MeanPool, the
         results are stacked on axis 2 and combined by a bias-free Dense(1)
         whose kernel is constrained by WeightsSumOne.
      4. A linear Dense(6) produces the output (presumably one value per
         target column — TODO confirm).
      5. The model is compiled with MSE loss, an RMSE metric and a
         tfa MultiOptimizer that gives each backbone layer its own Adam
         optimizer with an exponentially decaying learning rate.

    Relies on ``deberta_output.hidden_states`` being populated, i.e. on
    ``cfg`` having been created with ``output_hidden_states=True``.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_masks]``.
    """
    input_ids = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids"
    )

    attention_masks = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_masks"
    )

    deberta_model = transformers.TFAutoModel.from_pretrained("./input/deberta-v3-base", config=cfg)

    # Last-layer (or partial) re-initialization of the encoder.
    # NOTE(review): the inference cells call load_weights() right after this
    # function, which presumably overwrites the re-initialised values — confirm.
#     Uncomment next three lines to check deberta encoder block
#     print('DeBERTa Encoder Block:')
#     for layer in deberta_model.deberta.encoder.layer:
#         print(layer)

    # Number of trailing encoder blocks whose weights are reset below.
    REINIT_LAYERS = 1
    normal_initializer = tf.keras.initializers.GlorotUniform()
    zeros_initializer = tf.keras.initializers.Zeros()
    ones_initializer = tf.keras.initializers.Ones()

#     print(f'\nRe-initializing encoder block:')
    for encoder_block in deberta_model.deberta.encoder.layer[-REINIT_LAYERS:]:
#         print(f'{encoder_block}')
        for layer in encoder_block.submodules:
            if isinstance(layer, tf.keras.layers.Dense):
                layer.kernel.assign(normal_initializer(shape=layer.kernel.shape, dtype=layer.kernel.dtype))
                if layer.bias is not None:
                    layer.bias.assign(zeros_initializer(shape=layer.bias.shape, dtype=layer.bias.dtype))

            elif isinstance(layer, tf.keras.layers.LayerNormalization):
                layer.beta.assign(zeros_initializer(shape=layer.beta.shape, dtype=layer.beta.dtype))
                layer.gamma.assign(ones_initializer(shape=layer.gamma.shape, dtype=layer.gamma.dtype))

    deberta_output = deberta_model.deberta(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = deberta_output.hidden_states

    # WeightedLayerPool + MeanPool of the last 4 hidden states:
    # each hidden state is mean-pooled with the attention mask, then the four
    # pooled tensors are stacked along a new axis 2.
    stack_meanpool = tf.stack(
        [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidden_states[-4:]], 
        axis=2)

    # Dense(1) acts on the last (stacked-layer) axis; WeightsSumOne passes its
    # kernel through a softmax over axis 0.
    weighted_layer_pool = layers.Dense(1,
                                       use_bias=False,
                                       kernel_constraint=WeightsSumOne())(stack_meanpool)

    # Drop the size-1 axis produced by Dense(1).
    weighted_layer_pool = tf.squeeze(weighted_layer_pool, axis=-1)
    output=layers.Dense(6,activation='linear')(weighted_layer_pool)
    #x = layers.Dense(6, activation='linear')(x)

    #output = layers.Rescaling(scale=4.0, offset=1.0)(x)
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

    # Compile model with Layer-wise Learning Rate Decay.
    # After reverse(), index 0 is the last encoder block and the final entry
    # is the embeddings layer.
    layer_list = [deberta_model.deberta.embeddings] + list(deberta_model.deberta.encoder.layer)
    layer_list.reverse()

    INIT_LR = 1e-5
    LLRDR = 0.9
    LR_SCH_DECAY_STEPS = 1600 # 2 * len(train_df) // BATCH_SIZE

    # Layer i (counted from the top of the encoder) starts at
    # INIT_LR * LLRDR ** i, so layers closer to the embeddings get a smaller
    # initial learning rate.
    lr_schedules = [tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=INIT_LR * LLRDR ** i, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3) for i in range(len(layer_list))]
    # Separate schedule for the head, with a larger initial learning rate (1e-4).
    lr_schedule_head = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-4, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3)

    optimizers = [tf.keras.optimizers.Adam(learning_rate=lr_sch) for lr_sch in lr_schedules]

    # NOTE(review): model.layers[-4:] is presumably the pooling/Dense head
    # built above — verify that the slice matches the intended layers.
    optimizers_and_layers = [(tf.keras.optimizers.Adam(learning_rate=lr_schedule_head), model.layers[-4:])] +\
        list(zip(optimizers, layer_list))

#     Uncomment next three lines to check optimizers_and_layers
#     print('\nLayer-wise Learning Rate Decay Initial LR:')
#     for o,l in optimizers_and_layers:
#         print(f'{o._decayed_lr("float32").numpy()} for {l}')

    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

    model.compile(optimizer=optimizer,
                 loss='mse',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    return model

In [44]:
# v3base-1 checkpoints: build a fresh model per fold, load that fold's
# weights and keep the test predictions of every fold.
fold_preds = []
for fold in range(N_FOLD):
    # Reset Keras global state left over from the previous fold's model.
    tf.keras.backend.clear_session()

    weights_path = f'./v3base-1/best_model_fold{fold}.h5'
    model_v3_large = get_model_v3_large()
    model_v3_large.load_weights(weights_path)

    print(f'\nFold {fold} inference...')
    pred = model_v3_large.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds += [pred]
    gc.collect()




Fold 0 inference...

Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


In [45]:
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Average the per-fold predictions element-wise, then bound every value to [1, 5].
preds = np.clip(np.mean(fold_preds, axis=0), 1, 5)

# Put the text ids next to the predicted columns and write this model's file.
id_frame = test_df[['text_id']]
pred_frame = pd.DataFrame(preds, columns=TARGET_COLS)
sub_df = pd.concat([id_frame, pred_frame], axis=1)
sub_df.to_csv('submission0.csv', index=False)
# 0.4553  (number recorded by the author; presumably the score of this submission — TODO confirm)

# v3base-2

In [41]:
# v3base-2 checkpoints: build a fresh model per fold, load that fold's
# weights and keep the test predictions of every fold.
fold_preds = []
for fold in range(N_FOLD):
    # Reset Keras global state left over from the previous fold's model.
    tf.keras.backend.clear_session()

    weights_path = f'./v3base-2/best_model_fold{fold}.h5'
    model_v3_large = get_model_v3_large()
    model_v3_large.load_weights(weights_path)

    print(f'\nFold {fold} inference...')
    pred = model_v3_large.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds += [pred]
    gc.collect()




Fold 0 inference...

Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


In [43]:
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Average the per-fold predictions element-wise, then bound every value to [1, 5].
preds = np.clip(np.mean(fold_preds, axis=0), 1, 5)

# Put the text ids next to the predicted columns and write this model's file.
id_frame = test_df[['text_id']]
pred_frame = pd.DataFrame(preds, columns=TARGET_COLS)
sub_df = pd.concat([id_frame, pred_frame], axis=1)
sub_df.to_csv('submission1.csv', index=False)
# 0.4566  (number recorded by the author; presumably the score of this submission — TODO confirm)

# Ensemble: weighted average of the v3base-1 and v3base-2 submissions

In [48]:
submission = pd.read_csv('./input/feedback-prize-english-language-learning/sample_submission.csv')

# Blend weights, in the same order as the per-model submission files below.
# Naming them once keeps the numerator and denominator in sync.
w1, w2 = 1.1, 1.0

sub1 = pd.read_csv('submission0.csv')[TARGET_COLS] * w1
sub2 = pd.read_csv('submission1.csv')[TARGET_COLS] * w2

# Weighted mean: weighted sum of the predictions divided by the total weight.
ens = (sub1 + sub2) / (w1 + w2)

# Overwrite the target columns of the sample submission and write the final file.
submission[TARGET_COLS] = ens
submission.to_csv('submission.csv', index=False)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.833304,2.703496,3.02305,2.91003,2.64596,2.646942
1,000BAD50D026,2.59724,2.443963,2.693636,2.34145,2.138072,2.560819
2,00367BB2546B,3.546981,3.35439,3.570885,3.499786,3.357464,3.287691
