# Pretrained Embeddings

## Load libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

import tensorflow as tf
import tensorflow_addons as tfa

import transformers

from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

from datasets import Dataset

import os
import gc
import sys
from tqdm.notebook import tqdm

## Set Configs

In [2]:
CONFIG = {
        'folds': 5,
        'seed': 101,

        # Kaggle embeds
        'debertav3large_npy': '../input/fb3-save-pretrained-embeddings/debertav3large_FB3.npy',
        'distilrobertabase_npy': '../input/fb3-save-pretrained-embeddings/distilrobertabase_FB3.npy',

        'debertav3large': '../input/deberta-v3-large/deberta-v3-large/',
        'distilrobertabase': '../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base',

        'batch_size': 4,
        'max_len': 512}

## Obtain Data

In [3]:
train = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")
test = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv")

tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']

print(train.shape)
print(test.shape)

(3911, 8)
(3, 2)


## Create Folds

In [4]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

train["kfold"] = -1

mskf = MultilabelStratifiedKFold(n_splits=CONFIG['folds'], shuffle=True, random_state=CONFIG['seed'])
data_labels = train[tgtCols].values

for f, (t_, v_) in enumerate(mskf.split(train, data_labels)):
    train.loc[v_, "kfold"] = f + 1

In [5]:
train['kfold'].value_counts().sort_index()

1    782
2    782
3    782
4    782
5    783
Name: kfold, dtype: int64

## Custom functions

### Competition metric

In [6]:
def mcrmse(y_true,y_pred):
    rmse_scores = []
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i])))
    return np.mean(rmse_scores)

### Data process functions

In [7]:
def hf_encode(texts, chkpt):
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(CONFIG[chkpt])
    tokenizer.save_pretrained('./tokenizer/')

    input_ids = []
    attention_mask = []
    
    for text in texts.tolist():
        token = tokenizer(text, 
                          add_special_tokens=True, 
                          max_length=CONFIG['max_len'], 
                          return_attention_mask=True, 
                          return_tensors="np", 
                          truncation=True, 
                          padding='max_length')
        input_ids.append(token['input_ids'][0])
        attention_mask.append(token['attention_mask'][0])
    
    return np.array(input_ids, dtype="int32"), np.array(attention_mask, dtype="int32")

In [8]:
def get_dataset(df):
    inputs = hf_encode(df['full_text'])
    targets = np.array(df[tgtCols], dtype="float32")
    return inputs, targets

In [9]:
def pickle_dump(path, saveobj):
    import pickle
    filehandler = open(path,"wb")
    pickle.dump(saveobj,filehandler)
#     print("File pickled")
    filehandler.close()

In [10]:
def pickle_load(path):
    import pickle
    file = open(path,'rb')
    loadobj = pickle.load(file)
    file.close()
    return loadobj

### Transformer embeddings

In [11]:
class MeanPool(tf.keras.layers.Layer):
    def call(self, inputs, mask=None):
        broadcast_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
        embedding_sum = tf.reduce_sum(inputs * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        return embedding_sum / mask_sum

In [12]:
def get_model(chkpt):
    
    cfg = transformers.AutoConfig.from_pretrained(CONFIG[chkpt], output_hidden_states=True)
    cfg.hidden_dropout_prob = 0
    cfg.attention_probs_dropout_prob = 0
    cfg.save_pretrained('./tokenizer/')
    
    input_ids = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="input_ids"
    )
    
    attention_masks = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="attention_masks"
    )
    
    try:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg)
    except:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg, from_pt=True)
        
    output = model(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = output.hidden_states
    
    output = tf.stack(
        [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidden_states[-1:]], 
        axis=2)
    
    output = tf.squeeze(output, axis=-1)
    
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

    model.compile(optimizer="adam",
                 loss='huber_loss',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    
    return model

In [13]:
def pretrain_embeddings(chkpt, df):
    
    model = get_model(chkpt)
    dataset = hf_encode(df['full_text'], chkpt)
    preds = model.predict(dataset, batch_size=CONFIG['batch_size'])
    
    del model, dataset
    _ = gc.collect()
    
    return preds

## Model Training

In [14]:
train_dataset = np.load(CONFIG['debertav3large_npy'])
train_dataset = np.concatenate([train_dataset, np.load(CONFIG['distilrobertabase_npy'])], axis=1)

train_dataset.shape

(3911, 1792)

In [15]:
scores = []

for fold in range(1,CONFIG['folds']):

    print('#'*25)
    print(f'## Fold {fold}')
    print('#'*25)

    trn_idx = train[train['kfold']!=fold].index.values
    val_idx = train[train['kfold']==fold].index.values

    X_train = train_dataset[trn_idx,:]
    X_valid = train_dataset[val_idx,:]

    y_train = train[train['kfold']!=fold][tgtCols].copy()
    y_valid = train[train['kfold']==fold][tgtCols].copy()

    val_preds = np.zeros((len(val_idx),6))

    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        clf = SVR(C=1)
        clf.fit(X_train, y_train[tgt].values)
        pickle_dump(f"./SVR_tgt{tgt}_fold{fold}.pkl", clf)
        val_preds[:,i] = clf.predict(X_valid)

    score = mcrmse(y_valid[tgtCols].values, val_preds)
    scores.append(score)
    print("Fold : {} RSME score: {}".format(fold,score))

    print('#'*25)
    print('Overall CV RSME =',np.mean(scores))

#########################
## Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RSME score: 0.4574948104976467
#########################
Overall CV RSME = 0.4574948104976467
#########################
## Fold 2
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RSME score: 0.4532811299221495
#########################
Overall CV RSME = 0.4553879702098981
#########################
## Fold 3
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RSME score: 0.45189874801482954
#########################
Overall CV RSME = 0.4542248961448752
#########################
## Fold 4
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RSME score: 0.45774089350253444
#########################
Overall CV RSME = 0.45510389548429003


In [16]:
del train_dataset
_ = gc.collect()

## Model Inference

In [None]:
test_dataset = pretrain_embeddings('debertav3large', test)
test_dataset = np.concatenate([test_dataset, pretrain_embeddings('distilrobertabase', test)], axis=1)

test_dataset.shape

In [18]:
fold_preds = []

for fold in range(1,CONFIG['folds']):

    print('#'*25)
    print(f'## Fold {fold}')
    print('#'*25)
    
    test_preds = np.zeros((len(test_dataset),6))
    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        model = pickle_load(f"./SVR_tgt{tgt}_fold{fold}.pkl")
        test_preds[:,i] = model.predict(test_dataset)
    
    fold_preds.append(test_preds)
    
    del model
    _ = gc.collect()

#########################
## Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , #########################
## Fold 2
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , #########################
## Fold 3
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , #########################
## Fold 4
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 

In [19]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [20]:
sub_df = pd.concat([test[['text_id']], pd.DataFrame(preds, columns=tgtCols)], axis=1)
sub_df.to_csv('submission.csv', index=False)

In [21]:
sub_df.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.96472,2.777844,3.095375,2.986499,2.722708,2.646358
1,000BAD50D026,2.589157,2.395107,2.672813,2.292683,2.015081,2.550648
2,00367BB2546B,3.499556,3.379709,3.526141,3.585186,3.3596,3.28833
