In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold

import tensorflow as tf
from tensorflow.keras import backend as K
from transformers import AutoTokenizer, TFAutoModel

from data.datasets import regular_encode, data_generator
from data.tokenizer import VnCoreTokenizer
from trainer.model import build_model
from utils import *

from tqdm.notebook import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
# --- Experiment configuration (read by the cells below) ---------------------

# Cross-validation / reproducibility
SEED = 1710      # passed to seed_all() here and to KFold(random_state=...)
N_SPLITS = 5     # number of KFold splits

# Model input and optimisation
MAX_LEN = 256    # forwarded as max_len / maxlen to build_model and the encoders
BATCH_SIZE = 24  # used for steps_per_epoch and for batching the test dataset
N_EPOCHS = 5     # epochs per fold

# Reporting
DISPLAY = 1  # USE display=1 FOR INTERACTIVE

# Seed everything once, before any data or model object is created.
seed_all(SEED)

In [3]:
# Word segmenter applied to every post before it reaches the BERT tokenizer.
vncore_tokenizer = VnCoreTokenizer()

# Raw warm-up data (Excel, indexed by the "id" column).
warmup_train_df = pd.read_excel("../data/raw_data/warmup_training_dataset.xlsx", index_col="id")
warmup_test_df = pd.read_excel("../data/raw_data/warmup_test_set.xlsx", index_col="id")

# Public competition data (CSV, default integer index).
public_train_df = pd.read_csv("../data/tokenize_data/public_train.csv")
public_test_df = pd.read_csv("../data/tokenize_data/public_test.csv")

# TODO: make use of warmup_test_df
# NOTE(review): the warm-up frame is indexed by "id" while the public frame keeps
# a default index, so the concatenated index is not unique -- confirm this is
# intended (the K-fold split below is positional, via .iloc).
train_df = pd.concat([warmup_train_df, public_train_df]).drop_duplicates()
test_df = public_test_df.copy()

# Replace missing messages with empty strings. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: that chained in-place form
# is deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.
train_df["post_message"] = train_df["post_message"].fillna("")
# The test messages go through the same tokenisation and `.split()` calls as the
# training ones, so they need the same NaN guard.
test_df["post_message"] = test_df["post_message"].fillna("")

In [4]:
%%time
train_df["post_message"] = train_df["post_message"].progress_apply(vncore_tokenizer.tokenize)
test_df["post_message"] = test_df["post_message"].progress_apply(vncore_tokenizer.tokenize)

train_len_word = [len(text.split()) for text in train_df.post_message]
test_len_word = [len(text.split()) for text in test_df.post_message]
test_len_char = [len(text) for text in train_df.post_message]
test_len_char = [len(text) for text in test_df.post_message]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5172.0), HTML(value='')))




KeyboardInterrupt: 

In [6]:
# Directory of the pretrained PhoBERT checkpoint. The original machine-specific
# location stays as the default; set the PHOBERT_DIR environment variable to run
# the notebook on another machine without editing this cell.
bert = os.environ.get(
    "PHOBERT_DIR",
    "/home/leonard/leonard/vlsp/ReINTEL/pretrained_phobert-base",
)

# Build one model up front to inspect the architecture, and load the matching
# tokenizer from the same checkpoint.
model = build_model(bert, max_len=MAX_LEN)
bert_tokenizer = AutoTokenizer.from_pretrained(bert)

model.summary()

# Experiment tag: names the weights directory here and the submission CSV later.
exp = f'phobert_{MAX_LEN}_len_{N_SPLITS}_folds'

# Seeded, shuffled K-fold splitter used by the training loop.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# One weights file per fold is written into this directory.
output_dir = f'../{exp}_models'
os.makedirs(output_dir, exist_ok=True)

Some layers from the model checkpoint at /home/leonard/leonard/vlsp/ReINTEL/pretrained_phobert-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at /home/leonard/leonard/vlsp/ReINTEL/pretrained_phobert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
Special tokens have been added in the vocabulary, make sure the associated word e

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
bert_input_id (InputLayer)      [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_attention_mask (InputLayer [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_token_type_ids (InputLayer [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo ((None, 256, 768), ( 134998272   bert_input_id[0][0]              
                                                                 bert_attention_mask[0]

In [8]:
def scheduler(epoch):
    """Return the learning rate for the given zero-based epoch index.

    The rate starts at 3e-5 and is multiplied by 0.2 for every epoch that
    has elapsed, i.e. ``3e-5 * 0.2 ** epoch``.
    """
    initial_lr = 3e-5
    decay_per_epoch = 0.2
    return initial_lr * decay_per_epoch ** epoch


# K-fold training: one freshly built model per fold, with the best weights of
# each fold (highest `val_auc`) written to `output_dir`.
for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    fold_no = fold + 1
    print('*' * 100)
    print(f'FOLD: {fold_no}/{N_SPLITS}')

    # Reset Keras global state, then build a new model for this fold.
    K.clear_session()
    model = build_model(bert, max_len=MAX_LEN)

    # Keep only the best epoch's weights for this fold.
    weights_path = os.path.join(output_dir, f'Fold_{fold_no}.h5')
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        weights_path,
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
        save_freq='epoch',
        verbose=1,
    )
    # Per-epoch learning rate taken from `scheduler`.
    lr_schedule_cb = tf.keras.callbacks.LearningRateScheduler(scheduler)

    # Positional split of the training frame into this fold's train / validation rows.
    fold_train_df = train_df.iloc[train_idx]
    fold_val_df = train_df.iloc[val_idx]
    train_dataset, valid_dataset = data_generator(
        fold_train_df, fold_val_df, bert_tokenizer, maxlen=MAX_LEN
    )

    # Whole batches per epoch; a trailing partial batch is not counted.
    steps_per_epoch = len(fold_train_df) // BATCH_SIZE

    # `# tb` placeholder from the original callback list: no such callback is defined.
    train_history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=N_EPOCHS,
        steps_per_epoch=steps_per_epoch,
        callbacks=[checkpoint_cb, lr_schedule_cb],
    )

****************************************************************************************************
FOLD: 1/5


Some layers from the model checkpoint at /home/leonard/leonard/vlsp/ReINTEL/pretrained_phobert-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at /home/leonard/leonard/vlsp/ReINTEL/pretrained_phobert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


NameError: name 'roberta_tokenizer' is not defined

In [None]:
# Encode the test messages with the BERT tokenizer.
test_messages = test_df['post_message'].values
X_test = regular_encode(test_messages, bert_tokenizer, maxlen=MAX_LEN)

# All-zero placeholder labels, one row per test example, so the dataset yields
# (inputs, labels) pairs.
y_test = np.zeros(shape=(test_df.shape[0], 1))

# Slice into examples, then group them into batches of BATCH_SIZE.
unbatched_test = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = unbatched_test.batch(BATCH_SIZE)

In [None]:
# Build the network once; each fold's weights are loaded into it in turn.
model = build_model(bert, max_len=MAX_LEN)
preds = []

# Use only the per-fold weight files written by the training loop
# ("Fold_<n>.h5"). os.listdir() returns every entry in arbitrary order, so
# filter by extension and sort to make the run deterministic.
weight_files = sorted(
    file_name for file_name in os.listdir(output_dir) if file_name.endswith('.h5')
)
if not weight_files:
    # Fail with a clear message instead of averaging an empty list into NaN.
    raise FileNotFoundError(f'No .h5 weight files found in {output_dir}')

for file_name in weight_files:
    print('_'*80)

    K.clear_session()
    model_path = os.path.join(output_dir, file_name)

    print(f'Inferencing with model from: {model_path}')
    model.load_weights(model_path)

    # `test_dataset` is already batched with BATCH_SIZE; Keras documents that
    # `batch_size` must not be given for tf.data.Dataset inputs, so it is
    # omitted here.
    pred = model.predict(test_dataset, verbose=DISPLAY)
    preds.append(pred)

# Ensemble by averaging the per-fold predictions element-wise.
preds = np.mean(preds, axis=0)

test_df["prediction"] = preds

# One prediction per row, keyed by the frame's index, without a header line.
test_df["prediction"].to_csv(f"{exp}.csv", header=False)