In [None]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import backend as K

import transformers
from transformers import AutoTokenizer
from transformers import AutoConfig, TFAutoModel

from data.utils import seed_all, length_plot
from data.text import tokenized_text_normalize, vlsp_impute
from data.datasets import data_generator
from trainer.model import build_model, scheduler

# Report the library versions this run executes against.
version_lines = (
    ('Using Tensorflow version:', tf.__version__),
    ('Using Transformers version:', transformers.__version__),
)
for label, version in version_lines:
    print(label, version)

In [None]:
# Fix the random seed before any data or model work so runs are repeatable.
# NOTE(review): presumably seeds the Python / NumPy / TensorFlow RNGs — confirm in data.utils.
seed_all(1512)

In [None]:
# Load the fold-annotated training set and the private test set, forcing the
# text column to `str` so the downstream text helpers always receive strings.
text_dtype = {"post_message": str}

train_df = pd.read_csv("../data/final_data/train_5_folds.csv").astype(text_dtype)
test_df = pd.read_csv("../data/final_data/private_test.csv").astype(text_dtype)

In [None]:
# Run the same text normalisation over the message column of both splits.
for frame in (train_df, test_df):
    frame["post_message"] = frame["post_message"].apply(tokenized_text_normalize)

In [None]:
# Apply the project's imputation helper to both splits (train first, then test).
train_df, test_df = map(vlsp_impute, (train_df, test_df))

In [None]:
# Sequence length handed to both the model builder and the data pipeline (`max_len=`).
MAX_LEN = 256
# Batch size for the data pipeline; also used to derive the steps per epoch.
BATCH_SIZE = 24

In [None]:
# Hugging Face checkpoint identifier; passed to build_model() and used to load the tokenizer.
roberta = 'vinai/phobert-base' 
# Tokenizer for that checkpoint; handed to data_generator() when building the fold datasets.
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta)

In [None]:
# Build the model once to print its architecture.
# NOTE(review): this instance is created outside `strategy.scope()` and the
# name `model` is rebound inside the fold loop, so it only serves as a preview.
model = build_model(roberta,max_len=MAX_LEN)
model.summary()

In [None]:
# Fold count shown in the progress banner; the folds actually iterated come
# from the `fold` column of train_df.
n_splits = 5
# Number of epochs passed to model.fit() for every fold.
n_epochs = 5

# Display/verbosity switch; use 1 for interactive runs.
# NOTE(review): presumably a Keras `verbose` level — confirm where it is consumed.
DISPLAY=1
# Experiment tag; it becomes part of the output directory name.
exp = f'phobert+auxiliary_{MAX_LEN}_len'

# Directory that receives the per-fold checkpoints; created if missing.
output_dir = f'../outputs/{exp}_models'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Distribution strategy whose scope wraps model construction in the fold loop.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Cross-validated training: build a fresh model per fold, train it on the
# remaining folds and keep only the weights of the best epoch (highest
# `val_auc`) under `output_dir`.
for fold in sorted(train_df["fold"].unique()):
    print('*'*100)
    print(f'FOLD: {fold+1}/{n_splits}')

    # Reset Keras global state left over from the previous fold.
    K.clear_session()
    with strategy.scope():
        # The model is created inside the strategy scope so its variables
        # are placed according to the distribution strategy.
        model = build_model(roberta, max_len=MAX_LEN)

    reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

    model_dir = os.path.join(output_dir, f'Fold_{fold+1}.h5')

    # NOTE(review): monitoring `val_auc` assumes build_model() compiles the
    # model with a metric named `auc` — confirm in trainer.model.
    sv = tf.keras.callbacks.ModelCheckpoint(
        model_dir,
        monitor='val_auc',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode='max',
        save_freq='epoch',
    )

    # Hold out the current fold for validation, train on all the others.
    train_df_ = train_df[train_df["fold"] != fold]
    val_df_ = train_df[train_df["fold"] == fold]
    train_dataset, valid_dataset = data_generator(
        train_df_,
        val_df_,
        roberta_tokenizer,
        max_len=MAX_LEN,
        batch_size=BATCH_SIZE,
    )

    # Ceiling division: `n // BATCH_SIZE + 1` schedules one step too many
    # whenever the number of rows is an exact multiple of the batch size.
    n_steps = (train_df_.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE
    train_history = model.fit(
        train_dataset,
        steps_per_epoch=n_steps,
        callbacks=[sv, reduce_lr],
        validation_data=valid_dataset,
        epochs=n_epochs,
        # Honour the notebook-level display switch instead of leaving it unused.
        verbose=DISPLAY,
    )

In [None]:
# Instantiate the bare transformer backbone with attention maps and hidden
# states exposed in its outputs. The checkpoint name is taken from `roberta`
# so it cannot drift from the one used for the tokenizer and build_model().
# NOTE(review): `from_config` only builds the architecture — it does not load
# the pretrained weights (use `from_pretrained` for that). Confirm this is intended.
# NOTE(review): this rebinds `model`, dropping the reference to the model
# trained in the last fold.
config = AutoConfig.from_pretrained(
    roberta,
    output_attentions=True,
    output_hidden_states=True,
    use_cache=True,
)
model = TFAutoModel.from_config(config)

In [None]:
# Encode the test set and wrap it in a batched tf.data pipeline.
# NOTE(review): `regular_encode` is neither defined nor imported in the visible
# part of this notebook — make sure it is in scope before this cell runs.
X_test = regular_encode(test_df, max_len=MAX_LEN)
# All-zero placeholder labels of shape (n_rows, 1), paired with the encoded inputs.
y_test = np.zeros((len(test_df), 1))
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE)
)