In [None]:
!pip install --upgrade pip
!pip install transformers==4.16 --quiet
!pip install vncorenlp==1.0.3 --quiet
!sudo apt-get install git-lfs
!pip install sentencepiece --quiet
!pip install tokenizer --quiet
!pip install underthesea --quiet

In [None]:
import json
import re
import string

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report
from underthesea import text_normalize

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import optim

from transformers import AutoModel, AutoTokenizer, get_scheduler
from tqdm.auto import tqdm
from functools import partial

import seaborn as sns
import matplotlib.pyplot as plt

import requests
import gc
import random

from torch.utils.data import TensorDataset

In [None]:
# Register `progress_apply` on pandas objects; the preprocessing cells below use it.
tqdm.pandas()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Seed handed to fix_seed() for the Python, NumPy and PyTorch random generators.
SEED = 42

In [None]:
def fix_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    # Deterministic kernels are left disabled on purpose:
    # torch.use_deterministic_algorithms(True)

# Seed every RNG once, before any data is loaded or any model is built.
fix_seed(SEED)

# Dataset

## UIT-VSFC

In [None]:
# UIT train / validation / test splits. Later cells read the `content` (text)
# and `sentiment` (label) columns of these frames.
uit_train_data = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uit/uit_train_data.csv')
uit_val_data = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uit/uit_val_data.csv')
uit_test_data = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uit/uit_test_data.csv')

## UET Data

In [None]:
# Suffix `_1` = the "uetcfs" dataset splits.
train_data_1 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uetcfs/uetcfs_train_data.csv')
val_data_1 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uetcfs/uetcfs_val_data.csv')
test_data_1 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uetcfs/uetcfs_test_data.csv')

# Suffix `_2` = the "sguet" dataset splits.
train_data_2 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/sguet/sguet_train_data.csv')
val_data_2 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/sguet/sguet_val_data.csv')
test_data_2 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/sguet/sguet_test_data.csv')

## Data Preprocessing

In [None]:
!git clone https://github.com/vncorenlp/VnCoreNLP

In [None]:
from vncorenlp import VnCoreNLP

# paste path to VnCoreNLP-1.1.1.jar
# Only the word-segmentation annotator ("wseg") is requested; the heap option
# passed to the Java process is '-Xmx500m'.
rdrsegmenter = VnCoreNLP("/kaggle/working/VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

In [None]:
def word_segment(text):
    """Normalize `text` and word-segment it with VnCoreNLP.

    Args:
        text: Raw input string.

    Returns:
        One string in which the tokens of every sentence returned by
        `rdrsegmenter.tokenize` are separated by single spaces.
    """
    sentences = rdrsegmenter.tokenize(text_normalize(text))
    # Sentences must be joined with a space too: joining them with "" glued the
    # last token of one sentence to the first token of the next.
    return " ".join(" ".join(sentence) for sentence in sentences)

In [None]:
def preprocess_text(text):
    """Clean a raw text string and word-segment it.

    The substitutions run in order: strip HTML-like tags, collapse runs of a
    repeated ASCII letter to a single letter, replace non-word characters with
    spaces, drop isolated single ASCII letters, and blank out every token that
    contains a digit. The result is stripped, whitespace-collapsed, lower-cased
    and passed to `word_segment`.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-cased, word-segmented string.
    """
    substitutions = (
        (r'<[^>]*>', '', 0),
        (r'([A-Z])\1+', lambda m: m.group(1).upper(), re.IGNORECASE),
        (r'\W', ' ', 0),
        (r'\s+[a-zA-Z]\s+', ' ', 0),
        # NOTE(review): a literal '^' cannot survive the \W step above, so this
        # pattern looks like a no-op; an anchored `^` was probably intended.
        (r'\^[a-zA-Z]\s+', ' ', 0),
        (r'\w*\d\w*', ' ', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    cleaned = re.sub(r'\s+', ' ', cleaned.strip(), flags=re.I)
    return word_segment(cleaned.lower())

In [None]:
# Clean and word-segment the text of every UIT split (overwrites `content` in place).
for uit_split in (uit_train_data, uit_val_data, uit_test_data):
    uit_split['content'] = uit_split.content.progress_apply(preprocess_text)

In [None]:
# Clean and word-segment the text of every sguet split (overwrites `content` in place).
for sguet_split in (train_data_2, val_data_2, test_data_2):
    sguet_split['content'] = sguet_split.content.progress_apply(preprocess_text)

In [None]:
# Clean and word-segment the text of every uetcfs split (overwrites `content` in place).
for uetcfs_split in (train_data_1, val_data_1, test_data_1):
    uetcfs_split['content'] = uetcfs_split.content.progress_apply(preprocess_text)

In [None]:
# Tokenizer for the same "vinai/phobert-base" checkpoint that PhoBertBase loads.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [None]:
def encoding(data, tokenizer, max_token_len=128):
    """Tokenize `data['content']` into fixed-length tensors.

    Every text is encoded on its own with `tokenizer.encode_plus`, truncated and
    padded to `max_token_len`, and the per-text results are concatenated along
    dimension 0.

    Args:
        data: Mapping/DataFrame with a `content` and a `sentiment` entry.
        tokenizer: Object exposing `encode_plus` that returns a dict with
            "input_ids" and "attention_mask" tensors.
        max_token_len: Length every sequence is truncated/padded to.

    Returns:
        Tuple `(input_ids, attention_masks, sentiment)` of tensors.
    """
    encoded_rows = [
        tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=max_token_len,
            padding="max_length",
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in data['content']
    ]

    input_ids = torch.cat([row["input_ids"] for row in encoded_rows], dim=0)
    attention_masks = torch.cat([row["attention_mask"] for row in encoded_rows], dim=0)
    sentiment = torch.tensor(np.array(data['sentiment']))
    return input_ids, attention_masks, sentiment

In [None]:
# Batch size of the training DataLoaders that are fed to compute_gram().
BATCH_SIZE_FOR_GRAMS = 16

In [None]:
# hyperparameters
MAX_LEN = 256  # token length every text is truncated/padded to in encoding()
BATCH_SIZE = 16  # batch size of the test DataLoaders
# NOTE(review): EPOCHS and LEARNING_RATE are not referenced anywhere else in the
# visible notebook (no training happens here) - confirm they are still needed.
EPOCHS = 5
LEARNING_RATE = 5e-5

In [None]:
# Build one non-shuffled test DataLoader per dataset; the batch order therefore
# matches the row order of the corresponding test DataFrame.
# uetcfs
test_input_ids_1, test_attention_masks_1, test_encoded_labels_1 = encoding(test_data_1, tokenizer, max_token_len=MAX_LEN)
test_dataset_1 = TensorDataset(test_input_ids_1, test_attention_masks_1, test_encoded_labels_1)
test_dataloader_1 = DataLoader(test_dataset_1, batch_size=BATCH_SIZE, shuffle=False)

# sguet
test_input_ids_2, test_attention_masks_2, test_encoded_labels_2 = encoding(test_data_2, tokenizer, max_token_len=MAX_LEN)
test_dataset_2 = TensorDataset(test_input_ids_2, test_attention_masks_2, test_encoded_labels_2)
test_dataloader_2 = DataLoader(test_dataset_2, batch_size=BATCH_SIZE, shuffle=False)

# uit
test_input_ids_3, test_attention_masks_3, test_encoded_labels_3 = encoding(uit_test_data, tokenizer, max_token_len=MAX_LEN)
test_dataset_3 = TensorDataset(test_input_ids_3, test_attention_masks_3, test_encoded_labels_3)
test_dataloader_3 = DataLoader(test_dataset_3, batch_size=BATCH_SIZE, shuffle=False)

# Drop the intermediate names. The datasets were handed to the DataLoaders
# above, so this removes only the notebook-level names.
del test_input_ids_1, test_attention_masks_1, test_encoded_labels_1
del test_input_ids_2, test_attention_masks_2, test_encoded_labels_2
del test_input_ids_3, test_attention_masks_3, test_encoded_labels_3
del test_dataset_1, test_dataset_2, test_dataset_3
gc.collect()

# Model

In [None]:
class PhoBertBase(nn.Module):
    """"vinai/phobert-base" encoder followed by a two-layer classification head.

    Args:
        n_classes: Size of the output layer.
        drop_out: Dropout probability applied between the two head layers.
    """

    def __init__(self, n_classes, drop_out=0.1):
        super().__init__()
        # Attribute names become state_dict keys; the saved teacher checkpoints
        # are loaded by name, so they must not be renamed.
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.l1 = nn.Linear(768, 256)
        self.l2 = nn.Linear(256, n_classes)
        self.d1 = nn.Dropout(drop_out)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the `n_classes` scores for a batch; `labels` is accepted but unused."""
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Second element of the encoder output (presumably the pooled
        # representation for this Hugging Face model - confirm).
        pooled = encoder_outputs[1]
        hidden = self.d1(self.l1(pooled))
        return self.l2(hidden)

In [None]:
# UIT training data, used only to compute the Gram matrices of the UIT teacher.
train_input_ids, train_attention_masks, train_encoded_labels = encoding(uit_train_data, tokenizer, max_token_len=MAX_LEN)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_encoded_labels)
train_dataloader_uit = DataLoader(train_dataset, batch_size=BATCH_SIZE_FOR_GRAMS, shuffle=True)
# Number of training examples; the merge cell divides the UIT Gram matrices by it.
len_uit = len(uit_train_data)

# The source frames are deleted here, so this cell cannot be re-run without
# re-running the loading and preprocessing cells first.
del train_input_ids, train_attention_masks, train_encoded_labels
del uit_train_data, uit_val_data
gc.collect()

In [None]:
# UIT teacher: PhoBertBase restored from its fine-tuned checkpoint.
teacher_model_uit = PhoBertBase(n_classes=3, drop_out=0.2)
# map_location lets a checkpoint that was saved from GPU tensors load when
# `device` is the CPU fallback.
best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_uit_base_42.pt', map_location=device)
# teacher_model_uit = PhoBertLarge(n_classes=3, drop_out=0.3)
# best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_uit_large_42.pt')
load_result = teacher_model_uit.load_state_dict(best_model_cp, strict=False)
# strict=False would otherwise hide a checkpoint/architecture mismatch, leaving
# part of the teacher randomly initialised without any warning.
if load_result.missing_keys or load_result.unexpected_keys:
    print(f'UIT teacher checkpoint mismatch - missing: {load_result.missing_keys}, '
          f'unexpected: {load_result.unexpected_keys}')
teacher_model_uit.to(device)

In [None]:
# sguet training data, used only to compute the Gram matrices of the sguet teacher.
train_input_ids, train_attention_masks, train_encoded_labels = encoding(train_data_2, tokenizer, max_token_len=MAX_LEN)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_encoded_labels)
train_dataloader_sguet = DataLoader(train_dataset, batch_size=BATCH_SIZE_FOR_GRAMS, shuffle=True)
# Number of training examples; the merge cell divides the sguet Gram matrices by it.
len_sguet = len(train_data_2)

# The source frames are deleted here, so this cell cannot be re-run without
# re-running the loading and preprocessing cells first.
del train_input_ids, train_attention_masks, train_encoded_labels
del train_data_2, val_data_2
gc.collect()

In [None]:
# sguet teacher: PhoBertBase restored from its fine-tuned checkpoint.
teacher_model_sguet = PhoBertBase(n_classes=3, drop_out=0.2)
# map_location lets a checkpoint that was saved from GPU tensors load when
# `device` is the CPU fallback.
best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_sguet_base_42_bs16.pt', map_location=device)
# teacher_model_sguet = PhoBertLarge(n_classes=3, drop_out=0.5)
# best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_sguet_large_42.pt')
load_result = teacher_model_sguet.load_state_dict(best_model_cp, strict=False)
# strict=False would otherwise hide a checkpoint/architecture mismatch, leaving
# part of the teacher randomly initialised without any warning.
if load_result.missing_keys or load_result.unexpected_keys:
    print(f'sguet teacher checkpoint mismatch - missing: {load_result.missing_keys}, '
          f'unexpected: {load_result.unexpected_keys}')
teacher_model_sguet.to(device)

In [None]:
# uetcfs training data, used only to compute the Gram matrices of the uetcfs teacher.
train_input_ids, train_attention_masks, train_encoded_labels = encoding(train_data_1, tokenizer, max_token_len=MAX_LEN)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_encoded_labels)
train_dataloader_uetcfs = DataLoader(train_dataset, batch_size=BATCH_SIZE_FOR_GRAMS, shuffle=True)
# Number of training examples; the merge cell divides the uetcfs Gram matrices by it.
len_uetcfs = len(train_data_1)

# The source frames are deleted here, so this cell cannot be re-run without
# re-running the loading and preprocessing cells first.
del train_input_ids, train_attention_masks, train_encoded_labels
del train_data_1, val_data_1
gc.collect()

In [None]:
# uetcfs teacher: PhoBertBase restored from its fine-tuned checkpoint.
teacher_model_uetcfs = PhoBertBase(n_classes=3)
# map_location lets a checkpoint that was saved from GPU tensors load when
# `device` is the CPU fallback.
best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_uetcfs_base_42.pt', map_location=device)
# teacher_model_uetcfs = PhoBertLarge(n_classes=3, drop_out=0.2)
# best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_uetcfs_large_42.pt')
load_result = teacher_model_uetcfs.load_state_dict(best_model_cp, strict=False)
# strict=False would otherwise hide a checkpoint/architecture mismatch, leaving
# part of the teacher randomly initialised without any warning.
if load_result.missing_keys or load_result.unexpected_keys:
    print(f'uetcfs teacher checkpoint mismatch - missing: {load_result.missing_keys}, '
          f'unexpected: {load_result.unexpected_keys}')
teacher_model_uetcfs.to(device)

# Compute inner-product matrices

In [None]:
def filter_params_to_merge(param_names, exclude_param_regex=None):
    """Return the parameter names that should take part in merging.

    Args:
        param_names: Iterable of parameter names.
        exclude_param_regex: Optional regex pattern, or iterable of patterns.
            A name is dropped when any pattern matches it with `re.match`
            (i.e. anchored at the start of the name). `None` or an empty
            iterable keeps every name.

    Returns:
        List of the kept names, in their original order.
    """
    # The argument used to be accepted but silently ignored, so callers that
    # asked for an exclusion still got every parameter back.
    if exclude_param_regex is None:
        patterns = []
    elif isinstance(exclude_param_regex, str):
        patterns = [exclude_param_regex]
    else:
        patterns = list(exclude_param_regex)

    return [
        name for name in param_names
        if not any(re.match(pattern, name) for pattern in patterns)
    ]


def filter_modules_by_regex(base_module, include_patterns, include_type):
    """Select sub-modules of `base_module` by name pattern and/or type.

    Args:
        base_module: Module whose `named_modules()` are scanned.
        include_patterns: Regex patterns tried with `re.match` against the
            module name; a falsy value disables the name filter.
        include_type: Classes tested with `isinstance`; a falsy value disables
            the type filter.

    Returns:
        Dict mapping module name to module for every module passing both filters.
    """
    def _name_ok(module_name):
        if not include_patterns:
            return True
        return any([re.match(pattern, module_name) for pattern in include_patterns])

    def _type_ok(submodule):
        if not include_type:
            return True
        return any([isinstance(submodule, cls) for cls in include_type])

    selected = {}
    for module_name, submodule in base_module.named_modules():
        name_matches = _name_ok(module_name)
        type_matches = _type_ok(submodule)
        if type_matches and name_matches:
            selected[module_name] = submodule
    return selected

In [None]:
def compute_gram(model, train_dataloader):
    """Accumulate the Gram matrix X^T X of the inputs of every nn.Linear layer.

    A forward hook on each linear layer flattens the layer input to
    [rows, features] and adds X^T X to a running sum while the whole
    `train_dataloader` is pushed through `model`.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        train_dataloader: Iterable of batches whose first two elements are the
            input ids and the attention mask.

    Returns:
        Dict mapping the name of each linear module that was executed to its
        summed (un-normalised) Gram matrix of shape [features, features].
    """
    grams = {}  # gram matrices for each linear layer inputs

    def get_gram(name):
        def hook(module, inputs, output):
            x = inputs[0].detach()  # [b,t,h]
            x = x.reshape(-1, x.size(-1))
            xtx = torch.matmul(x.transpose(0, 1), x)  # [h,h]
            if name not in grams:
                grams[name] = xtx
            else:
                grams[name] = grams[name] + xtx
        return hook

    linear_modules = filter_modules_by_regex(model, None, [nn.Linear])
    handles = [
        module.register_forward_hook(get_gram(name))
        for name, module in linear_modules.items()
    ]

    # Run in eval mode so dropout does not perturb the recorded layer inputs;
    # the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # Gradients are never needed here, regardless of the caller's context.
        with torch.no_grad():
            for batch in tqdm(train_dataloader, desc='Computing gram matrix'):
                input_ids = batch[0].to(device)
                attention_masks = batch[1].to(device)
                model(input_ids, attention_masks)
    finally:
        # Always detach the hooks, even when a forward pass raises.
        for handle in handles:
            handle.remove()
        model.train(was_training)

    return grams

In [None]:
import time

# The timer starts before the Gram matrices are computed, so the
# "Total time to merge" printed later includes the Gram computation.
start_time = time.time()

In [None]:
# Gram matrices of the UIT teacher's linear-layer inputs over the UIT training set.
with torch.no_grad():
    grams_uit = compute_gram(teacher_model_uit, train_dataloader_uit)

In [None]:
# Gram matrices of the uetcfs teacher's linear-layer inputs over the uetcfs training set.
with torch.no_grad():
    grams_uetcfs = compute_gram(teacher_model_uetcfs, train_dataloader_uetcfs)

In [None]:
# Gram matrices of the sguet teacher's linear-layer inputs over the sguet training set.
with torch.no_grad():
    grams_sguet = compute_gram(teacher_model_sguet, train_dataloader_sguet)

In [None]:
# Release cached GPU memory left over from the Gram forward passes before merging.
torch.cuda.empty_cache()

# Merging with RegMean

In [None]:
def get_all_params(new_local_models):
    """Group the parameters of several models by parameter name.

    Args:
        new_local_models: Iterable of modules exposing `named_parameters()`.

    Returns:
        Dict mapping each parameter name kept by `filter_params_to_merge` to
        the list of that parameter taken from every model, in model order.
    """
    collected = {}

    for model in new_local_models:
        named = dict(model.named_parameters())
        # for glue label spaces are different
        kept_names = filter_params_to_merge(list(named), ['.*classifier.*'])
        for param_name in kept_names:
            collected.setdefault(param_name, []).append(named[param_name])

    return collected

In [None]:
def copy_params_to_model(avg_params, model):
    """Copy the tensors in `avg_params` into the same-named parameters of `model`.

    Parameters of `model` without an entry in `avg_params` are left untouched,
    and entries of `avg_params` with no matching parameter are ignored.
    """
    matching = (
        (param_name, parameter)
        for param_name, parameter in model.named_parameters()
        if param_name in avg_params
    )
    for param_name, parameter in matching:
        parameter.data.copy_(avg_params[param_name])

def reduce_non_diag(cov_mat, a):
    """Scale the off-diagonal entries of a square matrix by `a`.

    The element-wise weight is `(1 - a) + a` on the diagonal and `a` elsewhere.

    Args:
        cov_mat: Square matrix (its first dimension sets the weight size).
        a: Factor applied to the off-diagonal entries.

    Returns:
        `cov_mat` multiplied element-wise by the weight matrix.
    """
    size = cov_mat.size(0)
    diagonal_part = torch.diag(torch.ones(size) - a).to(cov_mat.device)
    weight = diagonal_part + torch.full_like(diagonal_part, a)
    return cov_mat * weight

def regmean_merge(all_params, all_grams, non_diag, first_cycle, last_model, num_models):
    """Merge parameter sets with RegMean, one step of a chained merge.

    Weights of modules that have a Gram matrix in `all_grams` are combined as
    sum_i(G_i @ W_i^T). Every other parameter is summed element-wise. The
    function is meant to be called repeatedly: intermediate cycles return the
    un-normalised sums, and only the cycle flagged `last_model` multiplies the
    weights by (sum_i G_i)^-1 and divides the other parameters by `num_models`.

    Args:
        all_params: Dict name -> list of tensors, aligned with `all_grams`.
            The lists of RegMean-merged weights are emptied in place so the
            source tensors can be freed.
        all_grams: List of dicts module name -> Gram matrix, one per entry of
            the parameter lists.
        non_diag: When true, off-diagonal Gram entries are scaled by 0.85 via
            `reduce_non_diag`.
        first_cycle: True when every entry is an original model. When false,
            entry 0 is taken to be the running sum returned by a previous cycle
            and is not multiplied by its Gram matrix again.
        last_model: True on the final cycle of the chain.
        num_models: Total number of models merged over all cycles.

    Returns:
        Dict name -> merged tensor.
    """
    avg_params = {}
    print([first_cycle, last_model])
    for name in all_params:
        h_avged = False
        if name.endswith('.weight'):
            print(f'Regmean: {name}')
            module_name = name[:-len('.weight')]
            if module_name in all_grams[0]:
                print("Reg")
                gram_m_ws, grams = [], []

                for model_id, model_grams in enumerate(all_grams):
                    param_grams = model_grams[module_name]

                    # for roberta we dont need this; but it is important for deberta and t5
                    if non_diag:
                        param_grams = reduce_non_diag(param_grams, a=0.85)

                    param_t = all_params[name][model_id].transpose(0, 1)
                    if not first_cycle and model_id == 0:
                        # Entry 0 already holds sum(G @ W^T) from the previous cycle.
                        gram_m_ws.append(param_t)
                    else:
                        gram_m_ws.append(torch.matmul(param_grams, param_t))
                    grams.append(param_grams)

                # Drop the references to the source weights (however many there
                # are) so their memory can be reclaimed before the inversion.
                all_params[name].clear()
                gc.collect()
                torch.cuda.empty_cache()

                sum_gram_m_ws = sum(gram_m_ws)
                del gram_m_ws
                if last_model:
                    sum_gram = sum(grams)
                    del grams
                    sum_gram_inv = torch.inverse(sum_gram)
                    del sum_gram
                    wt = torch.matmul(sum_gram_inv, sum_gram_m_ws)
                    del sum_gram_inv
                else:
                    wt = sum_gram_m_ws  # sum of X^TXW
                del sum_gram_m_ws
                gc.collect()
                torch.cuda.empty_cache()

                avg_params[name] = wt.transpose(0, 1)
                h_avged = True

        if not h_avged:  # if not averaged with regmean, then do simple avg
            summed = torch.stack(all_params[name], 0).sum(0)
            # Normalise only once, on the final cycle. Dividing on every cycle
            # after the first divides repeatedly as soon as more than three
            # models are chained.
            avg_params[name] = summed / num_models if last_model else summed
    return avg_params

In [None]:
# Fresh PhoBertBase that receives the merged parameters in the merge cell below.
merged_model = PhoBertBase(n_classes=3)

In [None]:
# The merge cell checks whether 'PhoBERT' occurs in model_name to decide `non_diag`.
model_name = 'PhoBERT'
# The three lists are index-aligned: uit, uetcfs, sguet.
local_models = [teacher_model_uit, teacher_model_uetcfs, teacher_model_sguet]
regmean_grams = [grams_uit, grams_uetcfs, grams_sguet]
len_arr = [len_uit, len_uetcfs, len_sguet]

In [None]:
# Remove the individual names so that local_models / regmean_grams / len_arr
# hold the only notebook-level references; the merge cell can then release each
# teacher by deleting its list entry.
del teacher_model_uetcfs, teacher_model_sguet, teacher_model_uit
del grams_uit, grams_uetcfs, grams_sguet
del len_uit, len_uetcfs, len_sguet
gc.collect()
torch.cuda.empty_cache() 

In [None]:
# Chained RegMean merge: cycle 0 combines the first two teachers into
# un-normalised sums stored in merged_model; each later cycle folds one more
# teacher into those sums, and the final cycle applies the normalisation.
with torch.no_grad():
    
    if regmean_grams: # regmean average
        num_models = len(regmean_grams)
        # Off-diagonal Gram shrinking is disabled whenever model_name contains 'PhoBERT'.
        non_diag = False if 'PhoBERT' in model_name else True
        
        # Divide every Gram matrix by the number of training examples of its dataset.
        for i in range(len(regmean_grams)):
            for key, val in regmean_grams[i].items():
                regmean_grams[i][key] = val / len_arr[i]
                
        grams = []    
        first_cycle = True
        last_model = False
        
        # NOTE(review): last_model is only set in the `else` branch (i > 0), so
        # with exactly two models the inverse/normalisation step of
        # regmean_merge is never reached - confirm before reusing with two models.
        for i in range(len(regmean_grams) - 1):
            print(f'Cycle {i}:')
            if i == 0:
                # First cycle: take the first two teachers and their Gram
                # matrices, then drop them from the lists to release references.
                grams = regmean_grams[0:2]
                params = get_all_params(local_models[0:2]) 
                for l in range(2):
                    del local_models[0]
                    del regmean_grams[0]
                gc.collect()
                torch.cuda.empty_cache()
                print(len(local_models), len(regmean_grams))
            else:
                first_cycle = False
                if i == num_models - 2:
                    last_model = True
                    
                # Gram matrix of the running merge = sum of the Gram matrices merged so far.
                sum_grams = {k: grams[0].get(k) + grams[1].get(k) for k in set(grams[0])}
                # NOTE(review): nothing is removed from local_models / regmean_grams
                # in this branch, so with more than three models every later cycle
                # would pick up the same entry [0] again - confirm.
                grams = [sum_grams, regmean_grams[0]]
                
                # The previous merged model + 1 new local model
                params = get_all_params([merged_model, local_models[0]])
                
                # `params` keeps the previous merged parameters alive; the name is
                # rebound to a fresh model that receives this cycle's result.
                del merged_model, sum_grams
                gc.collect()
                torch.cuda.empty_cache()
                merged_model = PhoBertBase(n_classes=3)            

            avg_params = regmean_merge(params, grams, non_diag, first_cycle, last_model, num_models)  
            copy_params_to_model(avg_params, merged_model)
            merged_model.to(device)

            del avg_params, params
            gc.collect()
            torch.cuda.empty_cache()

    else: # simple average
        # Fallback when no Gram matrices are available: plain element-wise mean.
        params = get_all_params(local_models)
        avg_params = {k: torch.stack(v,0).mean(0) for k, v in params.items()}
        copy_params_to_model(avg_params, merged_model)
        merged_model.to(device)

In [None]:
# Seconds elapsed since start_time was set, i.e. Gram computation plus merging.
print(f'Total time to merge: {time.time() - start_time}')

# Test

In [None]:
def predict_test(model, test_loader):
    """Predict the class index of every example in `test_loader`.

    The model is switched to eval mode and run without gradients.

    Args:
        model: Module called as `model(input_ids, attention_mask)` that returns
            one row of class scores per example.
        test_loader: Iterable of batches whose first two elements are the input
            ids and the attention mask. Further elements (e.g. labels) are
            ignored.

    Returns:
        1-D CPU tensor with the argmax class index of every example, in loader
        order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        progress_bar = tqdm(test_loader, desc='Validating', leave=False)
        for batch in progress_bar:
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)

            # Forward pass
            output = model(input_ids, attention_masks)

            # The labels were previously copied to the device and collected
            # here but never used, so that dead work has been removed.
            predictions.append(output.argmax(dim=1))

    return torch.cat(predictions).detach().cpu()

In [None]:
# Evaluate the single merged model on the test set of each of the three datasets.
predictions_uetcfs = predict_test(merged_model, test_dataloader_1)
predictions_sguet = predict_test(merged_model, test_dataloader_2)
predictions_uit = predict_test(merged_model, test_dataloader_3)

In [None]:
# uetcfs: pair each (preprocessed) test text with the predicted class, stored as float.
guess = pd.DataFrame()
guess['content'] = test_data_1["content"]
guess['sentiment'] = list(map(float, predictions_uetcfs))
guess

In [None]:
# Classification report of the merged model on the uetcfs test set.
print(classification_report(test_data_1['sentiment'], guess['sentiment'], digits = 4))

In [None]:
# sguet: pair each (preprocessed) test text with the predicted class, stored as float.
# `guess` is rebound here, replacing the uetcfs frame from the earlier cell.
guess = pd.DataFrame()
guess['content'] = test_data_2["content"]
guess['sentiment'] = list(map(float, predictions_sguet))
guess

In [None]:
# Classification report of the merged model on the sguet test set.
print(classification_report(test_data_2['sentiment'], guess['sentiment'], digits = 4))

In [None]:
# UIT: pair each (preprocessed) test text with the predicted class, stored as float.
# `guess` is rebound here, replacing the sguet frame from the earlier cell.
guess = pd.DataFrame()
guess['content'] = uit_test_data["content"]
guess['sentiment'] = list(map(float, predictions_uit))
guess

In [None]:
# Classification report of the merged model on the UIT test set.
print(classification_report(uit_test_data['sentiment'], guess['sentiment'], digits = 4))

In [None]:
# Persist the merged model's weights (state_dict only) to the working directory.
torch.save(merged_model.state_dict(), '/kaggle/working/best_model.pt')