In [None]:
!pip install --upgrade pip
!pip install transformers==4.16 --quiet
!pip install vncorenlp==1.0.3 --quiet
!sudo apt-get install git-lfs
!pip install sentencepiece --quiet
!pip install tokenizer --quiet
!pip install underthesea --quiet

In [None]:
import json
import re
import string

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report
from underthesea import word_tokenize, text_normalize

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import optim

from transformers import AutoModel, AutoTokenizer, get_scheduler
from tqdm.auto import tqdm
from functools import partial

import seaborn as sns
import matplotlib.pyplot as plt
import requests
import gc
import random

from torch.utils.data import TensorDataset

In [None]:
# Register tqdm with pandas so Series.progress_apply is available below.
tqdm.pandas()

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
# Seed handed to fix_seed() to seed the Python, NumPy and PyTorch RNGs.
SEED = 42

In [None]:
def fix_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), and switches cuDNN to deterministic kernel selection so
    that two runs with the same seed follow the same code path.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Seeding alone does not stop cuDNN from autotuning to a different
    # (possibly non-deterministic) kernel on each run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Left disabled, as before: enabling it raises an error for operations
    # that have no deterministic implementation.
    # torch.use_deterministic_algorithms(True)

# Seed all RNGs once, before any data loader or model is created.
fix_seed(SEED)

# Dataset

## UiT-VSFC

In [None]:
# UiT-VSFC train / validation / test splits. Later cells read the `content`
# (text) and `sentiment` (label) columns of these frames.
uit_train_data = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uit/uit_train_data.csv')
uit_val_data = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uit/uit_val_data.csv')
uit_test_data = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uit/uit_test_data.csv')

## UET Data

In [None]:
# UET corpus 1 ("uetcfs"): the *_data_1 frames.
train_data_1 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uetcfs/uetcfs_train_data.csv')
val_data_1 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uetcfs/uetcfs_val_data.csv')
test_data_1 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/uetcfs/uetcfs_test_data.csv')

# UET corpus 2 ("sguet"): the *_data_2 frames.
train_data_2 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/sguet/sguet_train_data.csv')
val_data_2 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/sguet/sguet_val_data.csv')
test_data_2 = pd.read_csv('/kaggle/input/merge-fixed-dataset-712/dataset/sguet/sguet_test_data.csv')

## Data Preprocessing

In [None]:
!git clone https://github.com/vncorenlp/VnCoreNLP

In [None]:
from vncorenlp import VnCoreNLP

# Path to VnCoreNLP-1.1.1.jar inside the cloned repository.
VNCORENLP_JAR = "/kaggle/working/VnCoreNLP/VnCoreNLP-1.1.1.jar"

# Only the word-segmentation annotator ("wseg") is requested; the heap option
# is forwarded as '-Xmx500m'.
rdrsegmenter = VnCoreNLP(
    VNCORENLP_JAR,
    annotators="wseg",
    max_heap_size='-Xmx500m',
)

In [None]:
def word_segment(text):
    """Normalise ``text`` and word-segment it with the VnCoreNLP segmenter.

    Args:
        text: Raw input string.

    Returns:
        One string in which the tokens of each sentence, and consecutive
        sentences, are separated by a single space.
    """
    sentences = rdrsegmenter.tokenize(text_normalize(text))
    # Sentences are joined with a space too: joining them with "" glued the
    # last token of one sentence onto the first token of the next.
    return " ".join(" ".join(sentence) for sentence in sentences)

In [None]:
def preprocess_text(text):
    """Clean one raw text and return it lower-cased and word-segmented.

    The substitutions run in this order: strip ``<...>`` tags; collapse a run
    of one repeated ``[A-Z]`` letter (matched case-insensitively) to a single
    upper-case letter; turn every non-word character into a space; drop single
    ASCII letters surrounded by whitespace; apply the ``\\^[a-zA-Z]\\s+``
    pattern; blank out tokens that contain a digit. The result is trimmed,
    whitespace runs are collapsed, and the lower-cased text is passed to
    ``word_segment``.
    """
    substitutions = (
        (r'<[^>]*>', '', 0),
        (r'([A-Z])\1+', lambda match: match.group(1).upper(), re.IGNORECASE),
        (r'\W', ' ', 0),
        (r'\s+[a-zA-Z]\s+', ' ', 0),
        (r'\^[a-zA-Z]\s+', ' ', 0),
        (r'\w*\d\w*', ' ', 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Trim first, then squeeze whitespace runs down to one space.
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
    return word_segment(cleaned.lower())

In [None]:
# Clean and word-segment the text column of every UiT-VSFC split.
for uit_split in (uit_train_data, uit_val_data, uit_test_data):
    uit_split['content'] = uit_split['content'].progress_apply(preprocess_text)

In [None]:
# Clean and word-segment the text column of every sguet split.
for sguet_split in (train_data_2, val_data_2, test_data_2):
    sguet_split['content'] = sguet_split['content'].progress_apply(preprocess_text)

In [None]:
# Clean and word-segment the text column of every uetcfs split.
for uetcfs_split in (train_data_1, val_data_1, test_data_1):
    uetcfs_split['content'] = uetcfs_split['content'].progress_apply(preprocess_text)

In [None]:
# PhoBERT tokenizer, used by encoding() for every split (train, val and test).
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [None]:
def encoding(data, tokenizer, max_token_len=128):
    """Tokenise ``data['content']`` and return tensors ready for a DataLoader.

    Every text is encoded separately with special tokens added, then truncated
    and padded to exactly ``max_token_len`` tokens.

    Args:
        data: Frame/mapping with a ``content`` column of texts and a
            ``sentiment`` column of labels.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_token_len: Length each sequence is truncated/padded to.

    Returns:
        ``(input_ids, attention_masks, sentiment)``: the per-text id and mask
        tensors concatenated along dim 0, and ``data['sentiment']`` converted
        to a tensor via NumPy.
    """
    encode_kwargs = dict(
        truncation=True,
        add_special_tokens=True,
        max_length=max_token_len,
        padding="max_length",
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded_texts = [
        tokenizer.encode_plus(content, **encode_kwargs)
        for content in data['content']
    ]

    input_ids = torch.cat([enc["input_ids"] for enc in encoded_texts], dim=0)
    attention_masks = torch.cat([enc["attention_mask"] for enc in encoded_texts], dim=0)
    sentiment = torch.tensor(np.array(data['sentiment']))
    return input_ids, attention_masks, sentiment

In [None]:
# Merge the three corpora split by split (UiT-VSFC first, then uetcfs, sguet).
train_data, val_data, test_data = (
    pd.concat(list(corpus_frames))
    for corpus_frames in (
        (uit_train_data, train_data_1, train_data_2),
        (uit_val_data, val_data_1, val_data_2),
        (uit_test_data, test_data_1, test_data_2),
    )
)
train_data.shape, val_data.shape, test_data.shape

In [None]:
# The per-corpus train/val frames are no longer needed once merged. The
# per-corpus test frames (uit_test_data, test_data_1, test_data_2) are kept on
# purpose: the Test section encodes and scores them individually.
del uit_train_data, train_data_1, train_data_2
del uit_val_data, val_data_1, val_data_2
gc.collect()

In [None]:
# Hyperparameters shared by the encoding, data-loading and training cells.
MAX_LEN = 256  # length every text is truncated/padded to (passed as max_token_len)
BATCH_SIZE = 16  # batch size of every DataLoader
EPOCHS = 5  # training epochs; also determines the scheduler's total step count
LEARNING_RATE = 5e-5  # optimizer learning rate

In [None]:
def _make_loader(frame, shuffle=False):
    """Encode ``frame`` and wrap the resulting tensors in a DataLoader."""
    ids, masks, labels = encoding(frame, tokenizer, max_token_len=MAX_LEN)
    return DataLoader(TensorDataset(ids, masks, labels), batch_size=BATCH_SIZE, shuffle=shuffle)


# Only the training batches are shuffled; validation and test keep row order.
train_dataloader = _make_loader(train_data, shuffle=True)
val_dataloader = _make_loader(val_data)
test_dataloader = _make_loader(test_data)

# The merged validation frame is not used again after encoding.
del val_data
gc.collect()

# Model

In [None]:
class PhoBertBase(nn.Module):
    """PhoBERT encoder with a two-layer classification head (the KD teacher).

    Attribute names (``bert``, ``l1``, ``l2``, ``d1``) are state-dict keys and
    must stay as they are for existing checkpoints to load.
    """

    def __init__(self, n_classes, drop_out=0.1):
        """Build the encoder and the 768 -> 256 -> ``n_classes`` head.

        Args:
            n_classes: Number of output logits.
            drop_out: Dropout probability applied between the two linear layers.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.l1 = nn.Linear(768, 256)
        self.l2 = nn.Linear(256, n_classes)
        self.d1 = nn.Dropout(drop_out)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return raw class logits; ``labels`` is accepted but not used."""
        # Element 1 of the encoder output feeds the head (presumably the
        # pooled sentence representation — confirm against the encoder docs).
        pooled = self.bert(input_ids, attention_mask=attention_mask)[1]
        hidden = self.d1(self.l1(pooled))
        return self.l2(hidden)

In [None]:
def loss_fn_kd(outputs, labels, teacher_outputs, alpha, T):
    """Knowledge-distillation loss: soft-target KL term plus hard-label CE.

    ``alpha * T**2 * KL(softmax(teacher / T) || softmax(student / T))
    + (1 - alpha) * CE(student, labels)``

    Args:
        outputs: Student logits; softmax is taken over dim 1 (assumed shape
            ``(batch, n_classes)``).
        labels: Integer class targets for the cross-entropy term.
        teacher_outputs: Teacher logits, same shape as ``outputs``.
        alpha: Weight of the soft-target term; the CE term gets ``1 - alpha``.
        T: Softmax temperature applied to both sets of logits.

    Returns:
        Scalar loss tensor.
    """
    # 'batchmean' sums over classes and averages over the batch, which is the
    # actual KL divergence. The module default ('mean') also divides by the
    # number of classes, silently shrinking the distillation term.
    soft_loss = F.kl_div(
        F.log_softmax(outputs / T, dim=1),
        F.softmax(teacher_outputs / T, dim=1),
        reduction='batchmean',
    )
    hard_loss = F.cross_entropy(outputs, labels)
    # T*T restores the gradient scale that the 1/T softening takes away.
    return soft_loss * (alpha * T * T) + hard_loss * (1. - alpha)

In [None]:
def train_kd(model, teacher_model, optimizer, scheduler, train_loader, alpha, T):
    """Train the student for one epoch against a frozen teacher.

    Args:
        model: Student; its forward result must expose ``.logits``.
        teacher_model: Teacher; its forward result is used directly as logits.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        train_loader: Yields batches indexed as (input_ids, attention_mask, label).
        alpha: Soft-target weight forwarded to ``loss_fn_kd``.
        T: Temperature forwarded to ``loss_fn_kd``.

    Returns:
        ``(accuracy, loss)``. Accuracy is correct / samples seen, as a 0-dim
        tensor. Loss is the sum of the per-batch loss values divided by the
        number of samples seen (not by the number of batches).
    """
    model.train()
    teacher_model.eval()

    running_loss = 0
    seen = 0
    correct = 0
    for batch in tqdm(train_loader, desc='Training', leave=False):
        input_ids, attention_masks = batch[0].to(device), batch[1].to(device)
        label = batch[2].long().to(device)

        optimizer.zero_grad()

        student_output = model(input_ids, attention_masks).logits

        # The teacher only supplies targets, so no graph is built for it.
        with torch.no_grad():
            teacher_output = teacher_model(input_ids, attention_masks)

        loss = loss_fn_kd(student_output, label, teacher_output, alpha, T)
        correct += (student_output.argmax(dim=1) == label).sum()

        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        seen += len(label)

    return correct / seen, running_loss / seen

def validate_kd(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and print a classification report.

    The loss is computed with the module-level ``criterion``, which must be
    defined before this function is called.

    Args:
        model: Model whose forward result exposes ``.logits``.
        valid_loader: Yields batches indexed as (input_ids, attention_mask, label).

    Returns:
        ``(accuracy, loss, predictions)``. Accuracy is correct / samples seen
        (a 0-dim tensor), loss is the sum of the per-batch loss values divided
        by the number of samples seen, and predictions is a CPU tensor of the
        predicted class ids in loader order.
    """
    model.eval()

    running_loss = 0
    seen = 0
    correct = 0
    batch_preds, batch_labels = [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc='Validating', leave=False):
            input_ids, attention_masks = batch[0].to(device), batch[1].to(device)
            label = batch[2].long().to(device)

            logits = model(input_ids, attention_masks).logits
            preds = logits.argmax(dim=1)

            batch_preds.append(preds.flatten())
            batch_labels.append(label)
            correct += (preds == label).sum()

            running_loss += criterion(logits, label).item()
            seen += len(label)

    predictions = torch.cat(batch_preds).detach().cpu()
    labels = torch.cat(batch_labels).detach().cpu()
    print(classification_report(labels, predictions, digits = 4))
    return correct / seen, running_loss / seen, predictions

In [None]:
def predict_test_kd(model, test_loader):
    """Predict class ids for every batch of ``test_loader``.

    Only the first two elements of each batch (input ids and attention mask)
    are read, so the loader does not need to carry labels.

    Args:
        model: Model whose forward result exposes ``.logits``.
        test_loader: Yields batches indexed as (input_ids, attention_mask, ...).

    Returns:
        1-D CPU tensor of predicted class ids, in loader order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Testing', leave=False):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)

            logits = model(input_ids, attention_masks).logits
            predictions.append(torch.argmax(logits, dim=1).flatten().cpu())

    return torch.cat(predictions)

In [None]:
teacher_model = PhoBertBase(n_classes=3)
# map_location lets a checkpoint saved from a GPU run load on a CPU-only host.
best_model_cp = torch.load('/kaggle/input/teacher-chkpt/best_merge_base_42.pt', map_location=device)
# strict=False tolerates key mismatches, so report them: a silently skipped key
# would leave that layer at its freshly initialised (untrained) values.
teacher_load_result = teacher_model.load_state_dict(best_model_cp, strict=False)
if teacher_load_result.missing_keys or teacher_load_result.unexpected_keys:
    print('Teacher checkpoint key mismatch - missing: {} unexpected: {}'.format(
        teacher_load_result.missing_keys, teacher_load_result.unexpected_keys))
teacher_model.to(device)

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertConfig

# Student: a randomly initialised DistilBERT (built from a config, so no
# pretrained weights) with a 3-way classification head.
config = DistilBertConfig(
    num_labels=3,
    vocab_size=64001,
    max_position_embeddings=258,
    # DistilBertConfig defaults pad_token_id to 0. The embedding row at the pad
    # id is zeroed and excluded from gradient updates, so it must be the id the
    # tokenizer actually pads with, not whatever token happens to have id 0.
    pad_token_id=tokenizer.pad_token_id,
)
student_model = DistilBertForSequenceClassification(config)
# Replace the stock single-layer classifier with a 768 -> 256 -> 3 head.
student_model.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.Dropout(0.2),
            nn.Linear(256, 3),)
student_model.to(device)

In [None]:
# Per-class weights for the validation cross-entropy.
# NOTE(review): assumes index i is sentiment label i — confirm the label order.
weights = torch.FloatTensor([0.3, 0.45, 0.25])
criterion = nn.CrossEntropyLoss(weight=weights.to(device))
# criterion = nn.CrossEntropyLoss()

param_optimizer = list(student_model.named_parameters())
# Biases and layer-norm parameters are exempt from weight decay. DistilBERT
# names its per-layer norms `sa_layer_norm` / `output_layer_norm`, which the
# BERT-style 'LayerNorm.weight' pattern does not match, so the lower-case
# pattern is listed as well ('bias' already covers every norm bias).
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

optimizer = optim.AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=1e-8)

# The scheduler is stepped once per batch, so its horizon is batches x epochs.
num_training_steps = EPOCHS * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
# Distillation hyperparameters passed to train_kd (and on to loss_fn_kd).
alpha = 0.8  # weight of the soft-target KL term; the hard-label CE term gets 1 - alpha
T = 10  # temperature dividing both student and teacher logits before softmax

In [None]:
import time

# Start from +inf so the first epoch always writes a checkpoint, whatever the
# scale of the validation loss. With a fixed starting threshold, a run whose
# loss never dropped below it would save nothing and the reload cell would fail.
best_valid_loss = float('inf')
train_losses, valid_losses = [], []

training_start_time = time.time()

for epoch in range(EPOCHS):

    train_acc, train_loss = train_kd(student_model, teacher_model, optimizer, lr_scheduler, train_dataloader, alpha, T)
    valid_acc, valid_loss, val_pred = validate_kd(student_model, val_dataloader)

    # Keep only the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(student_model.state_dict(), '/kaggle/working/best_model.pt')

    print('Epoch {}/{}'.format(epoch, EPOCHS - 1))
    print('-' * 10)
    print('Training Loss: {:.2e} Acc: {:.8f}'.format(train_loss, train_acc))
    print('Validate Loss: {:.2e} Acc: {:.8f}'.format(valid_loss, valid_acc))
    # Per-epoch history for the loss plot.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

print('Training finished, took {:.2f}s'.format(time.time() - training_start_time))

In [None]:
# Drop the in-memory student; a later cell rebuilds it and loads the saved
# best checkpoint from disk.
del student_model
gc.collect()

In [None]:
# Size the x-axis from the recorded history rather than from EPOCHS, so the
# plot still works when training stopped early and fewer epochs were logged.
epoch_ticks = range(1, len(train_losses) + 1)

fig, ax = plt.subplots()
ax.plot(epoch_ticks, train_losses, label='Train Loss')
ax.plot(epoch_ticks, valid_losses, label='Valid Loss')
ax.legend()
ax.set_title('Losses')
ax.set_xlabel('Epoch #')
ax.set_ylabel('Loss')
ax.set_xticks(list(epoch_ticks))
plt.show()

In [None]:
def count_parameters(model):
    """Return the number of scalar parameters in ``model`` that require grad."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [None]:
# Rebuild the student with the same architecture as the training cell so the
# saved state-dict keys and shapes line up.
config = DistilBertConfig(
    num_labels=3,
    vocab_size=64001,
    max_position_embeddings=258,
    # Pad with the tokenizer's own pad id instead of DistilBERT's default of 0.
    pad_token_id=tokenizer.pad_token_id,
)
student_model = DistilBertForSequenceClassification(config)
student_model.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.Dropout(0.2),
            nn.Linear(256, 3),)
# map_location lets a checkpoint saved from a GPU run load on a CPU-only host.
best_model_cp = torch.load('/kaggle/working/best_model.pt', map_location=device)
student_model.load_state_dict(best_model_cp)
student_model.to(device)

# Number of trainable parameters in the student.
count = count_parameters(student_model)
print(count)

# Test

In [None]:
def _make_test_loader(frame):
    """Encode one test frame and wrap it in an unshuffled DataLoader."""
    ids, masks, labels = encoding(frame, tokenizer, max_token_len=MAX_LEN)
    return DataLoader(TensorDataset(ids, masks, labels), batch_size=BATCH_SIZE, shuffle=False)


# One loader per corpus so each test set can be scored on its own.
# uetcfs
test_dataloader_1 = _make_test_loader(test_data_1)
# sguet
test_dataloader_2 = _make_test_loader(test_data_2)
# uit
test_dataloader_3 = _make_test_loader(uit_test_data)

gc.collect()

In [None]:
# Predict on each corpus-specific test loader, then on the merged test loader.
predictions_uetcfs, predictions_sguet, predictions_uit, predictions = (
    predict_test_kd(student_model, loader)
    for loader in (test_dataloader_1, test_dataloader_2, test_dataloader_3, test_dataloader)
)

In [None]:
# uetcfs test texts next to the student's predicted label (stored as float).
guess = pd.DataFrame({
    'content': test_data_1["content"],
    'sentiment': [float(pred) for pred in predictions_uetcfs],
})
guess

In [None]:
# uetcfs report. `guess` is reassigned by each later cell, so this must run
# right after the cell that builds it from predictions_uetcfs.
print(classification_report(test_data_1['sentiment'], guess['sentiment'], digits = 4))

In [None]:
# sguet test texts next to the student's predicted label (stored as float).
guess = pd.DataFrame({
    'content': test_data_2["content"],
    'sentiment': [float(pred) for pred in predictions_sguet],
})
guess

In [None]:
# sguet report. `guess` is reassigned by each later cell, so this must run
# right after the cell that builds it from predictions_sguet.
print(classification_report(test_data_2['sentiment'], guess['sentiment'], digits = 4))

In [None]:
# UiT-VSFC test texts next to the student's predicted label (stored as float).
guess = pd.DataFrame({
    'content': uit_test_data["content"],
    'sentiment': [float(pred) for pred in predictions_uit],
})
guess

In [None]:
# UiT-VSFC report. Uses the `guess` frame built from predictions_uit in the
# preceding cell.
print(classification_report(uit_test_data['sentiment'], guess['sentiment'], digits = 4))