In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so files under
# /content/drive can be read and written by the cells below.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel

In [3]:
# Path to the saved model weights on the mounted Drive.
model_path = "/content/drive/MyDrive/Colab Notebooks/best_model.pt"
# Hugging Face Hub identifier used for both the tokenizer and the encoder.
model_name = "allenai/scibert_scivocab_uncased"

In [4]:
# Select the compute device: GPU when torch can see one, otherwise CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fetch the pretrained tokenizer and encoder identified by `model_name`.
LMTokenizer = AutoTokenizer.from_pretrained(model_name)
LMModel = AutoModel.from_pretrained(model_name)

# Read the validation CSV with explicit column names.
# NOTE(review): because `names=` is given without `header=`, pandas treats
# the first line of the file as data -- confirm the CSV has no header row.
validation_dataset = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/validation.csv',
    sep=',',
    names=['CGT', 'CDT', 'CC', 'label'],
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [5]:
# Alias under which the dataset cell refers to the loaded tokenizer.
tokenizer = LMTokenizer
# Fixed token length every sequence is padded / truncated to.
MAX_LEN = 512
# Number of examples per validation batch.
VALID_BATCH_SIZE = 4

In [6]:

class LMClass(torch.nn.Module):
    """Two-input classifier built on the shared pretrained encoder `LMModel`.

    The `CC` and `CDT` token sequences of a batch are each run through the
    same encoder. The hidden vector at sequence position 0 of each output is
    taken, the two vectors are concatenated, and a small feed-forward head
    (linear -> ReLU -> dropout -> linear) maps them to 2 logits.

    Args:
        dropout_rate: Dropout probability for the head. When omitted, the
            notebook-level `drop_out` value is read at construction time,
            so `LMClass()` keeps working as before.
    """

    def __init__(self, dropout_rate=None):
        super().__init__()
        if dropout_rate is None:
            # Fall back to the notebook-level setting.
            dropout_rate = drop_out
        self.l1 = LMModel
        # Take the width from the encoder config instead of hard-coding 768,
        # so the head always matches the loaded encoder.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(hidden_size, 2)

    def _first_token_state(self, data, prefix):
        """Encode `data[prefix + '_ids']` and return the position-0 hidden vector."""
        input_ids = data[f'{prefix}_ids'].to(device, dtype=torch.long)
        attention_mask = data[f'{prefix}_mask'].to(device, dtype=torch.long)
        encoded = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Element 0 of the encoder output holds the per-token hidden states;
        # [:, 0] selects the first token of every sequence in the batch.
        return encoded[0][:, 0]

    def forward(self, data):
        """Return the classifier logits for one batch.

        Args:
            data: Mapping with tensor entries 'CC_ids', 'CC_mask',
                'CDT_ids' and 'CDT_mask'.

        Returns:
            Tensor of logits with 2 values per example.
        """
        cc_state = self._first_token_state(data, 'CC')
        cdt_state = self._first_token_state(data, 'CDT')

        pooler = torch.cat((cc_state, cdt_state), 1)
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: no need to build a module object on every call.
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

In [7]:

class Triage(Dataset):
    """Dataset that tokenizes the `CGT`, `CDT` and `CC` text columns of a frame.

    Each item is a dict of fixed-length `input_ids` / `attention_mask`
    tensors for the three text columns plus the integer `label` as
    `targets`.

    Args:
        dataframe: pandas DataFrame with columns 'CGT', 'CDT', 'CC', 'label'.
        tokenizer: Object exposing a Hugging Face style `encode_plus` method.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def _encode(self, column, index):
        """Tokenize one cell and return `(input_ids, attention_mask)` tensors."""
        # Positional lookup: works regardless of the frame's index labels
        # (e.g. after filtering or a train/validation split).
        text = str(self.data[column].iloc[index])
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)
        return ids, mask

    def __getitem__(self, index):
        """Return the tokenized tensors and target for the row at position `index`."""
        CGT_ids, CGT_mask = self._encode('CGT', index)
        CDT_ids, CDT_mask = self._encode('CDT', index)
        CC_ids, CC_mask = self._encode('CC', index)

        return {
            'CGT_ids': CGT_ids,
            'CGT_mask': CGT_mask,

            'CDT_ids': CDT_ids,
            'CDT_mask': CDT_mask,

            'CC_ids': CC_ids,
            'CC_mask': CC_mask,

            'targets': torch.tensor(self.data['label'].iloc[index], dtype=torch.long),
        }

    def __len__(self):
        """Number of rows in the wrapped frame at construction time."""
        return self.len


In [8]:
# Dropout probability for the classifier head; LMClass.__init__ reads this
# name when the model is instantiated, so it must be defined before then.
drop_out: float = 0.1

In [9]:

# Prepare data loader for validation set.
# Shuffling is turned off: evaluation gains nothing from a random order, and
# a fixed order keeps batch composition identical from run to run.
validation_set = Triage(validation_dataset, tokenizer, MAX_LEN)
valid_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
validation_loader = DataLoader(validation_set, **valid_params)


In [10]:
# Load the best model.
# map_location remaps tensors saved from a GPU run onto the current device,
# so the checkpoint also loads on a CPU-only runtime. weights_only=True
# restricts unpickling to tensors and plain containers, which is all a
# state dict needs and avoids executing arbitrary pickled code.
model = LMClass()
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)

# Loss function
loss_function = torch.nn.CrossEntropyLoss()

  model.load_state_dict(torch.load(model_path))


In [16]:
def calcuate_accu(big_idx, targets):
    """Return how many predicted class indices equal their targets.

    Args:
        big_idx: Tensor of predicted class indices.
        targets: Tensor of true class indices.

    Returns:
        The number of matching positions as a Python number.
    """
    matches = torch.eq(big_idx, targets)
    return matches.sum().item()

def valid(model, validation_loader,
          output_file='/content/drive/MyDrive/Colab Notebooks/result.txt'):
    """Evaluate `model` on `validation_loader`, then print and save the metrics.

    Uses the notebook-level `device` and `loss_function`.

    Args:
        model: Callable module taking a batch dict and returning logits; it
            is expected to return one row of class scores per example.
        validation_loader: Iterable of batch dicts containing a 'targets'
            tensor (plus whatever `model` itself reads).
        output_file: Path of the text file the results are written to.

    Returns:
        Tuple `(macro_f1, accuracy_percent)`.

    Raises:
        ValueError: If the loader yields no batches.
    """
    model.eval()
    n_correct = 0
    total_loss = 0.0
    n_batches = 0
    n_examples = 0
    pred = []
    act = []
    with torch.no_grad():
        for data in validation_loader:
            targets = data['targets'].to(device, dtype=torch.long)
            # No squeeze(): it would drop the batch axis when the final batch
            # holds a single example and break the loss / max calls below.
            outputs = model(data)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accu(big_idx, targets)
            pred += big_idx.tolist()
            act += targets.tolist()
            n_batches += 1
            n_examples += targets.size(0)

    if n_batches == 0 or n_examples == 0:
        raise ValueError("validation_loader yielded no examples; nothing to evaluate")

    # Mean of the per-batch losses (not weighted by batch size).
    epoch_loss = total_loss / n_batches
    epoch_accu = (n_correct * 100) / n_examples
    mf1 = f1_score(act, pred, average='macro')

    # Print the results
    print(f"Validation Loss: {epoch_loss}")
    print(f"Validation Accuracy: {epoch_accu}")
    print(f"Validation Macro F1: {mf1}")

    # Save the results to a file; the `with` block closes it on exit.
    with open(output_file, 'w') as file:
        file.write(f"Validation Accuracy: {epoch_accu}\n")
        file.write(f"Validation Macro F1: {mf1}\n")

        file.write("Final Results\n")
        # Use the locally computed accuracy: the previous global `acc` is not
        # defined until this function has already returned once.
        file.write(f"Accuracy: {epoch_accu}\n")
        file.write(f"Macro F1 Score: {mf1}\n")
    return mf1, epoch_accu

# Validate the loaded model; valid() returns (macro F1, accuracy in percent)
# and also prints the metrics and writes them to the results file.
mf1, acc = valid(model, validation_loader)



Validation Loss: 0.6177607938051224
Validation Accuracy: 67.8
Validation Macro F1: 0.672805459474619
