In [1]:
from google.colab import drive

# Mount Google Drive into the Google Colab runtime
drive.mount('/content/drive')

# After the call above, your Drive is mounted under the path /content/drive/
# You can access the files in Google Drive through that path.


Mounted at /content/drive


In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel

In [4]:
model_path = '/content/drive/MyDrive/3c-citation-text-classification/Task 1/best_model.pt'  # Path to the saved model (later read with torch.load)
model_name = 'allenai/scibert_scivocab_uncased'  # Identifier passed to AutoTokenizer/AutoModel.from_pretrained

In [5]:
# Choose the compute device up front: CUDA when torch reports a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fetch the pretrained tokenizer and encoder identified by `model_name`.
LMTokenizer = AutoTokenizer.from_pretrained(model_name)
LMModel = AutoModel.from_pretrained(model_name)

# Read the validation CSV, assigning the four column names explicitly.
validation_columns = ['CGT', 'CDT', 'CC', 'label']
validation_dataset = pd.read_csv(
    '/content/drive/MyDrive/3c-citation-text-classification/Task 1/validation.csv',
    sep=',',
    names=validation_columns,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [6]:
# Passed to Triage as max_len, which forwards it to the tokenizer as max_length.
MAX_LEN = 512
# Batch size handed to the validation DataLoader.
VALID_BATCH_SIZE = 4
# Alias of the loaded tokenizer; this is the name later given to Triage.
tokenizer = LMTokenizer

In [7]:
# Define the Triage dataset class
class Triage(Dataset):
    """Map-style dataset that tokenizes the three text columns of a dataframe.

    Each item is built from one row of ``dataframe``, which must provide the
    columns ``CGT``, ``CDT``, ``CC`` (text) and ``label`` (integer class id).

    Args:
        dataframe: pandas DataFrame holding the rows to serve.
        tokenizer: callable tokenizer (Hugging Face style) returning a mapping
            with ``input_ids`` and ``attention_mask``.
        max_len: every sequence is truncated/padded to exactly this length.
    """

    # Text columns that are tokenized; each yields '<col>_ids' and '<col>_mask'.
    TEXT_COLUMNS = ('CGT', 'CDT', 'CC')

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def _encode(self, raw_text):
        """Return ``(input_ids, attention_mask)`` long tensors for one text cell."""
        # Collapse any run of whitespace (tabs, newlines, ...) to a single space.
        text = " ".join(str(raw_text).split())
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated pad_to_max_length=True flag.
            padding='max_length',
            truncation=True,
        )
        return (
            torch.tensor(encoded['input_ids'], dtype=torch.long),
            torch.tensor(encoded['attention_mask'], dtype=torch.long),
        )

    def __getitem__(self, index):
        """Return the tokenized fields and target for the row at position ``index``.

        Returns:
            dict with keys ``CGT_ids``, ``CGT_mask``, ``CDT_ids``, ``CDT_mask``,
            ``CC_ids``, ``CC_mask`` and ``targets``; all values are
            ``torch.long`` tensors.
        """
        item = {}
        for column in self.TEXT_COLUMNS:
            # Positional lookup: a DataLoader hands out positions 0..len-1, which
            # only coincide with index labels when the frame has a default index.
            ids, mask = self._encode(self.data[column].iloc[index])
            item[f'{column}_ids'] = ids
            item[f'{column}_mask'] = mask

        item['targets'] = torch.tensor(self.data['label'].iloc[index], dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows captured when the dataset was constructed."""
        return self.len

In [9]:
class LMClass(torch.nn.Module):
    """Encoder with a two-layer classification head on the first-token state.

    Args:
        encoder: module called as ``encoder(input_ids=..., attention_mask=...)``
            whose first output is indexable as ``[batch, position]``. Defaults to
            the module-level ``LMModel``.
        dropout: dropout probability applied before the final linear layer.
            Defaults to the module-level ``drop_out``.
        num_classes: number of output logits (6 by default).

    Submodule names (``l1``, ``pre_classifier``, ``dropout``, ``classifier``)
    are kept stable so previously saved state dicts still load.
    """

    def __init__(self, encoder=None, dropout=None, num_classes=6):
        super().__init__()
        # Fall back to the notebook globals only when the caller gives nothing,
        # so `LMClass()` keeps working exactly as before.
        if encoder is None:
            encoder = LMModel
        if dropout is None:
            dropout = drop_out

        # Use the encoder's advertised width when it has one; 768 is the value
        # that was previously hard-coded.
        hidden_size = getattr(getattr(encoder, 'config', None), 'hidden_size', 768)

        self.l1 = encoder
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, data):
        """Compute class logits from the ``CC_ids`` / ``CC_mask`` entries of ``data``.

        Args:
            data: mapping with tensor entries ``'CC_ids'`` and ``'CC_mask'``.

        Returns:
            Tensor of logits produced by ``self.classifier``.
        """
        # Follow the module's own parameters instead of a global `device`, so
        # the inputs always land where the weights are.
        target_device = self.pre_classifier.weight.device
        input_ids = data['CC_ids'].to(target_device, dtype=torch.long)
        attention_mask = data['CC_mask'].to(target_device, dtype=torch.long)

        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_output[0]

        # Keep only the state at position 0 of every sequence.
        pooled = hidden_states[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.nn.functional.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [11]:
# Dropout probability; LMClass.__init__ reads this module-level name when it builds its Dropout layer.
drop_out = 0.1

In [12]:
# Prepare the DataLoader for validation set
testing_set = Triage(validation_dataset, tokenizer, MAX_LEN)
# Evaluation gains nothing from shuffling; a fixed order keeps the per-batch
# (class-weighted) loss reproducible from run to run.
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

# Load the saved model
model = LMClass()
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime;
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all load_state_dict needs.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)

# Define the loss function
# One weight per class, in class-index order (6 classes).
weights = [0.30435841, 1.34843581, 2.91375291, 7.57575758, 1.78062678,1.06837607]
class_weights = torch.tensor(weights, dtype=torch.float32, device=device)
loss_function = torch.nn.CrossEntropyLoss(weight=class_weights)


  model.load_state_dict(torch.load(model_path))


In [13]:
# Validation function
def valid(model, testing_loader, loss_fn=None, output_file=None):
    """Evaluate ``model`` on ``testing_loader`` and persist the summary metrics.

    Args:
        model: callable module; ``model(batch)`` must return per-class scores.
        testing_loader: iterable of batches, each a mapping with a
            ``'targets'`` tensor plus whatever ``model`` reads.
        loss_fn: callable ``loss_fn(logits, targets)``. Defaults to the
            module-level ``loss_function``.
        output_file: path of the text file receiving accuracy and macro-F1.
            Defaults to ``result.txt`` in the Task 1 Drive folder.

    Returns:
        Tuple ``(macro_f1, accuracy_percent)``.

    Raises:
        ValueError: if ``testing_loader`` yields no batches.
    """
    if loss_fn is None:
        loss_fn = loss_function
    if output_file is None:
        output_file = '/content/drive/MyDrive/3c-citation-text-classification/Task 1/result.txt'

    model.eval()
    n_correct = 0
    total_loss = 0.0
    n_batches = 0
    n_examples = 0
    pred = []
    act = []
    with torch.no_grad():
        for data in testing_loader:
            logits = model(data)
            # Put the targets wherever the model produced its output.
            targets = data['targets'].to(logits.device, dtype=torch.long)
            # Force a (batch, classes) layout. A bare squeeze() would also drop
            # the batch axis when the final batch holds a single example.
            logits = logits.reshape(targets.size(0), -1)

            total_loss += loss_fn(logits, targets).item()
            predicted = logits.argmax(dim=1)
            n_correct += (predicted == targets).sum().item()
            pred += predicted.tolist()
            act += targets.tolist()
            n_batches += 1
            n_examples += targets.size(0)

    if n_batches == 0 or n_examples == 0:
        raise ValueError("testing_loader produced no examples; nothing to validate")

    epoch_loss = total_loss / n_batches
    epoch_accu = (n_correct * 100) / n_examples
    mf1 = f1_score(act, pred, average='macro')

    # Print the results
    print(f"Validation Loss: {epoch_loss}")
    print(f"Validation Accuracy: {epoch_accu}")
    print(f"Validation Macro F1: {mf1}")

    # Save the results to a file
    with open(output_file, 'w') as f:
        f.write(f"Validation Accuracy: {epoch_accu}\n")
        f.write(f"Validation Macro F1: {mf1}\n")

    return mf1, epoch_accu

# Run validation on the loaded model; keeps the returned macro-F1 and accuracy
mf1, acc = valid(model, testing_loader)



Validation Loss: 1.6234958009719849
Validation Accuracy: 63.4
Validation Macro F1: 0.2966758688398859
