In [None]:
from google.colab import drive

# Link Google Drive to this Google Colab session
drive.mount('/content/drive')

# After running the code above, your Drive is mounted at /content/drive/
# You can access the files in your Google Drive through this path.


Mounted at /content/drive


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import sent_tokenize

# Ensure the NLTK sentence-tokenizer data used by sent_tokenize is available.
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# load 'punkt_tab' instead; fetching both keeps this notebook runnable from a
# fresh kernel regardless of which NLTK version the runtime ships.
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader
from torch import cuda
import sys
from sklearn.metrics import f1_score


In [None]:
model_path = '/content/drive/MyDrive/Colab Notebooks/best_model.pt'  # Path to the saved model
# Hugging Face model identifier; passed to from_pretrained() for both the
# tokenizer and the encoder in the cells below.
model_name = 'allenai/scibert_scivocab_uncased'

In [None]:
# Dropout probability handed to torch.nn.Dropout inside LMClass.
drop_out = 0.1

In [None]:

# Load the pretrained tokenizer and encoder identified by model_name.
# LMModel is the object LMClass stores as its `l1` encoder.
# NOTE(review): LMTokenizer is not referenced by the other visible cells
# (LMClassPredictor loads its own tokenizer) — confirm whether it is still needed.
LMTokenizer = AutoTokenizer.from_pretrained(model_name)
LMModel = AutoModel.from_pretrained(model_name)

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [None]:
class LMClass(torch.nn.Module):
    """Two-input classifier built on top of a shared pretrained encoder.

    Each of the two inputs ("CC" and "CDT") is passed through the same
    encoder; the first-position hidden state of each is concatenated and fed
    through a linear layer, ReLU, dropout and a final 2-way linear classifier.

    Attribute names (`l1`, `pre_classifier`, `dropout`, `classifier`) are part
    of the checkpoint format: `load_state_dict` matches parameters by these
    names, so they must not be renamed.
    """

    def __init__(self):
        super().__init__()
        # Reuses the module-level LMModel object, so every LMClass instance
        # shares the very same encoder module.
        self.l1 = LMModel
        # 768 is presumably the encoder's hidden size — TODO confirm against
        # LMModel.config if a different backbone is ever used.
        self.pre_classifier = torch.nn.Linear(768 * 2, 768)
        self.dropout = torch.nn.Dropout(drop_out)
        self.classifier = torch.nn.Linear(768, 2)

    def _first_token_state(self, input_ids, attention_mask):
        """Encode one input and return the hidden state at sequence position 0.

        Inputs are moved to the device that holds this module's own weights
        (rather than a module-level global), so the model works on whatever
        device it has been moved to.
        """
        target_device = self.classifier.weight.device
        input_ids = input_ids.to(target_device, dtype=torch.long)
        attention_mask = attention_mask.to(target_device, dtype=torch.long)

        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]
        return hidden_state[:, 0]

    def forward(self, data):
        """Compute the 2-class logits for a batch.

        Args:
            data: mapping with tensor entries 'CC_ids', 'CC_mask', 'CDT_ids'
                and 'CDT_mask' (token ids and attention masks for the two
                inputs).

        Returns:
            The output of the final linear layer (2 values per batch row);
            no softmax is applied here.
        """
        cc_state = self._first_token_state(data['CC_ids'], data['CC_mask'])
        cdt_state = self._first_token_state(data['CDT_ids'], data['CDT_mask'])

        pooler = torch.cat((cc_state, cdt_state), 1)
        pooler = self.pre_classifier(pooler)
        # Functional ReLU avoids constructing a new module on every call.
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

In [None]:
class LMClassPredictor:
    """Inference helper that loads a trained LMClass checkpoint and labels text."""

    def __init__(self, model_path, tokenizer_name, device='cpu'):
        """Build the model, load its weights and prepare the tokenizer.

        Args:
            model_path: path of the checkpoint file; its content is passed
                straight to `load_state_dict`, so it must be a state dict.
            tokenizer_name: identifier given to `AutoTokenizer.from_pretrained`.
            device: device the model and the inputs are moved to.
        """
        self.device = device
        self.model = LMClass()
        # weights_only=True limits unpickling to tensors and plain containers,
        # which is all a state dict needs, and avoids executing arbitrary
        # pickled code from the checkpoint file. It also addresses the warning
        # that the recorded notebook output shows for this torch.load call.
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.to(device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def predict_sentence(self, sentence):
        """Classify a single sentence.

        The same encoded sentence is supplied as both the 'CC' and the 'CDT'
        input of the model.

        Args:
            sentence: the text to classify.

        Returns:
            A tuple `(predicted_label, probabilities)` where `predicted_label`
            is the argmax class index as a Python int and `probabilities` is
            the softmax of the model output (a tensor with one row).
        """
        # padding='max_length' is the current spelling of the deprecated
        # pad_to_max_length=True; return_tensors='pt' yields tensors that
        # already carry the leading batch dimension.
        encoded = self.tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(self.device, dtype=torch.long)
        attention_mask = encoded['attention_mask'].to(self.device, dtype=torch.long)

        # The model expects all four keys; the CDT input reuses the CC input.
        data = {
            'CC_ids': input_ids,
            'CC_mask': attention_mask,
            'CDT_ids': input_ids,
            'CDT_mask': attention_mask,
        }

        with torch.no_grad():
            outputs = self.model(data)
            probabilities = torch.softmax(outputs, dim=1)
            predicted_label = torch.argmax(probabilities, dim=1).item()

        return predicted_label, probabilities

    def predict_paragraph(self, paragraph, output_file):
        """Split a paragraph into sentences, classify each and log the results.

        Args:
            paragraph: text that is split with `sent_tokenize`.
            output_file: path of the text file to (over)write with one
                "Sentence / Predicted label" entry per sentence.

        Returns:
            A list of `(sentence, predicted_label, probabilities)` tuples in
            sentence order.
        """
        sentences = sent_tokenize(paragraph)
        results = []

        # Explicit UTF-8 so sentences with non-ASCII characters are written
        # the same way regardless of the platform's default encoding.
        with open(output_file, 'w', encoding='utf-8') as file:
            for sentence in sentences:
                predicted_label, probabilities = self.predict_sentence(sentence)
                results.append((sentence, predicted_label, probabilities))
                file.write(f"Sentence: {sentence}\n")
                file.write(f"Predicted label: {predicted_label}, Probabilities: {probabilities}\n")
                file.write("\n")

        return results

In [None]:
# Inputs for this demo run: where the predictions are logged and what to classify.
output_file = '/content/drive/MyDrive/Colab Notebooks/predictions.txt'
paragraph = "This is the first sentence. Here is another one."

# Load the trained model and classify the paragraph sentence by sentence;
# the per-sentence results are also written to output_file.
predictor = LMClassPredictor(model_path=model_path, tokenizer_name=model_name, device=device)
results = predictor.predict_paragraph(paragraph, output_file)

# Echo each result, followed by two blank lines, exactly as separate prints would.
for sentence, predicted_label, probabilities in results:
    print(
        f"Sentence: {sentence}",
        f"Predicted label: {predicted_label}, Probabilities: {probabilities}",
        "\n",
        sep="\n",
    )


  self.model.load_state_dict(torch.load(model_path, map_location=device))


Sentence: This is the first sentence.
Predicted label: 1, Probabilities: tensor([[0.2130, 0.7870]], device='cuda:0')


Sentence: Here is another one.
Predicted label: 1, Probabilities: tensor([[0.3600, 0.6400]], device='cuda:0')


