# **Pipeline Configuration**

## Evidence Retrieval

*   BM25Okapi: k1=1.5, b=0.75, epsilon=0.25
*   SBERT (Bi-Encoder Cosine): *paraphrase-xlm-r-multilingual-v1*

## Classification

*   Model: *xlm-roberta-large*
*   max_length = 256, batch_size = 16
*   dev_set = 20%
*   Optimizer: AdamW (lr = 0.5e-05, weight_decay = 1e-5)
*   num epochs: 20
*   EarlyStopping: patience = 3

## Result:
*  Strict ACC: 76.3343 (4)	ACC: 81.6716 (5)	ACC@1: 78.1134 (4)



In [None]:
!pip install gdown underthesea transformers pyvi nltk pandas torch d2l

Collecting underthesea
  Downloading underthesea-6.8.0-py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
Collecting d2l
  Downloading d2l-1.0.3-py3-none-any.whl (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K 

In [None]:
import warnings
warnings.filterwarnings('ignore')
import logging
logging.disable(logging.WARNING)

# Importing necessary libraries
import os
import sys
import json
import logging
import math
import copy
import re

# Data handling libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

# PyTorch libraries
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

# Transformers library
import transformers
from transformers import AutoModel, AutoTokenizer
transformers.logging.set_verbosity_error()

# NLP libraries
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize
import nltk
from underthesea import sent_tokenize as under_sent_tokenize
from underthesea import text_normalize
from pyvi.ViTokenizer import tokenize
nltk.download('punkt')

# Distance and correlation libraries
from scipy.spatial.distance import euclidean, cityblock
from scipy.stats import pearsonr, spearmanr

# Multithreading libraries
from multiprocessing import Pool, cpu_count

# Progress bar library
from tqdm.auto import tqdm

# Google Drive download library
import gdown

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Evidence Retrieval

In [None]:
# Download the three competition files from Google Drive into the working
# directory (per the download log: public test, training set, private test).
!gdown '1VA3fYB0Llf29cvu5AHbRn6bCpAWRODgo'
!gdown '1DbIwYHBrofwQXeLhteHDWMQ69RkiBIBv'
!gdown '1iPN3hOb0Kt0Qkzq2MoEASXMTliEUJs0a'

# Both files are JSON objects keyed by record id.
with open("/content/ise-dsc01-train_v2.json") as f:
    json_data = json.load(f)
# NOTE(review): despite the variable name, this reads the *private* test file.
with open("/content/ise-dsc01-private-test-offcial.json") as f:
    json_data_public_test = json.load(f)

# One row per record; the JSON keys become the index and are also kept in an
# explicit "id" column, which the submission export reads later.
data_test = pd.DataFrame.from_dict(json_data_public_test, orient="index")
data_test["id"] =  data_test.index

# Training frame, with exact duplicate rows removed (the index is not reset).
data = pd.DataFrame.from_dict(json_data, orient="index")
data = data.drop_duplicates()
data.head(5)

Downloading...
From: https://drive.google.com/uc?id=1VA3fYB0Llf29cvu5AHbRn6bCpAWRODgo
To: /content/ise-dsc01-public-test-offcial.json
100% 19.3M/19.3M [00:00<00:00, 132MB/s] 
Downloading...
From (original): https://drive.google.com/uc?id=1DbIwYHBrofwQXeLhteHDWMQ69RkiBIBv
From (redirected): https://drive.google.com/uc?id=1DbIwYHBrofwQXeLhteHDWMQ69RkiBIBv&confirm=t&uuid=758edeee-22db-41de-a56a-69e226b799ec
To: /content/ise-dsc01-train_v2.json
100% 161M/161M [00:01<00:00, 135MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iPN3hOb0Kt0Qkzq2MoEASXMTliEUJs0a
To: /content/ise-dsc01-private-test-offcial.json
100% 21.1M/21.1M [00:00<00:00, 206MB/s]


Unnamed: 0,context,claim,verdict,evidence,domain
7125,"Phát biểu tại lễ ký kết vào ngày 17/11, Giám đ...","Ngoài việc không giới hạn mức lương, công ty c...",SUPPORTED,"Công ty cũng có chế độ đãi ngộ tốt, có lương t...",giao-duc
7126,"Phát biểu tại lễ ký kết vào ngày 17/11, Giám đ...","Thành lập năm 2016, phát triển phần mềm, tổ ch...",SUPPORTED,"FABA Technology thành lập năm 2016, chuyên cun...",giao-duc
7127,"Phát biểu tại lễ ký kết vào ngày 17/11, Giám đ...","Nhằm phát triển đội ngũ, FABA thường tổ chức n...",SUPPORTED,"Bên cạnh đó, FABA thường xuyên tổ chức những d...",giao-duc
7128,"Đối với các nhà khoa học, trí thức, điều kiện ...",Sở dĩ vị trí kỹ sư phần mềm có mức lương cao n...,SUPPORTED,"Theo Adeco Việt Nam, mức lương cao nhất đối vớ...",giao-duc
7129,"Đối với các nhà khoa học, trí thức, điều kiện ...","Theo Adeco Việt Nam, mức lương cao nhất đối vớ...",SUPPORTED,"Trong khi đó, theo báo cáo ""Thị trường nhân lự...",giao-duc


## BM25

In [None]:
class BM25:
    """Base class for BM25 rankers over a tokenised corpus.

    The constructor indexes the corpus (document lengths, per-document term
    counts, document frequencies) and then calls ``_calc_idf``. Subclasses
    must implement ``_calc_idf``, ``get_scores`` and ``get_batch_scores``.

    Args:
        corpus: Iterable of documents. Each document is a list of tokens,
            or a raw item that ``tokenizer`` turns into a list of tokens.
        tokenizer: Optional callable applied to every document in a
            multiprocessing pool before indexing.

    Raises:
        ZeroDivisionError: If the corpus contains no documents.
    """

    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = 0   # number of indexed documents
        self.avgdl = 0         # average document length, in tokens
        self.doc_freqs = []    # one {term: count} dict per document
        self.idf = {}          # term -> idf weight, filled by _calc_idf
        self.doc_len = []      # token count of each document
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        """Index ``corpus`` and return {term: number of documents containing it}."""
        nd = {}
        total_tokens = 0
        for document in corpus:
            self.doc_len.append(len(document))
            total_tokens += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            # Each document counts once per distinct term it contains.
            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

            self.corpus_size += 1

        self.avgdl = total_tokens / self.corpus_size
        return nd

    def _tokenize_corpus(self, corpus):
        """Tokenise every document in parallel with ``self.tokenizer``."""
        # The context manager shuts the worker pool down once the blocking
        # map() has returned, so worker processes are not leaked.
        with Pool(cpu_count()) as pool:
            return pool.map(self.tokenizer, corpus)

    def _calc_idf(self, nd):
        """Compute ``self.idf`` from document frequencies (subclass hook)."""
        raise NotImplementedError()

    def get_scores(self, query):
        """Score every indexed document against ``query`` (subclass hook)."""
        raise NotImplementedError()

    def get_batch_scores(self, query, doc_ids):
        """Score the documents listed in ``doc_ids`` against ``query`` (subclass hook)."""
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):
        """Return the ``n`` best documents and their min-max scaled scores.

        Args:
            query: Tokenised query.
            documents: Items to return, aligned one-to-one with the indexed
                corpus.
            n: Maximum number of documents to return.

        Returns:
            ``(top_documents, top_scores)``, best first. Scores are rescaled
            to [0, 1] over the whole corpus; if all raw scores are equal,
            every document gets 1.0.
        """
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(query)
        min_score = np.min(scores)
        max_score = np.max(scores)

        # Scale scores to the 0-1 range; a constant score vector has no
        # range to scale by, so it maps to all ones.
        if max_score != min_score:
            scaled_scores = (scores - min_score) / (max_score - min_score)
        else:
            scaled_scores = np.ones(self.corpus_size)

        top_n_indices = np.argsort(scaled_scores)[::-1][:n]
        top_n_scaled_scores = [scaled_scores[i] for i in top_n_indices]

        return [documents[i] for i in top_n_indices], top_n_scaled_scores


class BM25Okapi(BM25):
    """Okapi BM25 ranker with an epsilon floor on negative idf values.

    Args:
        corpus: Documents to index (see ``BM25``).
        tokenizer: Optional tokenizer forwarded to ``BM25``.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation parameter.
        epsilon: Multiplier of the average idf used as the floor value for
            terms whose raw idf is negative.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
        # Hyper-parameters are stored first: the base constructor calls
        # _calc_idf, which reads self.epsilon.
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """
        Fill ``self.idf`` from the document frequencies in ``nd``.
        Terms with a negative idf (present in more than half of the
        documents) are reset to ``epsilon * average_idf``.
        """
        running_total = 0
        below_zero = []
        for term, doc_count in nd.items():
            value = math.log(self.corpus_size - doc_count + 0.5) - math.log(doc_count + 0.5)
            self.idf[term] = value
            running_total += value
            if value < 0:
                below_zero.append(term)
        self.average_idf = running_total / len(self.idf)

        floor_value = self.epsilon * self.average_idf
        self.idf.update(dict.fromkeys(below_zero, floor_value))

    def _length_penalty(self, doc_len):
        """Length-dependent part of the BM25 denominator for the given lengths."""
        return self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)

    def get_scores(self, query):
        """
        Score every indexed document against the tokenised ``query``.
        Uses the log-idf weighting of the ATIRE BM25 variant, with the idf
        floor applied in ``_calc_idf``; see [Trotman, A., X. Jia, M. Crane,
        Towards an Efficient and Effective Search Engine].
        :param query: list of query tokens
        :return: numpy array with one score per indexed document
        """
        totals = np.zeros(self.corpus_size)
        penalty = self._length_penalty(np.array(self.doc_len))
        for term in query:
            term_freq = np.array([(counts.get(term) or 0) for counts in self.doc_freqs])
            weight = self.idf.get(term) or 0
            totals += weight * (term_freq * (self.k1 + 1) / (term_freq + penalty))
        return totals

    def get_batch_scores(self, query, doc_ids):
        """
        Score only the documents whose positions are listed in ``doc_ids``.
        Returns a plain list with one score per requested document.
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        totals = np.zeros(len(doc_ids))
        penalty = self._length_penalty(np.array(self.doc_len)[doc_ids])
        for term in query:
            term_freq = np.array([(self.doc_freqs[di].get(term) or 0) for di in doc_ids])
            weight = self.idf.get(term) or 0
            totals += weight * (term_freq * (self.k1 + 1) / (term_freq + penalty))
        return totals.tolist()

In [None]:
def preprocess_text(text: str) -> str:
    """Strip basic punctuation, collapse whitespace and lower-case ``text``.

    The characters removed are: ' " , . ? : - !
    """
    without_punctuation = re.sub(r"['\",\.\?:\-!]", "", text)
    words = without_punctuation.strip().split()
    return " ".join(words).lower()

def evidence_top_n(context, query, n=5):
    """Shortlist the context sentences that best match ``query`` with BM25.

    The context is split into sentences with ``split_text``; sentences and
    query are tokenised on single spaces and ranked with ``BM25Okapi``.

    Args:
        context: Raw context document.
        query: Claim text to match against.
        n: Maximum number of sentences to return (default 5).

    Returns:
        ``(top_docs, top_scores)``: up to ``n`` sentences, best first, and
        their min-max scaled BM25 scores. Both lists are empty when the
        context contains no sentences.
    """
    sentences = split_text(context)
    # An empty corpus makes the BM25 index divide by zero while computing
    # the average document length, so answer "no evidence" directly.
    if not sentences:
        return [], []

    tokenized_sentences = [str(doc).split(" ") for doc in sentences]
    bm25 = BM25Okapi(tokenized_sentences)
    tokenized_query = query.split(" ")
    return bm25.get_top_n(tokenized_query, sentences, n=n)

## SBERT

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Sentence encoder used to re-rank the BM25 candidates; tokenizer and weights
# come from the same checkpoint, and the model is moved to `device`.
tokenizer_sbert = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
model_sbert = AutoModel.from_pretrained('sentence-transformers/paraphrase-xlm-r-multilingual-v1').to(device)

cuda


tokenizer_config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    ``model_output[0]`` supplies the token embeddings; ``attention_mask`` is
    broadcast over the embedding dimension. The divisor is clamped to 1e-9
    so a fully masked sequence does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

def similarities(context: list, text: str, weight: list):
    """Re-rank candidate sentences by SBERT cosine similarity to ``text``.

    Each candidate's cosine similarity to the claim is min-max scaled over
    the candidate set and multiplied by the matching entry of ``weight``.
    A single candidate therefore always scores 0, because min-max scaling
    of one value yields 0.

    Args:
        context: Candidate evidence sentences.
        text: The claim.
        weight: One weight per candidate, aligned with ``context``.

    Returns:
        ``(top_k, simi, top_5)``: the best sentence as a one-element list,
        its combined score as a one-element list, and up to five sentences
        in descending score order. All three are empty when ``context`` is
        empty.
    """
    # Nothing to rank; MinMaxScaler would also reject an empty array.
    if not context:
        return [], [], []

    sentences = [text] + context

    encoded_input = tokenizer_sbert(sentences, padding=True, truncation=True, return_tensors='pt')
    # Send the inputs to the same `device` the model was loaded on instead
    # of a hard-coded 'cuda', so the function also runs on CPU-only hosts.
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

    # Compute token embeddings, then mean-pool them into sentence embeddings.
    with torch.no_grad():
        model_output = model_sbert(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Row 0 is the claim; the remaining rows are the candidates. The claim
    # row broadcasts against every candidate row.
    claim_embedding = sentence_embeddings[0].unsqueeze(0)
    cosine = nn.CosineSimilarity(dim=1, eps=1e-6)
    raw_scores = cosine(claim_embedding, sentence_embeddings[1:]).cpu().tolist()

    scaled_scores = MinMaxScaler().fit_transform(np.array(raw_scores).reshape(-1, 1)).flatten()
    ranked = [
        (sentence, scaled_value * weight[i])
        for i, (sentence, scaled_value) in enumerate(zip(context, scaled_scores))
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    top_k = [sentence for sentence, _ in ranked[:1]]
    simi = [score for _, score in ranked[:1]]
    top_5 = [sentence for sentence, _ in ranked[:5]]
    return top_k, simi, top_5

## Clean and Split Text

In [None]:
def clean_quotes(sentence):
    """Remove '!', ',' and '?' from every double-quoted span of ``sentence``."""
    def _strip_marks(match):
        quoted = match.group(0)
        for mark in ('!', ',', '?'):
            quoted = quoted.replace(mark, '')
        return quoted

    return re.sub(r'"([^"]*)"', _strip_marks, sentence)

def remove_brackets(text):
    """Remove ellipses and periods from every parenthesised span of ``text``."""
    def _drop_periods(match):
        bracketed = match.group(0)
        bracketed = bracketed.replace('...', '')
        return bracketed.replace('.', '')

    return re.sub(r'\([^)]*\)', _drop_periods, text)

def split_text(content):
    """Split a document into sentences.

    The text is cut into paragraphs at blank lines; each paragraph has its
    ellipses, quotes and stray periods normalised and is then passed to
    NLTK's sentence tokenizer. The replacements below are order-dependent.

    Args:
        content: Raw document text.

    Returns:
        A flat list of sentence strings, in document order.
    """
    # Split the text by "\n\n"
    paragraphs = content.split('\n\n')

    # Split each paragraph into sentences
    sentences = []
    for paragraph in paragraphs:
        # Drop ellipses that sit right before ")", " ," or a double quote.
        paragraph = paragraph.replace('...)', ')')
        paragraph = paragraph.replace('... ,', ',')
        paragraph = re.sub(r'\.\.\.(?=\")', '', paragraph)
        paragraph = paragraph.replace('\n', ' ')  # Remove internal line breaks
        # Remove '!', ',' and '?' inside double-quoted spans.
        paragraph = clean_quotes(paragraph)
        # A period followed by whitespace and an ASCII lowercase letter is
        # dropped and that letter is upper-cased.
        # NOTE(review): [a-z] does not cover accented lowercase letters - confirm this is intended.
        paragraph = re.sub(r'\.(\s[a-z])', lambda match: match.group(1).upper(), paragraph)
        paragraph = paragraph.replace(' .', '.')  # Remove space before period
        # A "?" followed by whitespace and an ASCII lowercase letter becomes a space.
        paragraph = re.sub(r'\?(?=\s+[a-z])', ' ', paragraph)
        # Remaining ellipses: removed before a comma, turned into a space
        # before a lowercase word, otherwise turned into a period.
        paragraph = re.sub(r'\.\.\.(?=\,)', '', paragraph)
        paragraph = re.sub(r'\.\.\.(?=\s+[a-z])', ' ', paragraph)
        paragraph = paragraph.replace('...', '. ')  # Replace "..." with ". "
        paragraph = paragraph.replace('..', '. ')  # Replace ".." with ". "
        paragraph = paragraph.replace('. ', ' . ')  # Put a space on both sides of a period that is followed by a space
        paragraph = paragraph.replace('  ', ' ')  # Collapse double spaces (single pass)
        paragraph = paragraph.strip()  # Strip leading/trailing spaces

        # Tokenize the paragraph into sentences using NLTK
        paragraph_sentences = nltk.sent_tokenize(paragraph)
        sentences.extend(paragraph_sentences)

    return sentences

## Evidence Retrieval BM25 + XLM

### Training data

In [None]:
# Evidence retrieval for the training set: BM25 shortlists candidate
# sentences from each context, then SBERT re-ranks them.
list_evidence_top5 = []
list_evidence_top1 = []

# Iterate over the column values directly. The frame is indexed by the JSON
# record keys, so integer lookups such as `data.claim[i]` only work through
# pandas' deprecated positional fallback.
for statement, context in tqdm(zip(data['claim'], data['context']), total=len(data)):
    evidence_top5, top5_consine = evidence_top_n(context, statement)  # BM25 top 5
    evidence_top1, top1_consine, rank_5 = similarities(evidence_top5, statement, top5_consine)  # SBERT top 1

    list_evidence_top5.append(rank_5)
    # `evidence_top1` is a list holding at most one sentence.
    list_evidence_top1.append("".join(evidence_top1))

data['evidence_top5'] = list_evidence_top5
data['evidence_top1'] = list_evidence_top1

### Test data

In [None]:
# Evidence retrieval for the test set, using the same BM25 + SBERT pipeline
# as the training set.
list_evidence_top5 = []
list_evidence_top1 = []

# Iterate over the column values directly. The frame is indexed by the JSON
# record keys, so integer lookups such as `data_test.claim[i]` only work
# through pandas' deprecated positional fallback.
for statement, context in tqdm(zip(data_test['claim'], data_test['context']), total=len(data_test)):
    evidence_top5, top5_consine = evidence_top_n(context, statement)  # BM25 top 5
    evidence_top1, top1_consine, rank_5 = similarities(evidence_top5, statement, top5_consine)  # SBERT top 1

    list_evidence_top5.append(rank_5)
    # `evidence_top1` is a list holding at most one sentence.
    list_evidence_top1.append("".join(evidence_top1))

data_test['evidence_top5'] = list_evidence_top5
# The top-1 sentence is stored as "evidence", the field name used in the submission.
data_test['evidence'] = list_evidence_top1

  0%|          | 0/5396 [00:00<?, ?it/s]

# Classification

## EarlyStopping + Model

In [None]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the
    model. Each improvement saves the model's ``state_dict``; after
    ``patience`` consecutive epochs without improvement ``early_stop``
    becomes True.

    Args:
        patience: Consecutive non-improving epochs tolerated before stopping.
        verbose: If True, print a message whenever a checkpoint is saved.
        delta: Minimum decrease in validation loss that counts as an
            improvement.
        path: File the best ``state_dict`` is written to.
    """

    def __init__(self, patience=3, verbose=False, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # epochs since the last improvement
        self.best_score = None    # best (negated) validation loss so far
        self.early_stop = False
        # float('inf') rather than np.Inf: that alias was removed in NumPy 2.0.
        self.val_loss_min = float('inf')
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state."""
        # Negate the loss so that a higher score is better.
        score = -val_loss

        if self.best_score is None or score >= self.best_score + self.delta:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Write ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


In [None]:
class SentencePairDataset(Dataset):
    """Torch dataset that tokenises (sentence1, sentence2) pairs on access.

    Every item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors, padded or truncated to ``max_length``, plus a long ``label``.
    """

    def __init__(self, sentence_pairs, labels, tokenizer, max_length):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        first, second = self.sentence_pairs[idx]
        target = self.labels[idx]

        tokenizer_options = dict(
            text_pair=second,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(first, **tokenizer_options)

        # The tokenizer returns batched tensors; flatten them to 1-D.
        item = {name: encoded[name].flatten() for name in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(target, dtype=torch.long)
        return item

In [None]:
class PhoBERTClassifier(nn.Module):
    """Sentence-pair classifier head on top of a transformer encoder.

    The encoder's pooled output goes through LayerNorm, BatchNorm1d and a
    linear layer; the result then passes through ELU and dropout.

    Args:
        phobert: Encoder module. Called with ``return_dict=False`` it must
            return a tuple whose second element is the pooled output, and
            it must expose ``config.hidden_size``.
        num_classes: Number of output classes.
    """

    def __init__(self, phobert, num_classes):
        super().__init__()
        self.phobert = phobert
        hidden_size = self.phobert.config.hidden_size
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.2)
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        # A sized nn.Linear instead of nn.LazyLinear: the weights exist at
        # construction, so an optimizer built before the first forward pass
        # sees real parameters. Parameter names and shapes are unchanged, so
        # existing checkpoints still load.
        self.linear = nn.Linear(hidden_size, num_classes)
        self.activation = nn.ELU()

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch, num_classes)`` tensor of class scores."""
        _, pooled_output = self.phobert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        normalised = self.batch_norm(self.layer_norm(pooled_output))
        logits = self.linear(normalised)
        # NOTE(review): ELU and dropout are applied to the final class
        # scores, which is unusual for a classification head. Kept as-is so
        # the trained checkpoint and reported results stay valid - confirm
        # whether this is intended.
        return self.dropout(self.activation(logits))

## XLM-R

In [None]:
# Backbone for the verdict classifier. The variable is called `phobert`, but
# the checkpoint loaded here is XLM-RoBERTa large.
modelname = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(modelname)
phobert = AutoModel.from_pretrained(modelname)

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [None]:
# Classifier inputs: each claim paired with its retrieved top-1 evidence.
X1 = data['claim']
X2 = data['evidence_top1']
X = list(zip(X1, X2))

# Encode the verdict strings as integer class ids.
data['verdict'] = data['verdict'].replace({"SUPPORTED": 0, "REFUTED": 1, "NEI": 2})

# Evidence values taken from the dataset's own "evidence" column.
BTC_e = data['evidence'].tolist()
y = data['verdict'].tolist()

In [None]:
max_length = 256

# Stratified 80/20 train/dev split; the dataset-provided evidence is split
# alongside so it stays aligned with the examples.
X_train, X_dev, y_train, y_dev, BTC_e_train, BTC_e_dev = train_test_split(
    X, y, BTC_e, test_size=0.2, random_state=42, stratify=y
)

train_dataset = SentencePairDataset(X_train, y_train, tokenizer, max_length)
dev_dataset = SentencePairDataset(X_dev, y_dev, tokenizer, max_length)

# Reshuffle the training examples every epoch so the model does not see the
# same batches in the same order each time; the seeded generator keeps the
# order reproducible. The dev loader stays in a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

In [None]:
# Build the 3-class classifier around the loaded backbone and move it to the
# GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PhoBERTClassifier(phobert, num_classes=3).to(device)
print(device)

cuda


In [None]:
# Run Python's garbage collector, then ask PyTorch to release the GPU memory
# it no longer needs, before training starts.
import gc
gc.collect()

torch.cuda.empty_cache()
torch.cuda.ipc_collect()

## Training

In [None]:
# Initialise the loss function, optimizer and learning-rate scheduler.
from torch.optim.lr_scheduler import StepLR
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(model.parameters(), lr = 0.5e-5,weight_decay=1e-5)
# The learning rate is multiplied by 0.1 every 5 epochs.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
# NOTE(review): this rebinds `tqdm`, which the import cell took from tqdm.auto.
from tqdm import tqdm
epochs = 20
early_stopping = EarlyStopping(patience=3, verbose=True)
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    # The scheduler is stepped once per epoch, after all optimizer steps.
    scheduler.step()
    # Mean training loss per batch (the name says "val", but this is the training loss).
    avg_val_train = train_loss / len(train_loader)
    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    predictions = []
    true_labels = []
    soft_max = []  # per-example class probabilities
    for batch in tqdm(dev_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted =  torch.softmax(outputs, dim=1)
            soft_max.extend(predicted.cpu().numpy().tolist())
            predicted =  torch.argmax(predicted, dim=1)
            predictions.extend(predicted.cpu().numpy().tolist())
            true_labels.extend(labels.cpu().numpy().tolist())

    avg_val_loss = val_loss / len(dev_loader)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Validation Loss: {avg_val_loss:.6f}")
    print(f"Training Loss: {avg_val_train:.6f}")
    print(classification_report(true_labels, predictions, digits=4))
    # Early stopping tracks the mean validation loss and saves a checkpoint
    # whenever it improves.
    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Restore the weights saved at the lowest validation loss.
model.load_state_dict(torch.load('checkpoint.pt'))

100%|██████████| 1862/1862 [08:24<00:00,  3.69it/s]
100%|██████████| 466/466 [00:37<00:00, 12.54it/s]


Epoch 1/20
Validation Loss: 0.363170
Training Loss: 0.693674
              precision    recall  f1-score   support

           0     0.8302    0.8938    0.8608      2457
           1     0.9359    0.8514    0.8917      2504
           2     0.8448    0.8577    0.8512      2488

    accuracy                         0.8675      7449
   macro avg     0.8703    0.8676    0.8679      7449
weighted avg     0.8706    0.8675    0.8680      7449

Validation loss decreased (inf --> 0.363170).  Saving model ...


100%|██████████| 1862/1862 [08:23<00:00,  3.69it/s]
100%|██████████| 466/466 [00:37<00:00, 12.56it/s]


Epoch 2/20
Validation Loss: 0.319841
Training Loss: 0.461561
              precision    recall  f1-score   support

           0     0.9086    0.8734    0.8906      2457
           1     0.9450    0.8710    0.9065      2504
           2     0.8248    0.9212    0.8703      2488

    accuracy                         0.8886      7449
   macro avg     0.8928    0.8886    0.8891      7449
weighted avg     0.8928    0.8886    0.8892      7449

Validation loss decreased (0.363170 --> 0.319841).  Saving model ...


100%|██████████| 1862/1862 [08:23<00:00,  3.70it/s]
100%|██████████| 466/466 [00:37<00:00, 12.58it/s]


Epoch 3/20
Validation Loss: 0.305265
Training Loss: 0.389214
              precision    recall  f1-score   support

           0     0.9188    0.8844    0.9013      2457
           1     0.9313    0.8930    0.9117      2504
           2     0.8543    0.9212    0.8865      2488

    accuracy                         0.8996      7449
   macro avg     0.9015    0.8995    0.8998      7449
weighted avg     0.9014    0.8996    0.8998      7449

Validation loss decreased (0.319841 --> 0.305265).  Saving model ...


100%|██████████| 1862/1862 [08:23<00:00,  3.70it/s]
100%|██████████| 466/466 [00:36<00:00, 12.60it/s]


Epoch 4/20
Validation Loss: 0.297697
Training Loss: 0.338494
              precision    recall  f1-score   support

           0     0.8902    0.9039    0.8970      2457
           1     0.9201    0.9157    0.9179      2504
           2     0.8940    0.8846    0.8893      2488

    accuracy                         0.9015      7449
   macro avg     0.9014    0.9014    0.9014      7449
weighted avg     0.9015    0.9015    0.9015      7449

Validation loss decreased (0.305265 --> 0.297697).  Saving model ...


100%|██████████| 1862/1862 [08:23<00:00,  3.70it/s]
100%|██████████| 466/466 [00:37<00:00, 12.56it/s]


Epoch 5/20
Validation Loss: 0.287385
Training Loss: 0.307361
              precision    recall  f1-score   support

           0     0.9060    0.9023    0.9042      2457
           1     0.9350    0.9193    0.9271      2504
           2     0.8902    0.9088    0.8994      2488

    accuracy                         0.9102      7449
   macro avg     0.9104    0.9101    0.9102      7449
weighted avg     0.9105    0.9102    0.9103      7449

Validation loss decreased (0.297697 --> 0.287385).  Saving model ...


100%|██████████| 1862/1862 [08:24<00:00,  3.69it/s]
100%|██████████| 466/466 [00:36<00:00, 12.62it/s]


Epoch 6/20
Validation Loss: 0.290173
Training Loss: 0.240078
              precision    recall  f1-score   support

           0     0.9149    0.9056    0.9102      2457
           1     0.9426    0.9109    0.9265      2504
           2     0.8841    0.9228    0.9030      2488

    accuracy                         0.9131      7449
   macro avg     0.9138    0.9131    0.9132      7449
weighted avg     0.9139    0.9131    0.9133      7449



100%|██████████| 1862/1862 [08:24<00:00,  3.69it/s]
100%|██████████| 466/466 [00:37<00:00, 12.56it/s]


Epoch 7/20
Validation Loss: 0.296537
Training Loss: 0.225336
              precision    recall  f1-score   support

           0     0.9172    0.9105    0.9138      2457
           1     0.9442    0.9121    0.9279      2504
           2     0.8888    0.9256    0.9069      2488

    accuracy                         0.9161      7449
   macro avg     0.9167    0.9161    0.9162      7449
weighted avg     0.9168    0.9161    0.9162      7449



100%|██████████| 1862/1862 [08:24<00:00,  3.69it/s]
100%|██████████| 466/466 [00:37<00:00, 12.52it/s]


Epoch 8/20
Validation Loss: 0.298938
Training Loss: 0.211119
              precision    recall  f1-score   support

           0     0.9187    0.9105    0.9146      2457
           1     0.9431    0.9141    0.9284      2504
           2     0.8902    0.9256    0.9076      2488

    accuracy                         0.9168      7449
   macro avg     0.9173    0.9167    0.9169      7449
weighted avg     0.9174    0.9168    0.9169      7449

Early stopping


<All keys matched successfully>

## Test Model

In [None]:
# Rebuild the classifier and tokenizer, then load the best checkpoint.
# `phobert` is the same backbone object that the earlier `model` wrapped;
# load_state_dict overwrites its weights with the checkpointed values.
model = PhoBERTClassifier(phobert, num_classes=3).to(device)
tokenizer = AutoTokenizer.from_pretrained(modelname)
model.load_state_dict(torch.load('checkpoint.pt'))

<All keys matched successfully>

In [None]:
# Test inputs: each claim paired with its retrieved top-1 evidence.
X1_pub_test = data_test['claim']
X2_pub_test = data_test['evidence']
X_pub_test = list(zip(X1_pub_test, X2_pub_test))

# The dataset class requires labels, but the test set has none. Use dummy
# labels sized to the data rather than a hard-coded row count, so this cell
# works for any test file.
y_pub_test = [1] * len(X_pub_test)

test_dataset = SentencePairDataset(X_pub_test, y_pub_test, tokenizer, 256)

test_loader_pub = DataLoader(test_dataset, batch_size=8)

In [None]:
# Predict a class id for every test example, in loader order.
model.eval()
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader_pub):
        outputs = model(batch["input_ids"].to(device), batch["attention_mask"].to(device))
        predicted = torch.argmax(outputs, dim=1)
        predictions.extend(predicted.cpu().numpy().tolist())

100%|██████████| 675/675 [00:28<00:00, 23.97it/s]


In [None]:
# Start the submission from the test frame and attach the predicted class ids.
data_submit = data_test.copy()
data_submit['verdict'] = predictions
# Drop helper columns that are not part of the submission. 'top3', 'top5'
# and 'top2' are not created anywhere in this notebook, so missing columns
# are ignored instead of raising KeyError.
data_submit = data_submit.drop(
    columns=['context', 'claim', 'evidence_top5', 'top3', 'top5', 'top2'],
    errors='ignore',
)

In [None]:
# Rows predicted as NEI (class id 2) carry no evidence. A single .loc
# assignment replaces the row loop, which used chained indexing
# (`df[col][i] = ...`) and integer lookups on a string-keyed index.
data_submit.loc[data_submit['verdict'] == 2, 'evidence'] = ""

In [None]:
# Convert the predicted class ids back to the competition's verdict labels.
data_submit['verdict'] = data_submit['verdict'].replace({0: "SUPPORTED", 1: "REFUTED", 2: "NEI"})

In [None]:
import zipfile
import json

# Keep only the submission fields. `data_submit` is deliberately NOT rebuilt
# from `data_test` here: doing so discards the predicted verdicts prepared
# in the cells above.
data_submit = data_submit.drop(columns=['context', 'claim', 'evidence_top5'], errors='ignore')

# One entry per test record, keyed by record id. The verdict comes from the
# model's prediction rather than a constant label.
output = {
    str(row['id']): {
        'verdict': row['verdict'],
        'evidence': row['evidence'],
    }
    for _, row in data_submit.iterrows()
}

# Serialise once and reuse the same text for the JSON file and the zip entry.
json_str = json.dumps(output, indent=4, ensure_ascii=False)

with open('private_result.json', 'w', encoding='utf-8') as f:
    f.write(json_str)

with zipfile.ZipFile('results.zip', 'w') as zipf:
    zipf.writestr('private_result.json', json_str)

print("Done.")

Done.


In [None]:
# Sanity check: number of test rows per verdict label in the submission frame.
distribution = data_submit['verdict'].value_counts()
print(distribution)

KeyError: 'verdict'