In [8]:
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import transformers
from transformers import AlbertTokenizer, AlbertModel
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import KeyedVectors
from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity

label_names = {'SUPPORTS': 0, 'NOT_ENOUGH_INFO': 1, 'REFUTES': 2, 'DISPUTED': 3}
evd = pd.read_json('/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/evidence.json', orient='index')
evidences_list = evd.iloc[:, 0].tolist()
org_train = pd.read_json('/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/train-claims.json').T
org_dev = pd.read_json('/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/dev-claims.json').T

#prep the data needed for evidence label classification
def bert_classifier_prep(train_claims):
    bert_train = []
    #loop every row of the train_claims
    for row in train_claims.iterrows():
        label = label_names[row[1]['claim_label']]
        if label == 3:
            continue
        else:
            claim_text = row[1]['claim_text']

            for evidence_id in row[1]['evidences']:
                evidence_id = int(evidence_id.split('-')[1])
                evidence_text = evidences_list[evidence_id]
                bert_train.append((claim_text, label, evidence_text))
            
    return bert_train


class TrainDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=120):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        claim_text, claim_label, evidence_text = self.data[idx]
        inputs = self.tokenizer(
            claim_text,
            evidence_text,
            return_attention_mask=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt'
        )
        token_ids = inputs['input_ids'][0]
        mask = inputs['attention_mask'][0]
        
        return {'input_ids':token_ids, 'attention_mask':mask, 'target':torch.tensor(claim_label, dtype=torch.long)}

class EvidenceClassifier(torch.nn.Module):
    def __init__(self):
        super(EvidenceClassifier, self).__init__()
        self.bert = transformers.AlbertModel.from_pretrained('albert-base-v2')
        self.dropout1 = torch.nn.Dropout(p=0.2)  # increased dropout
        self.fc1 = torch.nn.Linear(self.bert.config.hidden_size, 32)  # decreased model complexity
        self.dropout2 = torch.nn.Dropout(p=0.2)  # add another dropout layer
        self.fc2 = torch.nn.Linear(32, 3)  # final fully connected layer
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout1(pooled_output)
        fc1_output = self.fc1(pooled_output)
        fc1_output = self.dropout2(fc1_output)
        logits = self.fc2(fc1_output)
        return logits


def evaluate(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].cuda()
            attention_mask = batch['attention_mask'].cuda()
            target = batch['target'].cuda()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = criterion(outputs, target)

            running_loss += loss.item() * input_ids.size(0)
            _, preds = torch.max(outputs, 1)
            running_corrects += torch.sum(preds == target)

    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = running_corrects.double() / len(dataloader.dataset)

    return epoch_loss, epoch_acc

def train(model, dataloader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    running_corrects = 0
    label_names = [0,1,2,3]
    label_corrects = [0] * len(label_names)
    label_totals = [0] * len(label_names)
    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        target = batch['target'].cuda()
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * input_ids.size(0)
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == target)
        for i in range(len(label_names)):
          label_corrects[i] += torch.sum((preds == i) & (target == i))
          label_totals[i] += torch.sum(target == i)
    test_accs = [correct.item() / total.item() if total.item() != 0 else 0 for correct, total in zip(label_corrects, label_totals)]
    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = running_corrects.double() / len(dataloader.dataset)
    for label, acc in zip(label_names, test_accs):
      print(f'Test Acc ({label}): {acc:.4f}')
    return epoch_loss, epoch_acc


def pretrain_bert(early_stop_epochs=2):
    tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2')
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    model = EvidenceClassifier().cuda()
    weight_decay = 1e-5
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=weight_decay)

    best_loss = None
    no_improve_epochs = 0

    for epoch in range(8):
        train_loss, train_acc = train(model, train_dataloader, criterion, optimizer)
        test_loss, test_acc = evaluate(model, test_dataloader, criterion)

        if best_loss is None or test_loss < best_loss:
            best_loss = test_loss
            no_improve_epochs = 0
            # save model
            torch.save(model.state_dict(), 'albert_classifier.pt')
        else:
            no_improve_epochs += 1

        print(f'Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}')

        if no_improve_epochs >= early_stop_epochs:
            print("Early stopping due to no improvement in validation loss for {} epochs.".format(early_stop_epochs))
            break


tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2')
train_dataset = TrainDataset(bert_classifier_prep(org_train), tokenizer, 300)
test_dataset = TrainDataset(bert_classifier_prep(org_dev), tokenizer, 300)
pretrain_bert()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing 

Test Acc (0): 0.3135
Test Acc (1): 0.7886
Test Acc (2): 0.0263
Test Acc (3): 0.0000


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch: 1 | Train Loss: 0.9517 | Train Acc: 0.5241 | Test Loss: 0.9345 | Test Acc: 0.5843


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Test Acc (0): 0.7327
Test Acc (1): 0.8601
Test Acc (2): 0.0000
Test Acc (3): 0.0000


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch: 2 | Train Loss: 0.7533 | Train Acc: 0.7088 | Test Loss: 0.9200 | Test Acc: 0.5982


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Test Acc (0): 0.8995
Test Acc (1): 0.9487
Test Acc (2): 0.0044
Test Acc (3): 0.0000
Epoch: 3 | Train Loss: 0.5328 | Train Acc: 0.8153 | Test Loss: 0.9888 | Test Acc: 0.6097


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Test Acc (0): 0.9635
Test Acc (1): 0.9860
Test Acc (2): 0.1247
Test Acc (3): 0.0000
Epoch: 4 | Train Loss: 0.3709 | Train Acc: 0.8724 | Test Loss: 1.1103 | Test Acc: 0.5866
Early stopping due to no improvement in validation loss for 2 epochs.
