In [1]:
import os
os.environ['CUDA_ENVIRONMENT_DEVICES'] = "0"

In [2]:
import sys
import numpy as np
from numpy import asarray,zeros
import pandas as pd 
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
import transformers
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import timm

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Load the data
bert_source = np.load("../data/bert_text_only_source.npy")
bert_target = np.load("../data/bert_text_only_target.npy")
labels_data = np.load("../data/labels.npy").squeeze(1)
ids_data = np.load("../data/ids.npy").squeeze(1)
# Printing the shapes
print(bert_source.shape)
print(bert_target.shape)
print(labels_data.shape)
print(ids_data.shape)

(11766, 768)
(11766, 768)
(11766,)
(11766,)


# Data Loader

In [5]:
def get_data_loader(batch_size, target_data, source_data, labels, split_type = 'train'):
    target_data = torch.tensor(target_data)
    source_data = torch.tensor(source_data)
    labels = torch.tensor(labels, dtype=torch.long)
    data = TensorDataset(target_data, source_data, labels)
    if split_type == 'train':
        sampler = RandomSampler(data)
    elif split_type == 'val':
        sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return data, sampler, dataloader

In [6]:
# Split data
train_target_data, test_target_data, train_source_data, test_source_data, train_labels, test_labels = train_test_split(bert_target, bert_source, labels_data, test_size=0.2, random_state=43)

In [7]:
train_ids, test_ids = train_test_split(ids_data, test_size=0.2, random_state=43)

In [8]:
batch_size = 64
train_data, train_sampler, train_dataloader = get_data_loader(batch_size, train_target_data, train_source_data, train_labels, 'train')
test_data, test_sampler, test_dataloader = get_data_loader(batch_size, test_target_data, test_source_data, test_labels, 'val')

# Model

In [10]:
# Text Model
class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        #  Instantiating BERT-based model object
        self.linear_1 = nn.Linear(1536, 512, bias=True)
        self.tanh1 = nn.Tanh()
        self.linear_2 = nn.Linear(512, 128, bias=True)
        self.tanh2 = nn.Tanh()
        self.classification = nn.Linear(128, 2)
    def forward(self, target_features, source_features):
        concat_tensor = torch.cat([target_features, source_features], dim=1)
        inter_1 = self.tanh1(self.linear_1(concat_tensor))
        inter_2 = self.tanh2(self.linear_2(inter_1))
        logits = self.classification(inter_2)
        return logits

In [11]:
# Get model
model = BERTModel().to(device)

# Training

In [12]:
# Optimizer and scheduler
def get_optimizer_scheduler(name, model, train_dataloader_len, epochs, lr_set):
    optimizer = AdamW(model.parameters(),
                lr = lr_set, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
    )

    total_steps = train_dataloader_len * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = total_steps//2, # Default value in run_glue.py
                                                num_training_steps = total_steps)
    return optimizer, scheduler

In [13]:
# Getting the optimizer and scheduler
epochs = 5
lr = 3e-5 # Less LR
# lr = 0.5
iters_to_accumulate = 2
name = "Adam"
# name = "LARS-SGD"
criterion = nn.CrossEntropyLoss()
optimizer, scheduler = get_optimizer_scheduler(name, model, len(train_dataloader), epochs, lr)

In [14]:
################ Evaluating Loss ######################
#######################################################
def evaluate_loss(net, device, criterion, dataloader):
    net.eval()
    mean_loss = 0
    count = 0
    with torch.no_grad():
        for it, (target_inputs, source_inputs, labels) in enumerate(tqdm(dataloader)):
            target_inputs, source_inputs, labels = target_inputs.to(device), source_inputs.to(device), labels.to(device)
            logits = net(target_inputs, source_inputs)
            mean_loss += criterion(logits.squeeze(-1), labels).item() # initially it was logits.squeeze(-1)
            count += 1
    return mean_loss / count

In [15]:
################ Flat Accuracy Calculation ####################
###############################################################
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
################ Validation Accuracy Calculation ####################
###############################################################
def evaluate_accuracy(model, device, validation_dataloader):
    model.eval()
    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)	    
        # Unpack the inputs from our dataloader
        b_t_inputs, b_s_inputs, b_labels = batch	    

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad(): 
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            logits = model(b_t_inputs, b_s_inputs)       

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1
    accuracy = eval_accuracy/nb_eval_steps
    return accuracy

In [16]:
def train_model(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []
    # Iterating over all epochs
    for ep in range(epochs):
        net.train()
        running_loss = 0.0
        for it, (target_inputs, source_inputs, labels) in enumerate(tqdm(train_loader)):

            # Converting to cuda tensors
            target_inputs, source_inputs, labels = target_inputs.to(device), source_inputs.to(device), labels.to(device)
    		
            # Obtaining the logits from the model
            logits = net(target_inputs, source_inputs)
            # print(logits.device)

            # Computing loss
            # print(logits.squeeze(-1).shape)
            # print(labels.shape)
            loss = criterion(logits.squeeze(-1), labels)
            loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Calls backward()
            loss.backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                opti.step()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                net.zero_grad()


            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        val_accuracy = evaluate_accuracy(net, device, val_loader)
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))
        print("Epoch {} complete! Validation Accuracy : {}".format(ep+1, val_accuracy))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='saved_models/lr_{}_val_loss_{}_ep_{}_text_only.pt'.format(lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    net.load_state_dict(torch.load(path_to_model)) # Re-Loading the best model
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()
    return net

In [17]:
# Train the model
model = train_model(model, criterion, optimizer, lr, scheduler, train_dataloader, test_dataloader, epochs, iters_to_accumulate)

 35%|███▌      | 52/148 [00:00<00:00, 174.46it/s]


Iteration 29/148 of epoch 1 complete. Loss : 0.3485366712356436 

Iteration 58/148 of epoch 1 complete. Loss : 0.34677284853211765 


 77%|███████▋  | 114/148 [00:00<00:00, 195.87it/s]


Iteration 87/148 of epoch 1 complete. Loss : 0.34017938272706394 

Iteration 116/148 of epoch 1 complete. Loss : 0.33595694344619226 


100%|██████████| 148/148 [00:00<00:00, 190.03it/s]



Iteration 145/148 of epoch 1 complete. Loss : 0.33229528418902693 


100%|██████████| 37/37 [00:00<00:00, 459.33it/s]



Epoch 1 complete! Validation Loss : 0.6685686981355822
Epoch 1 complete! Validation Accuracy : 0.5626013513513514
Best validation loss improved from inf to 0.6685686981355822



 14%|█▍        | 21/148 [00:00<00:00, 207.33it/s]


Iteration 29/148 of epoch 2 complete. Loss : 0.32639308016875695 


 43%|████▎     | 64/148 [00:00<00:00, 211.08it/s]


Iteration 58/148 of epoch 2 complete. Loss : 0.32718196614035244 


 58%|█████▊    | 86/148 [00:00<00:00, 212.97it/s]


Iteration 87/148 of epoch 2 complete. Loss : 0.31860919553658057 


 88%|████████▊ | 130/148 [00:00<00:00, 214.75it/s]


Iteration 116/148 of epoch 2 complete. Loss : 0.316150039434433 


100%|██████████| 148/148 [00:00<00:00, 212.94it/s]



Iteration 145/148 of epoch 2 complete. Loss : 0.30957569027769155 


100%|██████████| 37/37 [00:00<00:00, 539.70it/s]



Epoch 2 complete! Validation Loss : 0.6160167520110672
Epoch 2 complete! Validation Accuracy : 0.6463851351351352
Best validation loss improved from 0.6685686981355822 to 0.6160167520110672



 15%|█▍        | 22/148 [00:00<00:00, 216.39it/s]


Iteration 29/148 of epoch 3 complete. Loss : 0.30176105375947626 


 45%|████▌     | 67/148 [00:00<00:00, 219.39it/s]


Iteration 58/148 of epoch 3 complete. Loss : 0.2956420450374998 


 60%|██████    | 89/148 [00:00<00:00, 216.68it/s]


Iteration 87/148 of epoch 3 complete. Loss : 0.2882735462024294 


 75%|███████▌  | 111/148 [00:00<00:00, 211.90it/s]


Iteration 116/148 of epoch 3 complete. Loss : 0.284658400149181 


100%|██████████| 148/148 [00:00<00:00, 213.74it/s]



Iteration 145/148 of epoch 3 complete. Loss : 0.273715587011699 


100%|██████████| 37/37 [00:00<00:00, 535.73it/s]



Epoch 3 complete! Validation Loss : 0.5429091582427154
Epoch 3 complete! Validation Accuracy : 0.7459797297297297
Best validation loss improved from 0.6160167520110672 to 0.5429091582427154



 15%|█▍        | 22/148 [00:00<00:00, 218.90it/s]


Iteration 29/148 of epoch 4 complete. Loss : 0.26840604482025937 


 44%|████▍     | 65/148 [00:00<00:00, 208.40it/s]


Iteration 58/148 of epoch 4 complete. Loss : 0.25910601019859314 


 58%|█████▊    | 86/148 [00:00<00:00, 207.95it/s]


Iteration 87/148 of epoch 4 complete. Loss : 0.24787691903525386 


 86%|████████▋ | 128/148 [00:00<00:00, 206.60it/s]


Iteration 116/148 of epoch 4 complete. Loss : 0.24377916907442027 


100%|██████████| 148/148 [00:00<00:00, 208.28it/s]



Iteration 145/148 of epoch 4 complete. Loss : 0.2372358842142697 


100%|██████████| 37/37 [00:00<00:00, 512.02it/s]



Epoch 4 complete! Validation Loss : 0.46383886965545446
Epoch 4 complete! Validation Accuracy : 0.8057601351351351
Best validation loss improved from 0.5429091582427154 to 0.46383886965545446



 14%|█▍        | 21/148 [00:00<00:00, 209.67it/s]


Iteration 29/148 of epoch 5 complete. Loss : 0.2175942027363284 


 43%|████▎     | 64/148 [00:00<00:00, 207.05it/s]


Iteration 58/148 of epoch 5 complete. Loss : 0.23491663768373686 


 58%|█████▊    | 86/148 [00:00<00:00, 208.77it/s]


Iteration 87/148 of epoch 5 complete. Loss : 0.2148934412619163 


 87%|████████▋ | 129/148 [00:00<00:00, 208.92it/s]


Iteration 116/148 of epoch 5 complete. Loss : 0.2106730619381214 


100%|██████████| 148/148 [00:00<00:00, 208.54it/s]



Iteration 145/148 of epoch 5 complete. Loss : 0.20410434747564382 


100%|██████████| 37/37 [00:00<00:00, 525.58it/s]



Epoch 5 complete! Validation Loss : 0.4100561278897363
Epoch 5 complete! Validation Accuracy : 0.8169763513513513
Best validation loss improved from 0.46383886965545446 to 0.4100561278897363

The model has been saved in saved_models/lr_3e-05_val_loss_0.41006_ep_5_text_only.pt


In [20]:
def evaluate(prediction_dataloader, model, model_name, path_to_model, load = False):
    # Prediction on test set
    if load:
        print("Loading the weights of the model...")
        model.load_state_dict(torch.load(path_to_model))

    print('Evaluating on the testset')

    # Put model in evaluation mode
    model.eval()

    # Tracking variables 
    predictions , true_labels = [], []

    # Predict 
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_t_inputs, b_s_inputs, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and 
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_t_inputs, b_s_inputs)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        # Store predictions and true labels
        predictions.extend(pred_flat)
        true_labels.extend(labels_flat)
    # Code for result display
    print(model_name, 'Text Only BERT Classification accuracy is')
    print(metrics.accuracy_score(true_labels, predictions)*100)
    print(classification_report(true_labels, predictions, target_names = ['fake', 'real']))
	# For error analysis
    new_df = pd.DataFrame()
    new_df['ids'], new_df['ground_truth'], new_df['predicted'] = test_ids, true_labels, predictions
    new_df.to_csv(str('error_analysis/'+model_name+'.csv'), index=False)  
	# Converting to csv
	# Removed transpose - check if actually required
    clsf_report = pd.DataFrame(classification_report(y_true = true_labels, y_pred = predictions, output_dict=True, target_names = ['fake', 'real']))
    clsf_report.to_csv(str('saved_models/'+model_name+'.csv'), index= True)

In [21]:
model_name = 'text_only_bert'
path_to_model = 'saved_models/class_contrast_visualbert_lr_3e-05_val_loss_0.35285_ep_100.pt'
evaluate(test_dataloader, model, model_name, path_to_model = path_to_model, load = False)

Evaluating on the testset
text_only_bert Text Only BERT Classification accuracy is
81.64825828377231
              precision    recall  f1-score   support

        fake       0.77      0.84      0.80      1031
        real       0.86      0.80      0.83      1323

    accuracy                           0.82      2354
   macro avg       0.81      0.82      0.82      2354
weighted avg       0.82      0.82      0.82      2354

