In [None]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from transformers import AutoTokenizer,  AutoConfig, AutoModel, DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup, DistilBertModel
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from itertools import islice, count

In [None]:
# Run configuration shared by the cells below. Keys that the visible cells
# actually read: max_token_len, batch_size, n_epochs, early_stop_patience,
# dropout, tokenizer_model_name, base_dir, best_save_name, n_samp, eps and
# unc_rate. The remaining keys are kept unchanged.
params = {
    "max_token_len": 50,
    "batch_size": 512,
    "n_epochs": 15,
    "gpus": 1,
    "early_stop_patience": 2,
    "dropout": 0.2,
    "model_name": "roberta-base",
    "tokenizer_model_name": "roberta-base",
    "val_size": 0.2,
    # NOTE(review): the optimizer cell hard-codes lr=0.00005 and never reads
    # this value -- confirm which learning rate is intended.
    "lr": 1e-5,
    # Built with os.path.join so the checkpoint directory resolves on every
    # platform (a literal ".\base_dir" is only a relative path on Windows).
    "base_dir": os.path.join(".", "base_dir"),
    "base_save_name": "roberta_base_full",
    "best_save_name": "roberta_best_full.pt",
    "non_vendor_columns": ["ID", "vendor_id", 'Vendor_Name', 'Description', 'Cat Code Mod',
       'Category', 'desc_expanded', 'description_cleaned_raw',
       'description_cleaned', 'encoded_category'], #source
    "include_others": True,
    "others_percentage": 1.0,
    "n_samp": 10,   # Monte-Carlo samples passed to get_metrics
    "eps": 1e-6,    # added to the denominator in get_metrics
    "unc_rate": 1   # uncertainty metrics are logged every `unc_rate` training batches
}

In [None]:
# Reproducibility: seed PyTorch's and NumPy's global RNGs with one shared value.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Location of the training CSV. The historical absolute Windows path remains
# the default; set the TRAIN_DATA_CSV environment variable to point somewhere
# else without editing the notebook.
TRAIN_DATA_CSV = os.environ.get(
    "TRAIN_DATA_CSV",
    r"D:\workspace\Rishabh\Uncertainty_NN\Rebate_Data\2_non_vendor_7000_20.csv",
)
train_data = pd.read_csv(TRAIN_DATA_CSV)

# pd.factorize encodes missing values as -1; that sentinel would be counted as
# an extra class by nunique() below and is not a valid class-index target, so
# stop here with a clear message instead.
if train_data['encoded_category'].isna().any():
    raise ValueError("'encoded_category' contains missing values; cannot build class ids")

# 'GT' holds contiguous integer class ids (0..num_classes-1) assigned in order
# of first appearance of each distinct 'encoded_category' value.
train_data['GT'], _ = pd.factorize(train_data['encoded_category'])
num_classes = train_data['GT'].nunique()

In [None]:
# Load the pretrained tokenizer named by params['tokenizer_model_name'].
tokenizer = AutoTokenizer.from_pretrained(params['tokenizer_model_name'])
# Bare expression so the notebook shows the tokenizer's repr as the cell output.
tokenizer

In [None]:
#define the custom dataset class here
class TorchDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Each row must expose the attributes ``cleaned_desc`` (text to tokenize),
    ``GT`` (integer class id) and ``uid`` (integer row identifier).

    Args:
        data: pandas DataFrame holding the rows; indexed positionally via ``iloc``.
        tokenizer: object providing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_token_len: every encoding is padded/truncated to this many tokens.
    """

    def __init__(self, data, tokenizer, max_token_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return a dict with ``uid``, ``input_ids``, ``attention_mask`` and ``labels``.

        ``uid`` and ``labels`` are 0-dim int64 tensors; ``input_ids`` and
        ``attention_mask`` are the tokenizer's outputs flattened to 1-D.
        """
        data_row = self.data.iloc[index]

        raw_description = data_row.cleaned_desc
        labels = data_row.GT    #encoded_category
        uid = data_row.uid

        encoding = self.tokenizer.encode_plus(
            raw_description,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # Build the integer tensors directly as int64. Going through
        # torch.Tensor([...]) first would create a float32 tensor, which cannot
        # represent every integer above 2**24 and would silently alter large ids.
        return_item = dict(
            uid=torch.tensor(int(uid), dtype=torch.long),
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.tensor(int(labels), dtype=torch.long)
        )
        return return_item

In [29]:

class OriginalModel(nn.Module):
    """Three-layer fully connected classifier used as a stand-in base network.

    Args:
        in_features: size of each flattened input sample (default 784).
        hidden1: width of the first hidden layer (default 256).
        hidden2: width of the second hidden layer (default 128).
        num_classes: number of output logits (default 20).

    The defaults reproduce the fixed 784 -> 256 -> 128 -> 20 layout, so
    ``OriginalModel()`` keeps building the same network.
    """

    def __init__(self, in_features=784, hidden1=256, hidden2=128, num_classes=20):
        super(OriginalModel, self).__init__()
        # Registration order matters: code that inspects children() treats the
        # last registered child (fc3) as the output layer.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)  # output logits, one per class

    def forward(self, x):
        """Flatten every dimension after the batch axis and return raw logits."""
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class ExtendableNet(nn.Module):
    """Wrap a base network and add a parallel linear head on its pre-output features.

    The last registered child of ``base_model`` is treated as its output
    (logits) layer. ``extended_layer`` is a new ``nn.Linear`` that consumes the
    same tensor that output layer receives and emits the same number of
    features as the output layer.

    Args:
        base_model: an ``nn.Module`` whose last child exposes ``out_features``
            (and ``in_features`` when it is an ``nn.Linear``).
    """

    def __init__(self, base_model):
        super(ExtendableNet, self).__init__()
        self.base_model = base_model

        children = list(base_model.children())

        # All children except the output layer. Retained so the attribute and
        # the state_dict keys remain available; forward() does not use it,
        # because chaining children in an nn.Sequential skips any functional
        # ops (flatten, activations, ...) applied inside base_model.forward.
        self.pre_output_modules = nn.Sequential(*children[:-1])

        # New head: same input width as the output layer, same output width.
        self.extended_layer = nn.Linear(self.get_pre_output_features(),
                                        children[-1].out_features)

    def get_pre_output_features(self):
        """Return the feature size fed to the base model's last child.

        For an ``nn.Linear`` last child this is its ``in_features``; for any
        other module the value of its ``out_features`` attribute is used.
        """
        last_layer = list(self.base_model.children())[-1]
        if isinstance(last_layer, nn.Linear):
            return last_layer.in_features
        # Fallback for a non-linear last child; relies on it having `out_features`.
        return last_layer.out_features

    def forward(self, x):
        """Run the base model once and return ``(extended_output, base_output)``.

        The tensor entering the base model's output layer is captured with a
        temporary forward pre-hook, flattened to ``(batch, -1)`` and passed
        through ``extended_layer``.

        Raises:
            RuntimeError: if the output layer was not called with a positional
                input during the base model's forward pass.
        """
        output_layer = list(self.base_model.children())[-1]
        captured = []

        def _grab_input(module, inputs):
            # `inputs` is the tuple of positional arguments given to the layer.
            if inputs:
                captured.append(inputs[0])

        handle = output_layer.register_forward_pre_hook(_grab_input)
        try:
            base_output = self.base_model(x)
        finally:
            # Always detach the hook so the wrapped model is left untouched.
            handle.remove()

        if not captured:
            raise RuntimeError(
                "the base model's last child was not called during forward; "
                "cannot obtain pre-output features"
            )

        pre_output = captured[-1]
        pre_output = pre_output.reshape(pre_output.shape[0], -1)

        extended_output = self.extended_layer(pre_output)
        return extended_output, base_output

# Example usage: wrap the toy OriginalModel defined above and run one random sample through it.
base_model = OriginalModel()  # Replace with the actual model class
extended_model = ExtendableNet(base_model)

data = torch.randn(1,784)
# ExtendableNet.forward returns (extended_output, base_output), in that order.
extended_output1, extended_output2 = extended_model(data)

print("Extended Layer 1 Output:", extended_output1.shape)  # Shape of the extended layer's output
print("Extended Layer 2 Output:", extended_output2.shape)  # Shape of the base model's own output (its logits), despite the label

Extended Layer 1 Output: torch.Size([1, 20])
Extended Layer 2 Output: torch.Size([1, 20])


In [19]:
class ExtendedCELoss(nn.Module):
    """Cross-entropy averaged over Monte-Carlo draws of noisy logits.

    ``out`` is a pair ``(logit, sigma)``. Logits are sampled from a Normal
    distribution with mean ``logit`` and scale ``exp(sigma)``; the
    cross-entropy against ``y`` is computed for each of the ``n_samp`` draws
    and the mean of those values is returned.
    """
    def forward(self, out:torch.Tensor, y:torch.Tensor, n_samp:int=10) -> torch.Tensor:
        mean_logits, log_scale = out
        noise_model = torch.distributions.Normal(mean_logits, torch.exp(log_scale))
        # rsample keeps the draw differentiable w.r.t. both mean and scale.
        noisy_logits = noise_model.rsample((n_samp,))

        cross_entropy = nn.CrossEntropyLoss()
        # Accumulate sequentially from a float zero, one term per draw.
        summed = sum((cross_entropy(draw, y) for draw in noisy_logits), 0.)

        return summed / n_samp

20

In [None]:
def predict(model, x, n_samp:int=25, is_target = True):
    """Estimate model (epistemic) and data (aleatory) uncertainty for one batch.

    Args:
        model: callable module invoked as ``model(x[0], x[1])`` and returning a
            pair ``(logit, sigma)``.
        x (tuple): ``(input_ids, attention_mask)``.
        n_samp (int): number of samples used for the variance estimate. Use at
            least 2; the unbiased variance of a single sample is undefined.
        is_target (bool): if True, run the model once and draw ``n_samp``
            logit samples from ``Normal(logit, exp(sigma))``; otherwise run the
            model ``n_samp`` times on the same input and use the spread of
            those predictions.

    Returns:
        tuple ``(epistemic, aleatory)`` of tensors:
            epistemic: variance over the sample axis of ``sigmoid`` applied to
                the sampled (or repeated) logits.
            aleatory: ``exp(sigma)``; averaged over the repeated passes when
                ``is_target`` is False.
        With ``is_target=False`` both tensors are detached and moved to the CPU.
    """
    input_ids, attention_mask = x[0], x[1]

    if is_target:
        # Call the module itself (not .forward) so registered hooks run too.
        logit, sigma = model(input_ids, attention_mask)
        dist = torch.distributions.Normal(logit, torch.exp(sigma))
        mc_logs = dist.rsample((n_samp,))
        probits = torch.sigmoid(mc_logs)
        epistemic = probits.var(dim=0, unbiased=True)
        aleatory = torch.exp(sigma)
        return epistemic, aleatory

    # Repeated passes only differ when the model is stochastic at call time
    # (e.g. dropout active) -- presumably the intended use; verify with caller.
    out = [model(input_ids, attention_mask) for _ in range(n_samp)]
    logits = torch.stack([o[0] for o in out]).detach().cpu()   # stacked along a new leading sample axis
    sigmas = torch.stack([o[1] for o in out]).detach().cpu()
    probits = torch.sigmoid(logits)
    epistemic = probits.var(dim=0, unbiased=True)
    aleatory = torch.exp(sigmas).mean(dim=0)
    return epistemic, aleatory


def get_metrics(model, x, y, n_samp:int, eps:float):
    """Return batch-mean epistemic, aleatory and scibilic uncertainty.

    The model is switched to eval mode and gradients are disabled while
    ``predict`` runs; the model's previous train/eval mode is restored
    afterwards, including when ``predict`` raises.

    Args:
        model: module passed through to ``predict``.
        x (tuple): ``(input_ids, attention_mask)`` passed through to ``predict``.
        y: labels of the batch; accepted for call-site symmetry but not used.
        n_samp (int): number of samples forwarded to ``predict``.
        eps (float): added to the aleatory term before dividing, to avoid
            division by zero.

    Returns:
        tuple ``(eu, au, su)`` of NumPy scalars: the means of the epistemic
        values, the aleatory values and their ratio epistemic / (aleatory + eps).
    """
    state = model.training
    model.eval()
    try:
        with torch.no_grad():
            ep, al = predict(model, x, n_samp)
            sb = ep / (al + eps)
            eu, au, su = ep.cpu().numpy().mean(), al.cpu().numpy().mean(), sb.cpu().numpy().mean()
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(state)

    return eu, au, su

In [None]:
# Split on the `set` column: rows marked 'TRAIN' are used for fitting and every
# other row for validation.
train_batch = train_data[train_data["set"] == 'TRAIN']
val_batch = train_data[train_data["set"] != 'TRAIN']

# Both loaders share the tokenizer, token budget and batch size; only the
# training loader shuffles.
train_dataloader = DataLoader(
    TorchDataset(train_batch, tokenizer, max_token_len=params["max_token_len"]),
    batch_size=params["batch_size"],
    shuffle=True,
)
val_dataloader = DataLoader(
    TorchDataset(val_batch, tokenizer, max_token_len=params["max_token_len"]),
    batch_size=params["batch_size"],
)

In [None]:
# Resolve the compute device again (same expression as in the seeding cell).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Loss that expects the model output to be a (logit, sigma) pair.
criterion = ExtendedCELoss()
# NOTE(review): ExtendableNet as defined in this notebook takes a single
# `base_model` argument and its forward takes one tensor `x`, whereas this call
# passes (num_classes, dropout) and the training loop calls
# model(input_ids, attention_mask). Presumably a different model class was
# intended here -- confirm before running from a fresh kernel.
model = ExtendableNet(num_classes, params["dropout"])
# NOTE(review): the learning rate is hard-coded; params["lr"] (1e-5) is not used.
optimizer = Adam(model.parameters(), lr=0.00005)
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

### Model Training

In [None]:
# Histories filled by the training loop: one inner list of per-batch values is
# appended per epoch.
train_losses, valid_losses = [], []
# Training-side uncertainty histories (epistemic, aleatory, scibilic).
# t_en_unc is created here but the visible training loop never appends to it.
t_ep_unc, t_en_unc, t_al_unc, t_sb_unc = [], [], [], []
# Validation-side counterparts; v_en_unc is likewise never appended to.
v_ep_unc, v_en_unc, v_al_unc, v_sb_unc = [], [], [], []
# Number of batches the training DataLoader yields per epoch.
n_batches = len(train_dataloader)

In [None]:
# Early-stopping state: each epoch's validation hit count is compared with the
# previous epoch's; `patience` consecutive non-improving epochs end training.
last_val_acc = -1
patience = params["early_stop_patience"]
triggertimes = 0
best_val_acc = -1

# torch.save does not create missing directories, so make sure the checkpoint
# directory exists before the first checkpoint is written.
os.makedirs(params["base_dir"], exist_ok=True)

for t in range(1, params['n_epochs']+1):
    # ---------------- training ----------------
    t_losses, t_ep, t_al, t_sb = [], [], [], []
    total_acc_train = 0      # number of correct training predictions this epoch
    total_loss_train = 0.0   # sum over samples of the (batch-mean) loss
    total_acc_val = 0
    total_loss_val = 0.0
    avg_batch_acc = []

    model.train()

    for i,item_dict in enumerate(tqdm(train_dataloader)):
        train_label = item_dict['labels'].to(device)
        train_input_ids = item_dict['input_ids'].to(device)
        train_attention_mask = item_dict['attention_mask'].to(device)
        output = model(train_input_ids, train_attention_mask)
        out, var = output   # predictions are taken from the first element

        batch_loss = criterion(output, train_label)
        n_in_batch = train_label.size(0)

        # Accumulate a plain Python float: adding the tensor itself would keep
        # every batch's autograd history alive for the whole epoch. The batch
        # mean is weighted by batch size so that dividing by the number of
        # samples below yields a true per-sample average.
        loss_value = batch_loss.item()
        total_loss_train += loss_value * n_in_batch

        acc = (out.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        # Per-batch accuracy: correct predictions over this batch's size.
        avg_batch_acc.append(acc / n_in_batch)

        t_losses.append(loss_value)

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
        # Log uncertainty metrics every `unc_rate` batches.
        if i % params['unc_rate'] == 0:
            ep, al, sb = get_metrics(model, (train_input_ids, train_attention_mask), train_label, params['n_samp'], params['eps'])
            t_ep.append(ep)
            t_al.append(al); t_sb.append(sb)
    train_losses.append(t_losses)
    t_ep_unc.append(t_ep)
    t_al_unc.append(t_al); t_sb_unc.append(t_sb)

    print('********** VALIDATION STARTS **********')
    # ---------------- validation ----------------
    v_losses, v_ep, v_al, v_sb = [], [], [], []
    model.eval()
    with torch.no_grad():
        for val_item_dict in tqdm(val_dataloader):
            val_label = val_item_dict['labels'].to(device)
            val_input_ids = val_item_dict['input_ids'].to(device)
            val_attention_mask = val_item_dict['attention_mask'].to(device)

            output = model(val_input_ids, val_attention_mask)
            out, var = output

            batch_loss = criterion(output, val_label)
            loss_value = batch_loss.item()
            total_loss_val += loss_value * val_label.size(0)

            acc = (out.argmax(dim=1) == val_label).sum().item()
            total_acc_val += acc

            ep, al, sb = get_metrics(model, (val_input_ids, val_attention_mask), val_label, params['n_samp'], params['eps'])
            v_losses.append(loss_value)
            v_ep.append(ep)
            v_al.append(al); v_sb.append(sb)
        valid_losses.append(v_losses)
        v_ep_unc.append(v_ep)
        v_al_unc.append(v_al); v_sb_unc.append(v_sb)

    # Epoch-level averages over samples.
    train_loss = total_loss_train / len(train_batch)
    train_acc = total_acc_train / len(train_batch)
    val_loss = total_loss_val / len(val_batch)
    val_acc = total_acc_val / len(val_batch)

    print(f'Epochs: {t} | Train Loss: {train_loss: .3f} \
                | Train Accuracy: {train_acc: .3f} \
                | Val Loss: {val_loss: .3f} \
                | Val Accuracy: {val_acc: .3f}\
                | Avg Batch Acc: {np.mean(avg_batch_acc): .3f},  TL: {np.mean(t_losses):.3f}, VL: {np.mean(v_losses): .3f}, tEU: {np.mean(t_ep): .3f}, vEU: {np.mean(v_ep): .3f} | tAU: {np.mean(t_al): .3f}, vAU: {np.mean(v_al): .3f}')

    if not np.all(np.isfinite(t_losses)):
        raise RuntimeError('NaN or Inf in training loss, cannot recover. Exiting.')

    # One checkpoint payload per epoch, written to the per-epoch file and, when
    # validation accuracy improved on the best so far, to the "best" file too.
    checkpoint = {
        'epoch': t,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'Train loss': train_loss,
        'Train Accuracy': train_acc,
        'Val loss': val_loss,
        'Val Accuracy': val_acc,
        't_ep_unc': t_ep_unc, 't_al_unc': t_al_unc, 't_sb_unc': t_sb_unc,
        'v_ep_unc': v_ep_unc, 'v_al_unc': v_al_unc, 'v_sb_unc': v_sb_unc,
    }

    if total_acc_val > best_val_acc:
        best_val_acc = total_acc_val
        save_name = os.path.join(params["base_dir"], params["best_save_name"])
        torch.save(checkpoint, save_name)

    # Per-epoch file "<epoch>_<val accuracy>.pt". The accuracy is formatted
    # with '.3f' (no space flag) so the file name contains no blank.
    save_name = os.path.join(params["base_dir"], f"{t}_{val_acc:.3f}.pt")
    torch.save(checkpoint, save_name)

    # Early stopping on the raw count of correct validation predictions.
    if last_val_acc >= total_acc_val:
        print(last_val_acc, total_acc_val)
        triggertimes += 1
        print('Trigger Times:', triggertimes)

        if triggertimes >= patience:
            print('Early stopping!\n')
            break
    else:
        triggertimes = 0

    last_val_acc = total_acc_val

In [None]:
# Write every tracked history to its own text file in the current working
# directory, in a fixed order (training series first, then validation).
_history_files = (
    ('t_ep_unc.csv', t_ep_unc),
    ('t_al_unc.csv', t_al_unc),
    ('t_sb_unc.csv', t_sb_unc),
    ('train_losses.csv', train_losses),
    ('v_ep_unc.csv', v_ep_unc),
    ('v_al_unc.csv', v_al_unc),
    ('v_sb_unc.csv', v_sb_unc),
    ('valid_losses.csv', valid_losses),
)
for _csv_name, _history in _history_files:
    np.savetxt(_csv_name, _history)

In [None]:
def tidy_losses(train, valid):
    '''
    Build a long-format DataFrame of loss values.

    `train` and `valid` are sequences with one entry per epoch, each entry being
    an iterable of loss values. Epochs are numbered from 1 and paired
    positionally (extra epochs in the longer sequence are ignored). The result
    has the columns epoch, type (always 'loss'), value and phase
    ('train' or 'valid'); within an epoch the training rows come first.
    '''
    epochs, values, phases = [], [], []
    for epoch_no, per_phase in enumerate(zip(train, valid), start=1):
        for phase_name, series in zip(('train', 'valid'), per_phase):
            chunk = list(series)
            values.extend(chunk)
            epochs.extend([epoch_no] * len(chunk))
            phases.extend([phase_name] * len(chunk))

    columns = {
        'epoch': epochs,
        'type': ['loss'] * len(values),
        'value': values,
        'phase': phases,
    }
    return pd.DataFrame(columns)

def tidy_uncertainty(ep, al, sb):
    '''
    Build a long-format DataFrame of epistemic and aleatory uncertainty values.

    `ep`, `al` and `sb` are parallel sequences of per-phase histories: the first
    entry is labelled 'train', every later entry 'valid'. Each history holds one
    iterable of values per epoch (epochs numbered from 1). For every epoch the
    epistemic values are emitted first, then the aleatory ones. `sb` only takes
    part in the positional pairing; its values are not written to the result.
    Columns: epoch, type ('epistemic' / 'aleatory'), value, phase.
    '''
    epochs, kinds, values, phases = [], [], [], []

    def _emit(epoch_no, kind, phase_name, series):
        # Append one output row per value in `series`.
        for item in series:
            epochs.append(epoch_no)
            kinds.append(kind)
            values.append(item)
            phases.append(phase_name)

    for group_idx, (ep_hist, al_hist, sb_hist) in enumerate(zip(ep, al, sb)):
        phase_name = 'valid' if group_idx else 'train'
        per_epoch = zip(ep_hist, al_hist, sb_hist)
        for epoch_no, (ep_vals, al_vals, _unused_sb) in enumerate(per_epoch, start=1):
            _emit(epoch_no, 'epistemic', phase_name, ep_vals)
            _emit(epoch_no, 'aleatory', phase_name, al_vals)

    return pd.DataFrame({'epoch': epochs, 'type': kinds, 'value': values, 'phase': phases})

In [None]:
# Long-format loss table (one row per recorded batch loss) used for plotting.
losses = tidy_losses(train_losses, valid_losses)
# Long-format uncertainty table. Each argument is a (train, valid) pair of
# histories; tidy_uncertainty labels the first entry 'train' and the second 'valid'.
uncert = tidy_uncertainty((t_ep_unc, v_ep_unc), 
                          (t_al_unc, v_al_unc), 
                          (t_sb_unc, v_sb_unc))

In [None]:
# Loss per epoch, one line per phase (train / valid).
f, ax1 = plt.subplots(1,1,figsize=(12, 8),sharey=True)
sns.lineplot(x='epoch',y='value',hue='phase',data=losses,ax=ax1,lw=3);  # error-band option ci='sd' left disabled here
ax1.set_title('Losses')

In [None]:
# Plot the epistemic uncertainty per epoch, one line style per phase.
epistem_uncert = uncert[uncert['type']== 'epistemic']
f, ax1 = plt.subplots(1,1,figsize=(12,8),sharey=True) 
# NOTE(review): recent seaborn releases deprecate `ci` in favour of `errorbar` --
# confirm against the installed version.
sns.lineplot(x='epoch', y='value', style='phase', ci='sd',data= epistem_uncert, ax=ax1, lw=3)
ax1.set_title('Epistemic Uncertainty')

In [None]:
# Both uncertainty types on one axis: colour encodes the type, line style the phase.
f, ax1 = plt.subplots(1,1,figsize=(12,8),sharey=True) 
# Optional log-scaled y axis (disabled): if use_log: ax1.set(yscale='log')
# NOTE(review): recent seaborn releases deprecate `ci` in favour of `errorbar` --
# confirm against the installed version.
sns.lineplot(x='epoch',y='value',hue='type',style='phase',ci='sd',data=uncert,ax=ax1,lw=3);
ax1.set_title('Uncertainty')