In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from models import *
from collections import namedtuple
import tqdm
import matplotlib.pyplot as plt
import torch.nn.utils.prune as prune


import tokenization
import models
import optim as optim
import train_org as train
from utils import set_seeds, get_device, truncate_tokens_pair
from classify import dataset_class, Tokenizing, AddSpecialTokensWithTruncation, TokenIndexing, Classifier


2024-02-24 12:53:39.211333: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-24 12:53:39.275955: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Const Values

In [3]:
# Const Values for the file locations
task='mrpc'

train_cfg= f'config/train_{task}.json'
model_cfg='config/bert_base.json'

train_data_file = f'/home/common_algorithm/dataset/GLUE/{task.upper()}/train.tsv'
test_data_file=f'/home/common_algorithm/dataset/GLUE/{task.upper()}/dev.tsv'

pretrain_file = './PRE_TRAINED_MODEL/bert_model.ckpt'

In [4]:
# Const values for the network 
max_len=128

#vocab='./PRE_TRAINED_MODEL/vocab.txt'
vocab = '/home/common_algorithm/checkpoints/GLUE/PRE_TRAINED_MODEL/vocab.txt'
save_dir = './SAVE'
model_file = '/home/prbhatnagar/Desktop/Final_Project (copy)/finetuned/finetuned_mrpc'

In [5]:
cfg = train.Config.from_json(train_cfg)
model_cfg = models.Config.from_json(model_cfg)
set_seeds(cfg.seed)

In [6]:
model_cfg

Config(vocab_size=30522, dim=768, n_layers=12, n_heads=12, dim_ff=3072, p_drop_hidden=0.1, p_drop_attn=0.1, max_len=512, n_segments=2)

In [7]:
tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
TaskDataset = dataset_class(task) # task dataset class according to the task
pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            AddSpecialTokensWithTruncation(max_len),
            TokenIndexing(tokenizer.convert_tokens_to_ids,
                            TaskDataset.labels, max_len)]

In [8]:
train_dataset = TaskDataset(train_data_file, pipeline)
test_dataset = TaskDataset(test_data_file, pipeline)
train_data_iter = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_data_iter = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Model Part

In [9]:
def evaluate(model, batch):
    input_ids, segment_ids, input_mask, label_id = batch
    logits = model(input_ids, segment_ids, input_mask)
    _, label_pred = logits.max(1)
    result = (label_pred == label_id).float() #.cpu().numpy()
    accuracy = result.mean()
    return accuracy, result

In [10]:
def get_loss(model, batch, global_step): # make sure loss is a scalar tensor
    input_ids, segment_ids, input_mask, label_id = batch
    logits = model(input_ids, segment_ids, input_mask)
    loss = criterion(logits, label_id)
    return loss

criterion = nn.CrossEntropyLoss()

In [11]:
model = Classifier(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (2 GPUs)


<All keys matched successfully>

In [12]:
decomp_model = Classifier(model_cfg, len(TaskDataset.labels)).cuda()
decomp_trainer = train.Trainer(cfg, decomp_model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, decomp_model),save_dir, get_device())
decomp_trainer.model.load_state_dict(torch.load(model_file))

cuda (2 GPUs)


<All keys matched successfully>

In [13]:
results = trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=1.000): 100%|████████████████████████| 102/102 [00:03<00:00, 30.87it/s]

0.867647111415863





### Fully Connected Compression

In [14]:
def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [15]:
class FCCompression(nn.Module):
    def __init__(self, cfg, k):
        super().__init__()
       
        self.fc1_u = nn.Linear(768, k, bias = False)
        self.fc1_vs = nn.Linear(k, 3072)
        
        self.fc2_u = nn.Linear(3072, k)
        self.fc2_vs = nn.Linear(k, 768)

    def forward(self, x):
        out = gelu(self.fc1_vs(self.fc1_u(x)))
        out = self.fc2_vs(self.fc2_u(out))
        return out

In [16]:
def decompose_matrix(mat, k):
    U, s, VT = np.linalg.svd(mat, full_matrices=False)
    Sigma = np.diag(s[:k])
    U_truncated = U[:, :k]
    VT_truncated = VT[:k, :]
    return U_truncated, Sigma, VT_truncated


In [17]:
def decomp_model_func(decomp_trainer, rank):
    for b in range(len(trainer.model.transformer.blocks)):
        
        block = trainer.model.transformer.blocks[b]
        target_block = decomp_trainer.model.transformer.blocks[b]

        target_block.pwff = FCCompression(model_cfg, rank[b])
        
        fc1_U, fc1_Sigma, fc1_V = decompose_matrix(block.pwff.fc1.weight.detach().cpu().numpy().T, rank[b])
        target_block.pwff.fc1_u.weight.data = torch.from_numpy(fc1_U).T.cuda().contiguous()
        target_block.pwff.fc1_vs.weight.data = torch.from_numpy(fc1_Sigma@fc1_V).T.cuda().contiguous()
        target_block.pwff.fc1_vs.bias.data = block.pwff.fc1.bias
        
        fc2_U, fc2_Sigma, fc2_V = decompose_matrix(block.pwff.fc2.weight.detach().cpu().numpy().T, rank[b])
        target_block.pwff.fc2_u.weight.data = torch.from_numpy(fc2_U).T.cuda().contiguous()
        target_block.pwff.fc2_vs.weight.data = torch.from_numpy(fc2_Sigma@fc2_V).T.cuda().contiguous()
        target_block.pwff.fc2_vs.bias.data = block.pwff.fc2.bias
        

In [18]:
rank = [400]*12
decomp_model_func(decomp_trainer, rank)

In [19]:
compression = 0
for r in rank:
    compression += (3072*r + 768*r)

compression = 1 - compression/ (12*768*3072)
print(compression)

0.34895833333333337


In [20]:
# Model performance after FC Compression
results = decomp_trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=0.750): 100%|████████████████████████| 102/102 [00:02<00:00, 42.27it/s]

0.7524510025978088





## Attention

In [26]:
class MultiHeadProjection(nn.Module):
    """ Multi-Headed Dot Product Attention """
    def __init__(self, cfg, k):
        super().__init__()
        
        self.drop = nn.Dropout(cfg.p_drop_attn)
        
        self.proj_q_u = nn.Linear(768,k)
        self.proj_q_vs = nn.Linear(k, 768, bias = False)
        
        self.proj_k_u = nn.Linear(768,k)
        self.proj_k_vs = nn.Linear(k, 768, bias = False)
        
        self.proj_v_u = nn.Linear(768,k)
        self.proj_v_vs = nn.Linear(k, 768, bias = False)
            
    def forward(self, x, mask):
        """
        x, q(query), k(key), v(value) : (B(batch_size), S(seq_len), D(dim))
        mask : (B(batch_size) x S(seq_len))
        * split D(dim) into (H(n_heads), W(width of head)) ; D = H * W
        """
    
        q = self.proj_q_u(self.proj_q_vs(x))
        k = self.proj_k_u(self.proj_k_vs(x))
        v = self.proj_v_u(self.proj_v_vs(x))
        
        q, k, v = (split_last(x, (12, -1)).transpose(1, 2) for x in [q, k, v])
        
        scores = q @ k.transpose(-2, -1) / np.sqrt(k.size(-1))
        
        if mask is not None:
            mask = mask[:, None, None, :].float()
            scores -= 10000.0 * (1.0 - mask)

        scores = self.drop(F.softmax(scores, dim=-1))
        h = (scores @ v).transpose(1, 2).contiguous()
        h = merge_last(h, 2)
        return h

In [27]:
def decomp_proj(decom_trainer, rank):
    for b in range(len(trainer.model.transformer.blocks)):
        
        block = trainer.model.transformer.blocks[b]
        target_block = decomp_trainer.model.transformer.blocks[b]

        target_block.attn = MultiHeadProjection(model_cfg, rank[b]) #, thres = decomp_trainer.model.thres
        
        proj_q_u, proj_q_s, proj_q_v = decompose_matrix(block.attn.proj_q.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_q_u.weight.data = torch.from_numpy(proj_q_u).cuda().contiguous()
        target_block.attn.proj_q_vs.weight.data = torch.from_numpy(proj_q_s@proj_q_v).cuda().contiguous()
        target_block.attn.proj_q_u.bias.data = block.attn.proj_q.bias
        
        proj_k_u, proj_k_s, proj_k_v = decompose_matrix(block.attn.proj_k.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_k_u.weight.data = torch.from_numpy(proj_k_u).cuda().contiguous()
        target_block.attn.proj_k_vs.weight.data = torch.from_numpy(proj_k_s@proj_k_v).cuda().contiguous()
        target_block.attn.proj_k_u.bias.data = block.attn.proj_k.bias
        
        proj_v_u, proj_v_s, proj_v_v = decompose_matrix(block.attn.proj_v.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_v_u.weight.data = torch.from_numpy(proj_v_u).cuda().contiguous()
        target_block.attn.proj_v_vs.weight.data = torch.from_numpy(proj_v_s@proj_v_v).cuda().contiguous()
        target_block.attn.proj_v_u.bias.data = block.attn.proj_v.bias

In [28]:
rank = [300]*12
decomp_proj(decomp_trainer, rank)

In [29]:
cr = [(2*x*768) / (768*768) for x in rank]
print(1 - np.mean(cr))

0.21875


In [30]:
# Model performance after Attn Compression
results = decomp_trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=0.750): 100%|████████████████████████| 102/102 [00:02<00:00, 37.99it/s]

0.7034313678741455





In [35]:
decomp_trainer.model.transformer.blocks[0].attn

MultiHeadProjection(
  (drop): Dropout(p=0.1, inplace=False)
  (proj_q_u): Linear(in_features=768, out_features=300, bias=True)
  (proj_q_vs): Linear(in_features=300, out_features=768, bias=False)
  (proj_k_u): Linear(in_features=768, out_features=300, bias=True)
  (proj_k_vs): Linear(in_features=300, out_features=768, bias=False)
  (proj_v_u): Linear(in_features=768, out_features=300, bias=True)
  (proj_v_vs): Linear(in_features=300, out_features=768, bias=False)
)

In [36]:
decomp_trainer.train(get_loss, n_epochs = 3)

Iter (loss=0.057): 100%|██████████████████████| 230/230 [00:34<00:00,  6.61it/s]


Epoch 1/3 : Average Loss 0.189


Iter (loss=0.007): 100%|██████████████████████| 230/230 [00:31<00:00,  7.37it/s]


Epoch 2/3 : Average Loss 0.143


Iter (loss=0.003): 100%|██████████████████████| 230/230 [00:30<00:00,  7.43it/s]


Epoch 3/3 : Average Loss 0.101


In [37]:
# Model performance after Finetuning
results = decomp_trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=0.750): 100%|████████████████████████| 102/102 [00:02<00:00, 36.85it/s]

0.843137264251709



