In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!pip install fire

In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from models import *
from collections import namedtuple
import tqdm
import matplotlib.pyplot as plt
import torch.nn.utils.prune as prune
import tokenization
import models
import optim as optim
import train_org as train
from utils import set_seeds, get_device, truncate_tokens_pair
from classify import dataset_class, Tokenizing, AddSpecialTokensWithTruncation, TokenIndexing, Classifier
from classify_SVM import dataset_class, Tokenizing, AddSpecialTokensWithTruncation, TokenIndexing, Classifier_feats
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score


2024-02-29 23:40:21.824402: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Const Values

In [4]:
# Const Values for the file locations
task='mrpc'

train_cfg= f'config/train_{task}.json'
model_cfg='config/bert_base.json'

train_data_file = './train.tsv'
test_data_file='./dev.tsv'

pretrain_file = './bert_model.ckpt'

In [5]:
# Const values for the network 
max_len=128

#vocab='./PRE_TRAINED_MODEL/vocab.txt'
vocab = './vocab.txt'
save_dir = './'
model_file = './finetuned_mrpc'

In [6]:
cfg = train.Config.from_json(train_cfg)
model_cfg = models.Config.from_json(model_cfg)
set_seeds(cfg.seed)

In [7]:
model_cfg

Config(vocab_size=30522, dim=768, n_layers=12, n_heads=12, dim_ff=3072, p_drop_hidden=0.1, p_drop_attn=0.1, max_len=512, n_segments=2)

In [8]:
tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
TaskDataset = dataset_class(task) # task dataset class according to the task
pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            AddSpecialTokensWithTruncation(max_len),
            TokenIndexing(tokenizer.convert_tokens_to_ids,
                            TaskDataset.labels, max_len)]

In [9]:
train_dataset = TaskDataset(train_data_file, pipeline)
test_dataset = TaskDataset(test_data_file, pipeline)
train_data_iter = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_data_iter = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [10]:
n_components = 1
margin = 1.0

# Model Part

In [11]:
def evaluate(model, batch):
    input_ids, segment_ids, input_mask, label_id = batch
    logits = model(input_ids, segment_ids, input_mask)
    _, label_pred = logits.max(1)
    result = (label_pred == label_id).float() #.cpu().numpy()
    accuracy = result.mean()
    return accuracy, result

In [12]:
def get_loss(model, batch, global_step): # make sure loss is a scalar tensor
    model.cuda()
    input_ids, segment_ids, input_mask, label_id = batch
    logits = model(input_ids, segment_ids, input_mask)
    loss_function = LDA_Loss(n_components, margin)
    loss = loss_function(label_id, logits)
    return loss


class LDA_Loss(nn.Module):
    def __init__(self, n_components, margin):
        super(LDA_Loss, self).__init__()
        self.n_components = n_components
        self.margin = margin

    def forward(self, y_true, y_pred):
        r = 1e-4

        # Initialize groups
        groups = torch.unique(y_true)
      #  print(y_pred.shape)
        def compute_cov(group, Xt, yt):
            Xgt = Xt[yt == group]
            Xgt_bar = Xgt - torch.mean(Xgt, axis=0)
            m = float(Xgt_bar.shape[0])
            if m > 1:
                return (1.0 / (m - 1)) * torch.matmul(Xgt_bar.T, Xgt_bar)
            else:
                # If there's only one sample or no sample for this group,
                # return a zero matrix of the appropriate size
                return torch.zeros_like(torch.matmul(Xgt_bar.T, Xgt_bar))

        # Scan over groups
        covs_t = torch.stack([compute_cov(group, y_pred, y_true) for group in groups])

        # Compute average covariance matrix (within scatter)
        Sw_t = torch.mean(covs_t, axis=0)

        # Compute total scatter
        Xt_bar = y_pred - torch.mean(y_pred, axis=0)
        m = float(Xt_bar.shape[0])
        St_t = (1.0 / (m - 1)) * torch.matmul(Xt_bar.T, Xt_bar)

        # Compute between scatter
        Sb_t = St_t - Sw_t

        # Cope for numerical instability (regularize)
        Sw_t += torch.eye(Sw_t.shape[0], device=Sw_t.device) * r

        # Compute eigenvalues
        evals_t = torch.linalg.eigvalsh(Sb_t, UPLO='U')  # Use UPLO='U' for upper triangular portion

        # Get top eigenvalues
        top_k_evals = evals_t[-self.n_components:]

        # Maximize variance between classes
        thresh = torch.min(top_k_evals) + self.margin
        top_k_evals = top_k_evals[top_k_evals <= thresh]
        costs = torch.mean(top_k_evals)

        return -costs


criterion = LDA_Loss(n_components, margin)

In [13]:
model = Classifier_feats(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [14]:
trainer.train(get_loss, n_epochs = 10)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1519.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Iter (loss=-1013.036): 100%|██████████| 230/230 [01:00<00:00,  3.80it/s]


Epoch 1/3 : Average Loss -470.616


Iter (loss=-1046.926): 100%|██████████| 230/230 [01:00<00:00,  3.81it/s]


Epoch 2/3 : Average Loss -619.523


Iter (loss=-1060.568): 100%|██████████| 230/230 [01:00<00:00,  3.79it/s]


Epoch 3/3 : Average Loss -650.526


Iter (loss=-812.154):   0%|          | 1/230 [00:00<01:00,  3.76it/s]

Epoch 4/3 : Average Loss -812.154
The Total Steps have been reached.


Iter (loss=-1060.263): 100%|██████████| 230/230 [01:00<00:00,  3.78it/s]


Epoch 4/3 : Average Loss -657.427


Iter (loss=-1060.543): 100%|██████████| 230/230 [01:00<00:00,  3.78it/s]


Epoch 5/3 : Average Loss -654.804


Iter (loss=-1050.990): 100%|██████████| 230/230 [01:00<00:00,  3.78it/s]


Epoch 6/3 : Average Loss -651.203


Iter (loss=-771.283):   1%|          | 2/230 [00:00<00:59,  3.80it/s]

Epoch 7/3 : Average Loss -789.579
The Total Steps have been reached.


Iter (loss=-1049.748): 100%|██████████| 230/230 [01:00<00:00,  3.80it/s]


Epoch 7/3 : Average Loss -653.465


Iter (loss=-1036.728): 100%|██████████| 230/230 [01:00<00:00,  3.80it/s]


Epoch 8/3 : Average Loss -665.838


Iter (loss=-1066.565): 100%|██████████| 230/230 [01:00<00:00,  3.80it/s]


Epoch 9/3 : Average Loss -668.580


Iter (loss=-496.924):   1%|▏         | 3/230 [00:00<00:59,  3.81it/s]

Epoch 10/3 : Average Loss -690.969
The Total Steps have been reached.


Iter (loss=-1056.080): 100%|██████████| 230/230 [01:00<00:00,  3.81it/s]


Epoch 10/3 : Average Loss -667.721


In [24]:
def get_features_and_labels(data_iter, model):
    model.eval()
    all_features = []
    all_labels = []
    with torch.no_grad():
        for batch in data_iter:
            input_ids, segment_ids, input_mask, label_id = batch
            feats = model(input_ids.cuda(), segment_ids.cuda(), input_mask.cuda())
            all_features.append(feats.cpu())  
            all_labels.append(label_id.cpu())
    return torch.cat(all_features), torch.cat(all_labels)


In [15]:

# Get new features and labels for training data
x_train, y_train = get_features_and_labels(train_data_iter, model)

x_test, y_test = get_features_and_labels(test_data_iter, model)



In [18]:
x_train

tensor([[-1.0000, -1.0000,  1.0000,  ..., -1.0000, -1.0000,  1.0000],
        [ 1.0000,  1.0000, -1.0000,  ...,  1.0000,  1.0000, -1.0000],
        [-1.0000, -1.0000,  1.0000,  ..., -1.0000, -1.0000,  1.0000],
        ...,
        [-1.0000, -1.0000,  1.0000,  ..., -1.0000, -1.0000,  1.0000],
        [-1.0000, -1.0000,  1.0000,  ..., -1.0000, -1.0000,  1.0000],
        [ 1.0000,  1.0000, -1.0000,  ...,  1.0000,  1.0000, -1.0000]])

In [26]:
def train_classifier(features, labels,classifier):
    if classifier=='svm':
        svm = SVC(kernel='linear',random_state=42)
        svm.fit(features, labels)
        return svm
    elif classifier=='rf':
        rf = RandomForestClassifier(random_state = 42)
        rf.fit(features, labels)
        return rf
    elif classifier=='boosting':
        adb = AdaBoostClassifier(n_estimators=100, random_state = 42)
        adb.fit(features, labels)
        return adb
    else:
        gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
        gb.fit(features,labels)
        return gb
    

def evaluate_classifier(classifier, features, labels):
    predictions = classifier.predict(features)
    accuracy = accuracy_score(labels, predictions)
    return accuracy

In [21]:
svm_classifier = train_classifier(x_train, y_train, 'svm')
rf_classifier = train_classifier(x_train, y_train, 'rf')
adb_classifier = train_classifier(x_train, y_train, 'boosting')
gb_classifier = train_classifier(x_train, y_train, 'gb')

svm_accuracy = evaluate_classifier(svm_classifier, x_test, y_test)
rf_accuracy = evaluate_classifier(rf_classifier,  x_test, y_test)
adb_accuracy = evaluate_classifier(adb_classifier, x_test, y_test)
gb_accuracy = evaluate_classifier(gb_classifier, x_test, y_test)

# Print accuracies
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("AdaBoost Accuracy:", adb_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)

SVM Accuracy: 0.8382352941176471
Random Forest Accuracy: 0.8406862745098039
AdaBoost Accuracy: 0.8480392156862745
Gradient Boosting Accuracy: 0.8357843137254902


In [13]:
model = Classifier_feats(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [14]:
decomp_model = Classifier_feats(model_cfg, len(TaskDataset.labels)).cuda()
decomp_trainer = train.Trainer(cfg, decomp_model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, decomp_model),save_dir,'cuda')
decomp_trainer.model.load_state_dict(torch.load(model_file))

<All keys matched successfully>

### Fully Connected Compression

In [15]:
def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [16]:
class FCCompression(nn.Module):
    def __init__(self, cfg, k):
        super().__init__()
       
        self.fc1_u = nn.Linear(768, k, bias = False)
        self.fc1_vs = nn.Linear(k, 3072)
        
        self.fc2_u = nn.Linear(3072, k)
        self.fc2_vs = nn.Linear(k, 768)

    def forward(self, x):
        out = gelu(self.fc1_vs(self.fc1_u(x)))
        out = self.fc2_vs(self.fc2_u(out))
        return out

In [17]:
def decompose_matrix(mat, k):
    U, s, VT = np.linalg.svd(mat, full_matrices=False)
    Sigma = np.diag(s[:k])
    U_truncated = U[:, :k]
    VT_truncated = VT[:k, :]
    return U_truncated, Sigma, VT_truncated


In [18]:
def decomp_model_func(decomp_trainer, rank):
    for b in range(len(trainer.model.transformer.blocks)):
        
        block = trainer.model.transformer.blocks[b]
        target_block = decomp_trainer.model.transformer.blocks[b]

        target_block.pwff = FCCompression(model_cfg, rank[b])
        
        fc1_U, fc1_Sigma, fc1_V = decompose_matrix(block.pwff.fc1.weight.detach().cpu().numpy().T, rank[b])
        target_block.pwff.fc1_u.weight.data = torch.from_numpy(fc1_U).T.cuda().contiguous()
        target_block.pwff.fc1_vs.weight.data = torch.from_numpy(fc1_Sigma@fc1_V).T.cuda().contiguous()
        target_block.pwff.fc1_vs.bias.data = block.pwff.fc1.bias
        
        fc2_U, fc2_Sigma, fc2_V = decompose_matrix(block.pwff.fc2.weight.detach().cpu().numpy().T, rank[b])
        target_block.pwff.fc2_u.weight.data = torch.from_numpy(fc2_U).T.cuda().contiguous()
        target_block.pwff.fc2_vs.weight.data = torch.from_numpy(fc2_Sigma@fc2_V).T.cuda().contiguous()
        target_block.pwff.fc2_vs.bias.data = block.pwff.fc2.bias
        

In [19]:
rank = [300]*12
rank[0] = rank[1] = 384
decomp_model_func(decomp_trainer, rank)

In [20]:
compression = 0
for r in rank:
    compression += (3072*r + 768*r)

compression = 1 - compression/ (12*768*3072)
print(compression)

0.48893229166666663


In [21]:
print('hi')

hi


In [22]:
decomp_trainer.train(get_loss, n_epochs = 2)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1519.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Iter (loss=-920.346): 100%|██████████| 230/230 [00:51<00:00,  4.49it/s]


Epoch 1/3 : Average Loss -211.897


Iter (loss=-1028.436): 100%|██████████| 230/230 [00:51<00:00,  4.50it/s]


Epoch 2/3 : Average Loss -374.427


In [25]:
# Get features and labels for training data
x_train, y_train = get_features_and_labels(train_data_iter, decomp_model)

# Get features and labels for testing data
x_test, y_test = get_features_and_labels(test_data_iter, decomp_model)

In [27]:
svm_classifier = train_classifier(x_train, y_train, 'svm')
rf_classifier = train_classifier(x_train, y_train, 'rf')
adb_classifier = train_classifier(x_train, y_train, 'boosting')
gb_classifier = train_classifier(x_train, y_train, 'gb')

svm_accuracy = evaluate_classifier(svm_classifier, x_test, y_test)
rf_accuracy = evaluate_classifier(rf_classifier,  x_test, y_test)
adb_accuracy = evaluate_classifier(adb_classifier, x_test, y_test)
gb_accuracy = evaluate_classifier(gb_classifier, x_test, y_test)

# Print accuracies
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("AdaBoost Accuracy:", adb_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)

SVM Accuracy: 0.7843137254901961
Random Forest Accuracy: 0.8186274509803921
AdaBoost Accuracy: 0.8014705882352942
Gradient Boosting Accuracy: 0.821078431372549


## Attention

In [28]:
class MultiHeadProjection(nn.Module):
    """ Multi-Headed Dot Product Attention """
    def __init__(self, cfg, k):
        super().__init__()
        
        self.drop = nn.Dropout(cfg.p_drop_attn)
        
        self.proj_q_u = nn.Linear(768,k)
        self.proj_q_vs = nn.Linear(k, 768, bias = False)
        
        self.proj_k_u = nn.Linear(768,k)
        self.proj_k_vs = nn.Linear(k, 768, bias = False)
        
        self.proj_v_u = nn.Linear(768,k)
        self.proj_v_vs = nn.Linear(k, 768, bias = False)
            
    def forward(self, x, mask):
        """
        x, q(query), k(key), v(value) : (B(batch_size), S(seq_len), D(dim))
        mask : (B(batch_size) x S(seq_len))
        * split D(dim) into (H(n_heads), W(width of head)) ; D = H * W
        """
    
        q = self.proj_q_u(self.proj_q_vs(x))
        k = self.proj_k_u(self.proj_k_vs(x))
        v = self.proj_v_u(self.proj_v_vs(x))
        
        q, k, v = (split_last(x, (12, -1)).transpose(1, 2) for x in [q, k, v])
        
        scores = q @ k.transpose(-2, -1) / np.sqrt(k.size(-1))
        
        if mask is not None:
            mask = mask[:, None, None, :].float()
            scores -= 10000.0 * (1.0 - mask)

        scores = self.drop(F.softmax(scores, dim=-1))
        h = (scores @ v).transpose(1, 2).contiguous()
        h = merge_last(h, 2)
        return h

In [29]:
def decomp_proj(decom_trainer, rank):
    for b in range(len(trainer.model.transformer.blocks)):
        
        block = trainer.model.transformer.blocks[b]
        target_block = decomp_trainer.model.transformer.blocks[b]

        target_block.attn = MultiHeadProjection(model_cfg, rank[b]) #, thres = decomp_trainer.model.thres
        
        proj_q_u, proj_q_s, proj_q_v = decompose_matrix(block.attn.proj_q.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_q_u.weight.data = torch.from_numpy(proj_q_u).cuda().contiguous()
        target_block.attn.proj_q_vs.weight.data = torch.from_numpy(proj_q_s@proj_q_v).cuda().contiguous()
        target_block.attn.proj_q_u.bias.data = block.attn.proj_q.bias
        
        proj_k_u, proj_k_s, proj_k_v = decompose_matrix(block.attn.proj_k.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_k_u.weight.data = torch.from_numpy(proj_k_u).cuda().contiguous()
        target_block.attn.proj_k_vs.weight.data = torch.from_numpy(proj_k_s@proj_k_v).cuda().contiguous()
        target_block.attn.proj_k_u.bias.data = block.attn.proj_k.bias
        
        proj_v_u, proj_v_s, proj_v_v = decompose_matrix(block.attn.proj_v.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_v_u.weight.data = torch.from_numpy(proj_v_u).cuda().contiguous()
        target_block.attn.proj_v_vs.weight.data = torch.from_numpy(proj_v_s@proj_v_v).cuda().contiguous()
        target_block.attn.proj_v_u.bias.data = block.attn.proj_v.bias

In [31]:
rank = [225]*12
rank[0] = rank[1] = 384
decomp_proj(decomp_trainer, rank)

In [32]:
cr = [(2*x*768) / (768*768) for x in rank]
print(1 - np.mean(cr))

0.34505208333333337


In [33]:
decomp_trainer.train(get_loss, n_epochs = 2)

Iter (loss=-1033.359): 100%|██████████| 230/230 [00:47<00:00,  4.89it/s]


Epoch 1/3 : Average Loss -241.105


Iter (loss=-1049.900): 100%|██████████| 230/230 [00:47<00:00,  4.88it/s]


Epoch 2/3 : Average Loss -343.038


In [34]:
# Get features and labels for training data
x_train, y_train = get_features_and_labels(train_data_iter, decomp_model)

# Get features and labels for testing data
x_test, y_test = get_features_and_labels(test_data_iter, decomp_model)

In [35]:
x_train

tensor([[ 0.9963,  0.9951, -0.9919,  ...,  0.9969,  0.9944, -0.9975],
        [ 0.9955,  0.9959, -0.9959,  ...,  0.9968,  0.9969, -0.9974],
        [-0.9912, -0.9946,  0.9927,  ..., -0.9936, -0.9923,  0.9927],
        ...,
        [-0.9974, -0.9979,  0.9959,  ..., -0.9949, -0.9978,  0.9974],
        [-0.9960, -0.9965,  0.9932,  ..., -0.9931, -0.9963,  0.9963],
        [ 0.9966,  0.9971, -0.9973,  ...,  0.9976,  0.9977, -0.9981]])

In [36]:
svm_classifier = train_classifier(x_train, y_train, 'svm')
rf_classifier = train_classifier(x_train, y_train, 'rf')
adb_classifier = train_classifier(x_train, y_train, 'boosting')
gb_classifier = train_classifier(x_train, y_train, 'gb')

svm_accuracy = evaluate_classifier(svm_classifier, x_test, y_test)
rf_accuracy = evaluate_classifier(rf_classifier,  x_test, y_test)
adb_accuracy = evaluate_classifier(adb_classifier, x_test, y_test)
gb_accuracy = evaluate_classifier(gb_classifier, x_test, y_test)

# Print accuracies
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("AdaBoost Accuracy:", adb_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)

SVM Accuracy: 0.7745098039215687
Random Forest Accuracy: 0.7818627450980392
AdaBoost Accuracy: 0.7745098039215687
Gradient Boosting Accuracy: 0.7794117647058824
