In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from models import *
from collections import namedtuple
import tqdm
import matplotlib.pyplot as plt
import torch.nn.utils.prune as prune
import tokenization
import models
import optim as optim
import train_org as train
from utils import set_seeds, get_device, truncate_tokens_pair
from classify import dataset_class, Tokenizing, AddSpecialTokensWithTruncation, TokenIndexing, Classifier
from classify_SVM import dataset_class, Tokenizing, AddSpecialTokensWithTruncation, TokenIndexing, Classifier_feats
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score





DATA LOADING

In [3]:
# Const Values for the file locations
task='mrpc'

train_cfg= f'config/train_{task}.json'
model_cfg='config/bert_base.json'

train_data_file = f'C:/Users/Vyshnavi S K/Downloads/GLUE-baselines-master/GLUE-baselines-master/glue_data/MRPC/train.tsv'
test_data_file=f'C:/Users/Vyshnavi S K/Downloads/GLUE-baselines-master/GLUE-baselines-master/glue_data/MRPC/dev.tsv'

pretrain_file = 'C:/Users/Vyshnavi S K/Downloads/pytorch-pretrained-BERT-master/pytorch-pretrained-BERT-master/uncased_L-12_H-768_A-12/bert_model.ckpt'

In [4]:
# Const values for the network 
max_len=128

#vocab='./PRE_TRAINED_MODEL/vocab.txt'
vocab = 'C:/Users/Vyshnavi S K/Downloads/vocab.txt'
save_dir = './SAVE'
model_file = 'C:/Users/Vyshnavi S K/Downloads/finetuned_mrpc'

In [5]:
cfg = train.Config.from_json(train_cfg)
model_cfg = models.Config.from_json(model_cfg)
set_seeds(cfg.seed)

In [6]:
model_cfg

Config(vocab_size=30522, dim=768, n_layers=12, n_heads=12, dim_ff=3072, p_drop_hidden=0.1, p_drop_attn=0.1, max_len=512, n_segments=2)

PREPROCESSING - TOKENIZATION

In [7]:
tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
TaskDataset = dataset_class(task) # task dataset class according to the task
pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            AddSpecialTokensWithTruncation(max_len),
            TokenIndexing(tokenizer.convert_tokens_to_ids,
                            TaskDataset.labels, max_len)]

In [8]:
train_dataset = TaskDataset(train_data_file, pipeline)
test_dataset = TaskDataset(test_data_file, pipeline)
train_data_iter = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_data_iter = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [9]:
print(len(train_dataset),len(test_dataset))

3668 408


MODEL - BERT - pretrained

In [10]:
def evaluate(model, batch):
    input_ids, segment_ids, input_mask, label_id = batch
    logits = model(input_ids, segment_ids, input_mask)
    _, label_pred = logits.max(1)
    result = (label_pred == label_id).float() #.cpu().numpy()
    accuracy = result.mean()
    return accuracy, result

In [11]:
def get_loss(model, batch, global_step): # make sure loss is a scalar tensor
    input_ids, segment_ids, input_mask, label_id = batch
    logits = model(input_ids, segment_ids, input_mask)
    loss = criterion(logits, label_id)
    return loss

criterion = nn.CrossEntropyLoss()

In [12]:
model = Classifier(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [13]:
results = trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=1.000): 100%|██████████| 102/102 [00:18<00:00,  5.46it/s]


0.867647111415863


DIFFERENT CLASSIFIERS

In [14]:
#loading features from BERT model
model_feats = Classifier_feats(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model_feats, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model_feats),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [15]:
def extract_features(model, data_iter, device):
    all_features = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        for batch in data_iter:
            input_ids, segment_ids, input_mask, label_id = [tensor.to(device) for tensor in batch]
            features = model(input_ids, segment_ids, input_mask)
            all_features.append(features.cpu().numpy())
            all_labels.append(label_id.cpu().numpy())
    all_features = np.concatenate(all_features, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    return all_features, all_labels


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_features, train_labels = extract_features(model_feats, train_data_iter, device)
test_features, test_labels = extract_features(model_feats, test_data_iter, device)


In [17]:
def train_classifier(features, labels,classifier):
    if classifier=='svm':
        svm = SVC(kernel='linear',random_state=42)
        svm.fit(features, labels)
        return svm
    elif classifier=='rf':
        rf = RandomForestClassifier(random_state = 42)
        rf.fit(features, labels)
        return rf
    elif classifier=='boosting':
        adb = AdaBoostClassifier(n_estimators=100, random_state = 42)
        adb.fit(features, labels)
        return adb
    else:
        gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
        gb.fit(features,labels)
        return gb
    

def evaluate_classifier(classifier, features, labels):
    predictions = classifier.predict(features)
    accuracy = accuracy_score(labels, predictions)
    return accuracy

In [18]:
svm_classifier = train_classifier(train_features, train_labels, 'svm')
rf_classifier = train_classifier(train_features, train_labels, 'rf')
adb_classifier = train_classifier(train_features, train_labels, 'boosting')
gb_classifier = train_classifier(train_features, train_labels, 'gb')

svm_accuracy = evaluate_classifier(svm_classifier, test_features, test_labels)
rf_accuracy = evaluate_classifier(rf_classifier, test_features, test_labels)
adb_accuracy = evaluate_classifier(adb_classifier, test_features, test_labels)
gb_accuracy = evaluate_classifier(gb_classifier,test_features, test_labels)

# Print accuracies
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("AdaBoost Accuracy:", adb_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)




SVM Accuracy: 0.875
Random Forest Accuracy: 0.8676470588235294
AdaBoost Accuracy: 0.8529411764705882
Gradient Boosting Accuracy: 0.8676470588235294


FULLY CONNECTED COMPRESSION

In [19]:
def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [20]:
class FCCompression(nn.Module):
    def __init__(self, cfg, k):
        super().__init__()
       
        self.fc1_u = nn.Linear(768, k, bias = False)
        self.fc1_vs = nn.Linear(k, 3072)
        
        self.fc2_u = nn.Linear(3072, k)
        self.fc2_vs = nn.Linear(k, 768)

    def forward(self, x):
        out = gelu(self.fc1_vs(self.fc1_u(x)))
        out = self.fc2_vs(self.fc2_u(out))
        return out

In [21]:
def decompose_matrix(mat, k):
    U, s, VT = np.linalg.svd(mat, full_matrices=False)
    Sigma = np.diag(s[:k])
    U_truncated = U[:, :k]
    VT_truncated = VT[:k, :]
    return U_truncated, Sigma, VT_truncated


In [22]:
def decomp_model_func(decomp_trainer, rank):
    for b in range(len(trainer.model.transformer.blocks)):
        
        block = trainer.model.transformer.blocks[b]
        target_block = decomp_trainer.model.transformer.blocks[b]

        target_block.pwff = FCCompression(model_cfg, rank[b])
        
        fc1_U, fc1_Sigma, fc1_V = decompose_matrix(block.pwff.fc1.weight.detach().cpu().numpy().T, rank[b])
        target_block.pwff.fc1_u.weight.data = torch.from_numpy(fc1_U).T.cuda().contiguous()
        target_block.pwff.fc1_vs.weight.data = torch.from_numpy(fc1_Sigma@fc1_V).T.cuda().contiguous()
        target_block.pwff.fc1_vs.bias.data = block.pwff.fc1.bias
        
        fc2_U, fc2_Sigma, fc2_V = decompose_matrix(block.pwff.fc2.weight.detach().cpu().numpy().T, rank[b])
        target_block.pwff.fc2_u.weight.data = torch.from_numpy(fc2_U).T.cuda().contiguous()
        target_block.pwff.fc2_vs.weight.data = torch.from_numpy(fc2_Sigma@fc2_V).T.cuda().contiguous()
        target_block.pwff.fc2_vs.bias.data = block.pwff.fc2.bias
        

In [23]:
model = Classifier(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [24]:
decomp_model = Classifier(model_cfg, len(TaskDataset.labels)).cuda()
decomp_trainer = train.Trainer(cfg, decomp_model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, decomp_model),save_dir, get_device())
decomp_trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [25]:
rank = [300]*12
rank[0] = rank[1] = 384
decomp_model_func(decomp_trainer, rank)

In [26]:
compression = 0
for r in rank:
    compression += (3072*r + 768*r)

compression = 1 - compression/ (12*768*3072)
print(compression)

0.48893229166666663


In [27]:
#fine tune after compression - TODO


In [28]:
# Model performance after FC Compression
results = decomp_trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=0.750): 100%|██████████| 102/102 [00:16<00:00,  6.09it/s]


0.7279412150382996


CLASSIFIER

In [29]:
#loading features from BERT model
model_feats = Classifier_feats(model_cfg, len(TaskDataset.labels)).cuda()
trainer = train.Trainer(cfg, model_feats, train_data_iter, test_data_iter, optim.optim4GPU(cfg, model_feats),save_dir, get_device())
trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [30]:
decomp_model = Classifier(model_cfg, len(TaskDataset.labels)).cuda()
decomp_trainer = train.Trainer(cfg, decomp_model, train_data_iter, test_data_iter, optim.optim4GPU(cfg, decomp_model),save_dir, get_device())
decomp_trainer.model.load_state_dict(torch.load(model_file))

cuda (1 GPUs)


<All keys matched successfully>

In [31]:
rank = [300]*12
rank[0] = rank[1] = 384
decomp_model_func(decomp_trainer, rank)

In [32]:
compression = 0
for r in rank:
    compression += (3072*r + 768*r)

compression = 1 - compression/ (12*768*3072)
print(compression)

0.48893229166666663


In [None]:
#FINETUNE _ TODO

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
decomp_model.to(device)
train_features, train_labels = extract_features(decomp_model, train_data_iter, device)
test_features, test_labels = extract_features(decomp_model, test_data_iter, device)


In [34]:
svm_classifier = train_classifier(train_features, train_labels, 'svm')
rf_classifier = train_classifier(train_features, train_labels, 'rf')
adb_classifier = train_classifier(train_features, train_labels, 'boosting')
gb_classifier = train_classifier(train_features, train_labels, 'gb')

svm_accuracy = evaluate_classifier(svm_classifier, test_features, test_labels)
rf_accuracy = evaluate_classifier(rf_classifier, test_features, test_labels)
adb_accuracy = evaluate_classifier(adb_classifier, test_features, test_labels)
gb_accuracy = evaluate_classifier(gb_classifier,test_features, test_labels)

# Print accuracies
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("AdaBoost Accuracy:", adb_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)




SVM Accuracy: 0.7230392156862745
Random Forest Accuracy: 0.7034313725490197
AdaBoost Accuracy: 0.7205882352941176
Gradient Boosting Accuracy: 0.7107843137254902


ATTENTION

In [35]:
class MultiHeadProjection(nn.Module):
    """ Multi-Headed Dot Product Attention """
    def __init__(self, cfg, k):
        super().__init__()
        
        self.drop = nn.Dropout(cfg.p_drop_attn)
        
        self.proj_q_u = nn.Linear(768,k)
        self.proj_q_vs = nn.Linear(k, 768, bias = False)
        
        self.proj_k_u = nn.Linear(768,k)
        self.proj_k_vs = nn.Linear(k, 768, bias = False)
        
        self.proj_v_u = nn.Linear(768,k)
        self.proj_v_vs = nn.Linear(k, 768, bias = False)
            
    def forward(self, x, mask):
        """
        x, q(query), k(key), v(value) : (B(batch_size), S(seq_len), D(dim))
        mask : (B(batch_size) x S(seq_len))
        * split D(dim) into (H(n_heads), W(width of head)) ; D = H * W
        """
    
        q = self.proj_q_u(self.proj_q_vs(x))
        k = self.proj_k_u(self.proj_k_vs(x))
        v = self.proj_v_u(self.proj_v_vs(x))
        
        q, k, v = (split_last(x, (12, -1)).transpose(1, 2) for x in [q, k, v])
        
        scores = q @ k.transpose(-2, -1) / np.sqrt(k.size(-1))
        
        if mask is not None:
            mask = mask[:, None, None, :].float()
            scores -= 10000.0 * (1.0 - mask)

        scores = self.drop(F.softmax(scores, dim=-1))
        h = (scores @ v).transpose(1, 2).contiguous()
        h = merge_last(h, 2)
        return h

In [36]:
def decomp_proj(decom_trainer, rank):
    for b in range(len(trainer.model.transformer.blocks)):
        
        block = trainer.model.transformer.blocks[b]
        target_block = decomp_trainer.model.transformer.blocks[b]

        target_block.attn = MultiHeadProjection(model_cfg, rank[b]) #, thres = decomp_trainer.model.thres
        
        proj_q_u, proj_q_s, proj_q_v = decompose_matrix(block.attn.proj_q.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_q_u.weight.data = torch.from_numpy(proj_q_u).cuda().contiguous()
        target_block.attn.proj_q_vs.weight.data = torch.from_numpy(proj_q_s@proj_q_v).cuda().contiguous()
        target_block.attn.proj_q_u.bias.data = block.attn.proj_q.bias
        
        proj_k_u, proj_k_s, proj_k_v = decompose_matrix(block.attn.proj_k.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_k_u.weight.data = torch.from_numpy(proj_k_u).cuda().contiguous()
        target_block.attn.proj_k_vs.weight.data = torch.from_numpy(proj_k_s@proj_k_v).cuda().contiguous()
        target_block.attn.proj_k_u.bias.data = block.attn.proj_k.bias
        
        proj_v_u, proj_v_s, proj_v_v = decompose_matrix(block.attn.proj_v.weight.T.detach().cpu().numpy().T, rank[b])
        target_block.attn.proj_v_u.weight.data = torch.from_numpy(proj_v_u).cuda().contiguous()
        target_block.attn.proj_v_vs.weight.data = torch.from_numpy(proj_v_s@proj_v_v).cuda().contiguous()
        target_block.attn.proj_v_u.bias.data = block.attn.proj_v.bias

In [37]:
rank = [225]*12
rank[0] = rank[1] = 384
decomp_proj(decomp_trainer, rank)

In [38]:
cr = [(2*x*768) / (768*768) for x in rank]
print(1 - np.mean(cr))

0.34505208333333337


In [None]:
#FINETUNE - TODO

In [39]:
# Model performance after Attn Compression
results = decomp_trainer.eval(evaluate)
total_accuracy = torch.cat(results).mean().item()
print(total_accuracy)

Iter(acc=0.750): 100%|██████████| 102/102 [00:16<00:00,  6.21it/s]


0.6887255311012268


In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
decomp_model.to(device)
train_features, train_labels = extract_features(decomp_model, train_data_iter, device)
test_features, test_labels = extract_features(decomp_model, test_data_iter, device)


In [45]:
svm_classifier = train_classifier(train_features, train_labels, 'svm')
rf_classifier = train_classifier(train_features, train_labels, 'rf')
adb_classifier = train_classifier(train_features, train_labels, 'boosting')
gb_classifier = train_classifier(train_features, train_labels, 'gb')

svm_accuracy = evaluate_classifier(svm_classifier, test_features, test_labels)
rf_accuracy = evaluate_classifier(rf_classifier, test_features, test_labels)
adb_accuracy = evaluate_classifier(adb_classifier, test_features, test_labels)
gb_accuracy = evaluate_classifier(gb_classifier,test_features, test_labels)

# Print accuracies
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("AdaBoost Accuracy:", adb_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)




SVM Accuracy: 0.6838235294117647
Random Forest Accuracy: 0.6102941176470589
AdaBoost Accuracy: 0.6862745098039216
Gradient Boosting Accuracy: 0.6911764705882353


TUNING CLASSIFIERS

In [46]:
from sklearn.model_selection import GridSearchCV

def tune_classifiers(classifier, param_grid, features, labels):
    grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=3, scoring='accuracy')
    grid_search.fit(features, labels)
    return grid_search.best_estimator_

def tune_and_train_classifier(features, labels, classifier, kernels=None):
    if classifier == 'svm':
        if kernels is None:
            kernels = ['linear', 'poly', 'rbf', 'sigmoid']
        
        param_grid = {'C': [0.1, 1, 10]}
        
        if 'poly' in kernels:
            param_grid['degree'] = [2, 3, 4]
        if 'rbf' in kernels:
            param_grid['gamma'] = ['scale', 'auto']
        if 'sigmoid' in kernels:
            param_grid['coef0'] = [0.0, 0.5, 1.0]
        
        tuned_classifier = tune_classifiers(SVC(), param_grid, features, labels)
    elif classifier == 'rf':
        param_grid = {'n_estimators': [50, 100, 200]}
        tuned_classifier = tune_classifiers(RandomForestClassifier(random_state=42), param_grid, features, labels)
    elif classifier == 'boosting':
        param_grid = {'n_estimators': [50, 100, 200]}
        tuned_classifier = tune_classifiers(AdaBoostClassifier(random_state=42), param_grid, features, labels)
    else:
        param_grid = {'n_estimators': [50, 100, 200]}
        tuned_classifier = tune_classifiers(GradientBoostingClassifier(random_state=42), param_grid, features, labels)
    
    tuned_classifier.fit(features, labels)
    return tuned_classifier


In [47]:
tuned_svm_classifier = tune_and_train_classifier(train_features, train_labels, 'svm', kernels=['linear', 'poly', 'rbf', 'sigmoid'])
svm_accuracy = evaluate_classifier(tuned_svm_classifier, test_features, test_labels)
print(svm_accuracy)

0.6936274509803921


In [48]:
tuned_rf_classifier = tune_and_train_classifier(train_features, train_labels, 'rf')
rf_accuracy = evaluate_classifier(tuned_rf_classifier, test_features, test_labels)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.6029411764705882


In [49]:
tuned_adb_classifier = tune_and_train_classifier(train_features, train_labels, 'boosting')
adb_accuracy = evaluate_classifier(tuned_adb_classifier, test_features, test_labels)
print("AdaBoost Accuracy:", adb_accuracy)




AdaBoost Accuracy: 0.6838235294117647


In [50]:
tuned_gb_classifier = tune_and_train_classifier(train_features, train_labels, 'gb')
gb_accuracy = evaluate_classifier(tuned_gb_classifier, test_features, test_labels)
print("Gradient Boosting Accuracy:", gb_accuracy)

Gradient Boosting Accuracy: 0.6887254901960784
