# Imports

In [1]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration
from Datasets import Api_Dataset, Mbpp_Dataset
import torch
import numpy as np
from Datasets import Test_Dataset
from torch.utils.data import DataLoader
import pandas as pd
from utils import get_module_names, get_module_by_name
import torch.nn.utils.prune as prune
from utils import get_module_by_name, get_module_names, get_bleu_score, fine_tune
import torch.backends.cudnn as cudnn
import pickle
import os

# Load Model

In [2]:
model_path = 'final_model'
# Set up device: use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Your device is {device}.')
# Set up model and tokenizer (both are read from the same local directory).
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = model.to(device)
# NOTE(review): `device` is a torch.device, not a str, so this comparison
# appears never to be true and the branch looks dead. It is left untouched on
# purpose: changing it to `device.type == 'cuda'` would wrap the model in
# DataParallel, which prefixes every parameter name with 'module.' and would
# break the name-based lookups and the state-dict load that follow.
if device == 'cuda':
    model = torch.nn.DataParallel(model)
    cudnn.benchmark = True

exp_name = 'exp_0.2'
# map_location lets a checkpoint saved from a GPU session load on the CPU
# fallback selected above. weights_only=True restricts unpickling to tensors
# and containers; assumes the file is a plain state_dict (it is passed straight
# to load_state_dict) -- TODO confirm.
checkpoint = torch.load(
    f'./sparse_models/T5ForConditionalGeneration/{exp_name}/sparse_weights.pth',
    map_location=device,
    weights_only=True,
)
model.load_state_dict(checkpoint)

Your device is cuda.


  return torch.load(checkpoint_file, map_location="cpu")
  checkpoint = torch.load(f'./sparse_models/T5ForConditionalGeneration/{exp_name}/sparse_weights.pth')


<All keys matched successfully>

# Load Datasets and Data Loaders

## MBPP Dataset

In [3]:
# Train / validation splits: both use the same text and code length (128) and
# a batch size of 8; only the training loader shuffles.
mbpp_seq_len = 128
Mbpp_train_dataset = Mbpp_Dataset('mbpp_train.csv', text_length=mbpp_seq_len, code_length=mbpp_seq_len)
Mbpp_valid_dataset = Mbpp_Dataset('mbpp_valid.csv', text_length=mbpp_seq_len, code_length=mbpp_seq_len)
mbpp_train_loader = DataLoader(Mbpp_train_dataset, batch_size=8, shuffle=True)
mbpp_val_loader = DataLoader(Mbpp_valid_dataset, batch_size=8, shuffle=False)

# Test split: the same CSV backs both the prompt dataset and the DataFrame
# that is handed to get_bleu_score later in the notebook.
mbpp_test_csv = 'data/mbpp/mbpp_test.csv'
mbpp_test_dataset = Test_Dataset(
    data=mbpp_test_csv,
    task_prefix='Generate code from natural language: (from Mbpp)',
)
mbpp_test_loader = DataLoader(dataset=mbpp_test_dataset, batch_size=128, shuffle=False)
MBPP_Test_DF = pd.read_csv(mbpp_test_csv)



## Python-API Dataset

In [47]:
# Train / validation splits: text length 256, code length 64, batch size 16;
# only the training loader shuffles.
api_lengths = dict(text_length=256, code_length=64)
Api_train_dataset = Api_Dataset('api-mined_train.csv', **api_lengths)
Api_valid_dataset = Api_Dataset('api-mined_valid.csv', **api_lengths)
api_train_loader = DataLoader(Api_train_dataset, batch_size=16, shuffle=True)
api_val_loader = DataLoader(Api_valid_dataset, batch_size=16, shuffle=False)

# Test split: the same CSV backs both the prompt dataset and the DataFrame
# that is handed to get_bleu_score later in the notebook.
api_test_csv = 'data/pythonapi/test_processing.csv'
api_test_dataset = Test_Dataset(
    data=api_test_csv,
    task_prefix='Generate code from natural language: (from PythonAPI)',
)
api_test_loader = DataLoader(dataset=api_test_dataset, batch_size=256, shuffle=False)
API_Test_DF = pd.read_csv(api_test_csv)



# Pruned Model Performance

## MBPP Dataset

In [4]:
# BLEU of the loaded sparse checkpoint on the MBPP test loader.
mbpp_pruned_bleu = get_bleu_score(model, tokenizer, mbpp_test_loader, MBPP_Test_DF)
print("BLEU score:", mbpp_pruned_bleu)

BLEU score: 0.14992665956103046


## Python-API Dataset

In [6]:
# BLEU of the loaded sparse checkpoint on the Python-API test loader.
api_pruned_bleu = get_bleu_score(model, tokenizer, api_test_loader, API_Test_DF)
print("BLEU Score:", api_pruned_bleu)

BLEU Score: 0.2281461434656612


# Check Sparsity

In [5]:
# Every module name found in the model, kept in its own list so the discovery
# result is not thrown away by the hand-picked selection below.
all_module_names = []
get_module_names(model, '', all_module_names)

# Hand-picked selection the rest of the notebook works on: the modules of the
# first encoder block only.
module_names = [
    'encoder.block.0.layer.0.SelfAttention.q',
    'encoder.block.0.layer.0.SelfAttention.k',
    'encoder.block.0.layer.0.SelfAttention.v',
    'encoder.block.0.layer.0.SelfAttention.o',
    'encoder.block.0.layer.0.SelfAttention.relative_attention_bias',
    'encoder.block.0.layer.0.layer_norm',
    'encoder.block.0.layer.1.DenseReluDense.wi',
    'encoder.block.0.layer.1.DenseReluDense.wo',
    'encoder.block.0.layer.1.layer_norm',
]
module_names

['encoder.block.0.layer.0.SelfAttention.q',
 'encoder.block.0.layer.0.SelfAttention.k',
 'encoder.block.0.layer.0.SelfAttention.v',
 'encoder.block.0.layer.0.SelfAttention.o',
 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias',
 'encoder.block.0.layer.0.layer_norm',
 'encoder.block.0.layer.1.DenseReluDense.wi',
 'encoder.block.0.layer.1.DenseReluDense.wo',
 'encoder.block.0.layer.1.layer_norm']

In [6]:
# Collect (module, 'weight') pairs for every selected module that exposes a
# weight, together with the module name and its weight element count.
parameters_to_prune = []
pruneable_module_names = []
for module_name in module_names:
    # Modules whose name contains 'embed_tokens' are excluded.
    if 'embed_tokens' in module_name:
        continue
    module = get_module_by_name(model, module_name)
    # Skip modules without a usable weight explicitly instead of hiding every
    # possible error behind a bare `except`; anything unexpected now surfaces.
    weight_shape = getattr(getattr(module, 'weight', None), 'shape', None)
    if weight_shape is None:
        continue
    parameters_to_prune.append((module, 'weight'))
    pruneable_module_names.append((module_name, np.prod(weight_shape)))
parameters_to_prune[:10]

[(Linear(in_features=768, out_features=768, bias=False), 'weight'),
 (Linear(in_features=768, out_features=768, bias=False), 'weight'),
 (Linear(in_features=768, out_features=768, bias=False), 'weight'),
 (Linear(in_features=768, out_features=768, bias=False), 'weight'),
 (Embedding(32, 12), 'weight'),
 (T5LayerNorm(), 'weight'),
 (Linear(in_features=768, out_features=3072, bias=False), 'weight'),
 (Linear(in_features=3072, out_features=768, bias=False), 'weight'),
 (T5LayerNorm(), 'weight')]

In [7]:
# Measure how many entries of each selected weight tensor are exactly zero and
# report per-layer and overall sparsity.
count = 0
net_size = 0
pairs = []
sparsity_levels = []
sparse_parameters = 0
total_parameters = 0

# Parameter names look like '<module name>.weight'; strip the suffix by name
# rather than by a hard-coded character count.
weight_suffix = '.weight'
selected_modules = set(module_names)

for name, param in model.named_parameters():
    if not (param.requires_grad and name.endswith(weight_suffix)):
        continue
    module_name = name[:-len(weight_suffix)]
    if module_name not in selected_modules:
        continue
    print(module_name, param.size())
    n_total = param.numel()
    # Count zeros on the tensor's own device; no CPU / numpy copy is needed.
    n_zero = n_total - int(torch.count_nonzero(param.detach()))
    count += 1
    net_size += n_total
    pairs.append((name, n_total))
    sparsity_levels.append(n_zero / n_total if n_total else 0.0)
    sparse_parameters += n_zero
    total_parameters += n_total

# Guard the ratio so an empty selection reports 0.0 instead of raising.
net_sparsity = sparse_parameters / total_parameters if total_parameters else 0.0

print(f"Total number of layers: {count}")
print(f"Total number of parameters: {net_size}")
print(f"Sparsity Levels: {sparsity_levels}")
print(f"Sparse Parameters: {sparse_parameters}")
print(f"Total Parameters: {total_parameters}")
print(f"Net Sparsity: {net_sparsity}")

encoder.block.0.layer.0.SelfAttention.q torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.k torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.v torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.o torch.Size([768, 768])
encoder.block.0.layer.0.SelfAttention.relative_attention_bias torch.Size([32, 12])
encoder.block.0.layer.0.layer_norm torch.Size([768])
encoder.block.0.layer.1.DenseReluDense.wi torch.Size([3072, 768])
encoder.block.0.layer.1.DenseReluDense.wo torch.Size([768, 3072])
encoder.block.0.layer.1.layer_norm torch.Size([768])
Total number of layers: 9
Total number of parameters: 7079808
Sparsity Levels: [0.6037156846788194, 0.5508287217881944, 0.6263020833333334, 0.14462449815538195, 0.34375, 0.2708333333333333, 0.4542893303765191, 0.5067676968044705, 0.048177083333333336]
Sparse Parameters: 3403484
Total Parameters: 7079808
Net Sparsity: 0.48073111587206885


# Fine-tuning

## Apply the pruning masks

In [8]:
class CustomPruningMethod(prune.BasePruningMethod):
    """
    A custom pruning method that extends PyTorch's BasePruningMethod to implement
    an unstructured pruning technique using an externally supplied solution mask.

    Attributes:
        PRUNING_TYPE (str): Declares the pruning as 'unstructured', i.e. the mask
            acts on individual tensor entries.

    Methods:
        compute_mask(t, default_mask):
            Builds the mask for tensor 't' from the module-level 'solution_mask'
            variable, which must be set before the method is applied.
    """

    PRUNING_TYPE = 'unstructured'

    def compute_mask(self, t, default_mask):
        """
        Builds the pruning mask for the given tensor from the global 'solution_mask'.

        Parameters:
            t (torch.Tensor): The tensor to be pruned.
            default_mask (torch.Tensor): The mask supplied by the pruning framework.
                It is not used by this method.

        Returns:
            torch.Tensor: 'solution_mask' reshaped to t.shape when it has exactly
            t.numel() entries; otherwise an all-ones mask of t.shape (nothing is
            pruned) and a warning is emitted. The mask is placed on t's device.

        Raises:
            NameError: If no global 'solution_mask' has been defined.
        """
        import warnings  # local import keeps this definition self-contained

        global solution_mask
        candidate = torch.as_tensor(solution_mask)
        # Compare total element counts: len() would only look at the first
        # dimension and silently reject a mask that is already multi-dimensional.
        if candidate.numel() != t.numel():
            warnings.warn(
                f"solution_mask has {candidate.numel()} entries but the tensor has "
                f"{t.numel()}; falling back to an all-ones mask (no pruning).",
                stacklevel=2,
            )
            return torch.ones(t.shape, device=t.device)
        # Follow the tensor's device instead of hard-coding 'cuda' so the
        # method also works when the model lives on the CPU.
        return torch.reshape(candidate, t.shape).to(t.device)
    
    
def custom_unstructured(module, name):
    """
    Prune one parameter of `module` with CustomPruningMethod.

    The mask itself is not passed in here: CustomPruningMethod.compute_mask
    reads it from the global `solution_mask`, so that variable must hold the
    mask for this parameter before the call.

    Parameters:
        module (torch.nn.Module): Module that owns the parameter to prune.
        name (str): Name of the parameter inside `module`, e.g. 'weight'.

    Returns:
        torch.nn.Module: The very same `module` object that was passed in,
        after CustomPruningMethod.apply has been run on it.
    """
    # apply() is invoked for its effect on `module`; its return value is not used.
    CustomPruningMethod.apply(module, name)
    return module


In [9]:
# Step 1: read the stored mask ("best solution") of every selected layer.
layer_solutions = {}
solutions_dir = f"./solutions/T5ForConditionalGeneration/{exp_name}"

for layer_name in module_names:
    solution_path = f"{solutions_dir}/best_solution_{layer_name}.pkl"
    try:
        with open(solution_path, 'rb') as fp:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # solution files that were produced by a trusted run.
            best_solution = pickle.load(fp)
    except FileNotFoundError:
        # No solution file for this layer: no mask is applied to it. Other
        # failures (corrupt pickle, permissions) are no longer hidden.
        continue
    layer_solutions[layer_name] = best_solution
    print(f"Layer {layer_name} mask obtained!")

print()

# Step 2: attach each mask to its layer.
for layer_name, solution in layer_solutions.items():
    # CustomPruningMethod.compute_mask reads the mask from the module-level
    # variable `solution_mask`, so it has to be rebound before every call.
    solution_mask = torch.tensor(solution)
    layer = get_module_by_name(model, layer_name)
    custom_unstructured(layer, name='weight')
    print(f"Layer {layer_name} masked!")
print()

# The reported metric is BLEU, so the label says so.
print("BLEU score before fine-tuning: ", get_bleu_score(model, tokenizer, mbpp_test_loader, MBPP_Test_DF))

Layer encoder.block.0.layer.0.SelfAttention.q mask obtained!
Layer encoder.block.0.layer.0.SelfAttention.k mask obtained!
Layer encoder.block.0.layer.0.SelfAttention.v mask obtained!
Layer encoder.block.0.layer.0.SelfAttention.o mask obtained!
Layer encoder.block.0.layer.0.SelfAttention.relative_attention_bias mask obtained!
Layer encoder.block.0.layer.0.layer_norm mask obtained!
Layer encoder.block.0.layer.1.DenseReluDense.wi mask obtained!
Layer encoder.block.0.layer.1.DenseReluDense.wo mask obtained!
Layer encoder.block.0.layer.1.layer_norm mask obtained!

Layer encoder.block.0.layer.0.SelfAttention.q masked!
Layer encoder.block.0.layer.0.SelfAttention.k masked!
Layer encoder.block.0.layer.0.SelfAttention.v masked!
Layer encoder.block.0.layer.0.SelfAttention.o masked!
Layer encoder.block.0.layer.0.SelfAttention.relative_attention_bias masked!
Layer encoder.block.0.layer.0.layer_norm masked!
Layer encoder.block.0.layer.1.DenseReluDense.wi masked!
Layer encoder.block.0.layer.1.DenseRe

## Python-API Dataset

In [16]:
# Fine-tune on the Python-API data, reporting test BLEU after every round.
# The loop variable is deliberately not called `iter`: at notebook top level
# that name would shadow the builtin iter() for the rest of the session.
api_rounds = 1
for round_no in range(1, api_rounds + 1):
    print(f"Iteration {round_no}:")
    fine_tune(model, api_train_loader, api_val_loader, epochs=1, learning_rate=1e-6)
    print(f"BLEU Score after {round_no} iterations:", get_bleu_score(model, tokenizer, api_test_loader, API_Test_DF))

Iteration 1:
Epoch 1/1:
Train: batch: 10/750, Average loss:0.4205333173274994, Current loss:0.40900719165802
Train: batch: 20/750, Average loss:0.4155389741063118, Current loss:0.32355183362960815
Train: batch: 30/750, Average loss:0.40862095753351846, Current loss:0.31301403045654297
Train: batch: 40/750, Average loss:0.41205747947096827, Current loss:0.6436066627502441
Train: batch: 50/750, Average loss:0.405461967587471, Current loss:0.3688463866710663
Train: batch: 60/750, Average loss:0.4102784335613251, Current loss:0.3638272285461426
Train: batch: 70/750, Average loss:0.40467696615627835, Current loss:0.5133880972862244
Train: batch: 80/750, Average loss:0.3950720202177763, Current loss:0.3146379888057709
Train: batch: 90/750, Average loss:0.3852678866850005, Current loss:0.28135284781455994
Train: batch: 100/750, Average loss:0.38091221541166304, Current loss:0.22692283987998962
Train: batch: 110/750, Average loss:0.37284686470573597, Current loss:0.33059027791023254
Train: bat

## MBPP Dataset

In [10]:
# Fine-tune on the MBPP data, reporting test BLEU after every round.
# The loop variable is deliberately not called `iter`: at notebook top level
# that name would shadow the builtin iter() for the rest of the session.
mbpp_rounds = 3
for round_no in range(1, mbpp_rounds + 1):
    print(f"Iteration {round_no}:")
    fine_tune(model, mbpp_train_loader, mbpp_val_loader, epochs=1, learning_rate=1e-6)
    print(f"BLEU score after {round_no} iterations:", get_bleu_score(model, tokenizer, mbpp_test_loader, MBPP_Test_DF))

Iteration 1:
Epoch 1/1:
Train: batch: 10/98, Average loss:0.06321686990559101, Current loss:0.0262504443526268
Train: batch: 20/98, Average loss:0.06250367611646652, Current loss:0.09344208985567093
Train: batch: 30/98, Average loss:0.06994899275402228, Current loss:0.07343579828739166
Train: batch: 40/98, Average loss:0.06868068198673427, Current loss:0.07464797049760818
Train: batch: 50/98, Average loss:0.0657530989125371, Current loss:0.030841751024127007
Train: batch: 60/98, Average loss:0.06698097198580702, Current loss:0.07480858266353607
Train: batch: 70/98, Average loss:0.06856974711907761, Current loss:0.10628055781126022
Train: batch: 80/98, Average loss:0.06951920615974813, Current loss:0.05590860918164253
Train: batch: 90/98, Average loss:0.06994887662844526, Current loss:0.03594674542546272
Train: batch: 98/98, Average loss:0.07062521006683914, Current loss:0.11665552854537964
Train Loss: 0.07062521006683914
Valid: batch: 10/13, Average loss:0.042269826680421826, Current L

## Remove the masks (make them permanent)

In [12]:
# Call prune.remove on the 'weight' parameter of every layer that received a
# mask (per the section title, this is what makes the masks permanent).
# Only the layer names are needed, so iterate over the dict keys.
for layer_name in layer_solutions:
    prune.remove(get_module_by_name(model, layer_name), 'weight')
    print(f"Layer {layer_name} unmasked!")

Layer encoder.block.0.layer.0.SelfAttention.q unmasked!
Layer encoder.block.0.layer.0.SelfAttention.k unmasked!
Layer encoder.block.0.layer.0.SelfAttention.v unmasked!
Layer encoder.block.0.layer.0.SelfAttention.o unmasked!
Layer encoder.block.0.layer.0.SelfAttention.relative_attention_bias unmasked!
Layer encoder.block.0.layer.0.layer_norm unmasked!
Layer encoder.block.0.layer.1.DenseReluDense.wi unmasked!
Layer encoder.block.0.layer.1.DenseReluDense.wo unmasked!
Layer encoder.block.0.layer.1.layer_norm unmasked!


# Performance after Fine-tuning

## MBPP Dataset

In [13]:
# BLEU on the MBPP test loader for the model in its current state.
mbpp_finetuned_bleu = get_bleu_score(model, tokenizer, mbpp_test_loader, MBPP_Test_DF)
print("BLEU score:", mbpp_finetuned_bleu)

BLEU score: 0.1344859785908876


## Python-API Dataset

In [20]:
# BLEU on the Python-API test loader for the model in its current state.
api_finetuned_bleu = get_bleu_score(model, tokenizer, api_test_loader, API_Test_DF)
print("BLEU Score:", api_finetuned_bleu)

BLEU Score: 0.2917530479398626


# Save Model

In [14]:
# Persist the current model weights next to the sparse checkpoint of this experiment.
directory = f"./sparse_models/T5ForConditionalGeneration/{exp_name}"
# exist_ok=True replaces the separate exists() check, so the call is safe to
# re-run and has no window between the check and the creation.
os.makedirs(directory, exist_ok=True)
# Build the file path from `directory` so the two cannot drift apart.
torch.save(model.state_dict(), f"{directory}/sparse_weights_finetuned.pth")