In [None]:
!pip install pydot --quiet
!pip install tensorflow-datasets --quiet
!pip install transformers --quiet

In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see in this runtime.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [None]:
# Shell out to nvidia-smi to list the GPUs attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-e8bff272-c84b-f400-02e4-64c672740871)


In [None]:
import os
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout, Conv1D, GlobalMaxPooling1D, Concatenate, Activation
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, GPT2Config
import sklearn as sk
import os
from nltk.data import find
import matplotlib.pyplot as plt
import re
import tensorflow as tf

In [None]:
# Load IMDB reviews as three disjoint splits. Validation comes from the
# held-out tail of the official training set ('train[80%:]') so that no
# validation example is also a training example.
train_data, val_data, test_data = tfds.load(
    name="imdb_reviews",
    split=('train[:80%]', 'train[80%:]', 'test[80%:]'),
    as_supervised=True)

# Materialise the first batch of each split as (examples, labels) tensors
# (at most 20000 / 5000 / 1000 examples respectively).
train_examples, train_labels = next(iter(train_data.batch(20000)))
val_examples, val_labels = next(iter(val_data.batch(5000)))
test_examples, test_labels = next(iter(test_data.batch(1000)))

In [None]:
# Maximum number of tokens kept per review when tokenizing.
MAX_SEQUENCE_LENGTH = 100

# Tokenizer for the "gpt2" checkpoint; the end-of-sequence token is reused
# as the padding token so that fixed-length padding is possible.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Pretrained LM-head model, configured to also return its hidden states.
config = GPT2Config.from_pretrained("gpt2", output_hidden_states=True)
gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2", config=config)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
#Embedding size of GPT2 tokenizer: 768
#Dictionary size of GPT2 tokenizer: 50257


def _decode_reviews(review_tensor):
    """Turn a tensor of UTF-8 byte strings into a list of Python strings."""
    return [raw.decode('utf-8') for raw in review_tensor.numpy()]


def _tokenize_reviews(texts):
    """Tokenize texts to exactly MAX_SEQUENCE_LENGTH tokens (truncate or pad) as TF tensors."""
    return tokenizer(texts,
                     max_length=MAX_SEQUENCE_LENGTH,
                     truncation=True,
                     padding='max_length',
                     return_tensors='tf')


def _as_model_inputs(tokenized):
    """Keep only the two tensors the classifier is fed, keyed by input name."""
    return {key: tokenized[key] for key in ('input_ids', 'attention_mask')}


# Raw review text for each split.
train_examples_str = _decode_reviews(train_examples)
val_examples_str = _decode_reviews(val_examples)
test_examples_str = _decode_reviews(test_examples)

# Training data
gpt_train_tokenized = _tokenize_reviews(train_examples_str)
gpt_train_inputs = _as_model_inputs(gpt_train_tokenized)
gpt_train_labels = np.array(train_labels)

# Validation data
gpt_val_tokenized = _tokenize_reviews(val_examples_str)
gpt_val_inputs = _as_model_inputs(gpt_val_tokenized)
gpt_val_labels = np.array(val_labels)

# Testing data
gpt_test_tokenized = _tokenize_reviews(test_examples_str)
gpt_test_inputs = _as_model_inputs(gpt_test_tokenized)
gpt_test_labels = np.array(test_labels)


In [None]:
#12 transformer layers (the "gpt2" checkpoint; the per-layer masks below cover layers 0-11)
#On top: a dropout layer + a dense layer with hidden size 100 + a final dense layer with sigmoid activation


def create_gpt_last_model(gpt_model,
                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                          hidden_size = 100,
                          dropout=0.3,
                          learning_rate=0.00005):
    """
    Build a binary classifier on top of a GPT-2 language model.

    The LM-head logits (not the raw hidden state) at the last real, i.e.
    non-padding, token of every sequence are fed through dropout, a ReLU dense
    layer and a single sigmoid unit.

    Args:
        gpt_model: model that is called with `input_ids` and `attention_mask`
            and whose output exposes 3-D `.logits` (batch, position, feature).
            It is switched to trainable as a side effect, so it is fine-tuned
            together with the classification head.
        max_sequence_length: fixed token length of both inputs.
        hidden_size: width of the intermediate dense layer.
        dropout: dropout rate applied to the selected logits.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled Keras `Model` taking `[input_ids, attention_mask]` and
        producing one sigmoid probability per example.
    """

    gpt_model.trainable = True

    # Inputs, shape (batch, max_sequence_length); the names match the keys of
    # the input dictionaries the model is fitted with.
    input_ids = Input(shape=(max_sequence_length,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(max_sequence_length,), dtype=tf.int32, name='attention_mask')

    # Single forward pass through GPT-2 (no autoregressive generation).
    gpt2_outputs = gpt_model(input_ids=input_ids, attention_mask=attention_mask)

    # Index of the last non-padding token of each sequence. This assumes the
    # padding is appended on the right (1s followed by 0s in attention_mask);
    # always taking position -1 would read a padding position for any sequence
    # shorter than max_sequence_length. Clamp at 0 so a fully padded row still
    # yields a valid index.
    last_token_index = tf.maximum(tf.reduce_sum(attention_mask, axis=1) - 1, 0)

    # Pick, per example, the logits at that position -> (batch, feature).
    last_token_logits = tf.gather(gpt2_outputs.logits,
                                  last_token_index,
                                  axis=1,
                                  batch_dims=1)

    # Classification head: dropout -> dense(ReLU) -> dense(sigmoid).
    x = Dropout(dropout)(last_token_logits)
    x = Dense(hidden_size, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    # Assemble and compile the model for binary classification.
    classification_model = Model(inputs=[input_ids, attention_mask], outputs=output)
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss='binary_crossentropy',
                                 metrics=['accuracy'])

    return classification_model


In [None]:
# Reset Keras' global state before the classifier is built.
K.clear_session()

In [None]:
# Wrap the pretrained GPT-2 in the classification head ...
gpt_logit_model_classification = create_gpt_last_model(gpt2_model)

# ... and train it, monitoring the validation set after every epoch.
history = gpt_logit_model_classification.fit(
    x=gpt_train_inputs,
    y=gpt_train_labels,
    batch_size=8,
    epochs=2,
    validation_data=(gpt_val_inputs, gpt_val_labels),
)

Epoch 1/2
Epoch 2/2


In [None]:
# Baseline: [loss, accuracy] on the test set before any neurons are zeroed out.
gpt_logit_model_classification.evaluate(x=gpt_test_inputs, y=gpt_test_labels)



[0.5087048411369324, 0.8519999980926514]

In [None]:
import time

# Warm-up: the first predict() call on a model also builds its predict
# function, a one-time cost that should not be counted as inference time.
warmup_inputs = {name: tensor[:32] for name, tensor in gpt_test_inputs.items()}
gpt_logit_model_classification.predict(warmup_inputs, verbose=0)

# perf_counter() is the clock intended for measuring elapsed durations.
start_time = time.perf_counter()
prediction = gpt_logit_model_classification.predict(gpt_test_inputs)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print("Elapsed time: {:.2f} seconds".format(elapsed_time))

Elapsed time: 23.37 seconds


In [None]:
"""
Cosine similarity > 0.35

=================Input 0====================
Layer 10, Neurons: ['Layer 10 Neuron 804']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 598', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860']

=================Input 1====================
Layer 10, Neurons: ['Layer 10 Neuron 804', 'Layer 10 Neuron 1410', 'Layer 10 Neuron 1521', 'Layer 10 Neuron 1783', 'Layer 10 Neuron 2931']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 480', 'Layer 11 Neuron 598', 'Layer 11 Neuron 796', 'Layer 11 Neuron 847', 'Layer 11 Neuron 869', 'Layer 11 Neuron 975', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126',
                    'Layer 11 Neuron 1231', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1428', 'Layer 11 Neuron 1575', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860', 'Layer 11 Neuron 1998', 'Layer 11 Neuron 2375',
                    'Layer 11 Neuron 2378', 'Layer 11 Neuron 2600', 'Layer 11 Neuron 2822']

=================Input 2====================
Layer 10, Neurons: ['Layer 10 Neuron 804']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 598', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860']

=================Input 3====================
Layer 10, Neurons: ['Layer 10 Neuron 804', 'Layer 10 Neuron 1410', 'Layer 10 Neuron 1521', 'Layer 10 Neuron 1783', 'Layer 10 Neuron 2931']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 480', 'Layer 11 Neuron 598', 'Layer 11 Neuron 796', 'Layer 11 Neuron 847', 'Layer 11 Neuron 869', 'Layer 11 Neuron 975', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126',
                    'Layer 11 Neuron 1231', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1428', 'Layer 11 Neuron 1575', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860', 'Layer 11 Neuron 1998', 'Layer 11 Neuron 2375',
                    'Layer 11 Neuron 2378', 'Layer 11 Neuron 2600', 'Layer 11 Neuron 2822']

=================Input 4====================
Layer 10, Neurons: ['Layer 10 Neuron 804', 'Layer 10 Neuron 1410']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 480', 'Layer 11 Neuron 598', 'Layer 11 Neuron 796', 'Layer 11 Neuron 847', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322',
                    'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860', 'Layer 11 Neuron 2375', 'Layer 11 Neuron 2378']

=================Input 5====================
Layer 10, Neurons: ['Layer 10 Neuron 804']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860']

=================Input 6====================
Layer 10, Neurons: ['Layer 10 Neuron 804', 'Layer 10 Neuron 1410']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 598', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860', 'Layer 11 Neuron 2375',
                    'Layer 11 Neuron 2378', 'Layer 11 Neuron 2822']

=================Input 7====================
Layer 10, Neurons: ['Layer 10 Neuron 804', 'Layer 10 Neuron 1410', 'Layer 10 Neuron 1783']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 480', 'Layer 11 Neuron 598', 'Layer 11 Neuron 796', 'Layer 11 Neuron 847', 'Layer 11 Neuron 975', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1231',
                    'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1428', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860', 'Layer 11 Neuron 2375', 'Layer 11 Neuron 2378', 'Layer 11 Neuron 2822']


=================Input 8====================
Layer 10, Neurons: ['Layer 10 Neuron 804']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 598', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860']

=================Input 9====================
Layer 10, Neurons: ['Layer 10 Neuron 804', 'Layer 10 Neuron 1410']
Layer 11, Neurons: ['Layer 11 Neuron 309', 'Layer 11 Neuron 458', 'Layer 11 Neuron 598', 'Layer 11 Neuron 1120', 'Layer 11 Neuron 1126', 'Layer 11 Neuron 1289', 'Layer 11 Neuron 1322', 'Layer 11 Neuron 1637', 'Layer 11 Neuron 1860', 'Layer 11 Neuron 2375', 'Layer 11 Neuron 2378']
"""




# Build one 0/1 mask per transformer layer: a fixed number of randomly chosen
# FFN neurons is zeroed in every layer, but the neurons listed for that layer
# in preserved_neuron_list are never zeroed.
# The masks are applied to the second FFN layer (mlp/c_proj).
preserved_neuron_list = [[],
                       [],
                       [],
                       [],
                       [],
                       [],
                       [],
                       [],
                       [],
                       [],
                       [804, 1410, 1521, 1783, 2931],
                       [309, 458, 480, 598, 796, 847, 869, 975, 1120, 1126, 1231, 1289, 1322, 1428, 1575, 1637, 1860, 1998, 2375, 2378, 2600, 2822]]

num_neurons = 3072           # FFN neurons per layer
num_masked_per_layer = 307   # neurons zeroed out in each layer
mask_seed = 0                # fixed seed so the random selection is reproducible

# One shared random ordering of all neuron indices.
rng = np.random.default_rng(mask_seed)
original_list = list(range(num_neurons))
rng.shuffle(original_list)

# Per layer, take the first num_masked_per_layer neurons of that ordering
# that are not preserved for the layer.
masked_neurons_list = []
for preserved_neurons in preserved_neuron_list:
    protected = set(preserved_neurons)
    candidates = [neuron for neuron in original_list if neuron not in protected]
    masked_neurons_list.append(candidates[:num_masked_per_layer])

# masks[i] is a (num_neurons,) array: 0 for zeroed neurons, 1 otherwise.
# Exactly one mask is appended per layer so that masks[i] always belongs to layer i.
masks = []
for masked_neurons in masked_neurons_list:
    mask = np.ones(num_neurons)
    mask[masked_neurons] = 0
    masks.append(mask)

In [None]:
# Example variable name: its third path component ('h_._0') carries the
# layer number as the last '_'-separated piece.
layer = 'tfgpt2lm_head_model/transformer/h_._0/mlp/c_proj/weight:0'
block_component = layer.split('/')[2]
block_component.split('_')

['h', '.', '0']

In [None]:
# Zero out the masked neurons in GPT-2's second FFN projection of every layer.
# Target variables are named like
#   tfgpt2lm_head_model/transformer/h_._{layer_num}/mlp/c_proj/weight:0

for var in gpt2_model.variables:
    if 'mlp/c_proj/weight' not in var.name:
        continue

    # Third path component looks like 'h_._3'; its last piece is the layer number.
    block_component = var.name.split('/')[2]
    layer_num = int(block_component.split('_')[2])

    # The (3072,) mask is turned into a column so that each mask entry
    # scales one whole row of the weight matrix.
    row_mask = masks[layer_num].reshape(-1, 1)

    # Multiply a copy of the weights in place and write it back to the variable.
    weights = var.numpy()
    weights *= row_mask
    var.assign(weights)


In [None]:
# Sanity check: print the top-left 3x3 corner of every c_proj weight matrix
# to see whether rows have been zeroed out.
for var in gpt2_model.variables:
    if 'mlp/c_proj/weight' not in var.name:
        continue
    top_left_corner = var.numpy()[:3, :3]
    print(var.name, top_left_corner)

tfgpt2lm_head_model/transformer/h_._0/mlp/c_proj/weight:0 [[-0.          0.          0.        ]
 [ 0.0384227  -0.06016427  0.08401392]
 [-0.          0.         -0.        ]]
tfgpt2lm_head_model/transformer/h_._1/mlp/c_proj/weight:0 [[ 0.         -0.         -0.        ]
 [-0.20907356 -0.15552253 -0.08660339]
 [ 0.         -0.          0.        ]]
tfgpt2lm_head_model/transformer/h_._2/mlp/c_proj/weight:0 [[-0.         -0.         -0.        ]
 [ 0.12518509  0.02254442 -0.12007061]
 [-0.         -0.         -0.        ]]
tfgpt2lm_head_model/transformer/h_._3/mlp/c_proj/weight:0 [[ 0.          0.         -0.        ]
 [ 0.02451897 -0.00284892  0.04060924]
 [ 0.          0.          0.        ]]
tfgpt2lm_head_model/transformer/h_._4/mlp/c_proj/weight:0 [[-0.          0.         -0.        ]
 [-0.08047505  0.01466465  0.13464928]
 [-0.          0.          0.        ]]
tfgpt2lm_head_model/transformer/h_._5/mlp/c_proj/weight:0 [[ 0.          0.          0.        ]
 [-0.16617684  0.111095

In [None]:
# gpt2_model's c_proj weights have been masked; re-evaluate the classifier that wraps it.
gpt_logit_model_classification.evaluate(x=gpt_test_inputs, y=gpt_test_labels)

#80~84
#23 ~27.39
#82, 23.06

#            accuracy process time (1000 inputs)
# 0%        --  0.844 27.39
#10% (307)  --  0.827 11.93   0.856, 11.88 (0.852,23.37) 3684 neurons removed
#30% (921)  --  0.657 11.81                              11052
#50% (1531) --  0.582 12.59                              18372
#70% (2150) --  0.509 11.78                              25800
#80% (2457) --
#90% (2764) --
#95% (2922) --


#Pending further verification



[0.5137210488319397, 0.8560000061988831]

In [None]:
import time

# Warm-up run so that any one-time predict-function setup is excluded from
# the measurement, whichever timing cell happens to run first.
warmup_inputs = {name: tensor[:32] for name, tensor in gpt_test_inputs.items()}
gpt_logit_model_classification.predict(warmup_inputs, verbose=0)

# perf_counter() is the clock intended for measuring elapsed durations.
start_time = time.perf_counter()
prediction = gpt_logit_model_classification.predict(gpt_test_inputs)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print("Elapsed time: {:.2f} seconds".format(elapsed_time))

Elapsed time: 11.88 seconds
