In [1]:
import re
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from copy import deepcopy
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer

# Pick the compute device once: the GPU when PyTorch can see one, the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Cuda computing enabled" if device == 'cuda' else "No cuda found")

Cuda computing enabled


## Loading the model and data

In [2]:
"""
Loading the model and tokenizer
""" 

# Checkpoint to load and the directory used as the Hugging Face download cache.
# NOTE(review): CACHE_PATH is an absolute, machine-specific path; adjust it before running elsewhere.
CACHE_PATH = "/mnt/hdd_drive/huggingface/hub/"
MODEL_NAME = "roberta-base"

# The tokenizer and the encoder are loaded independently from the same checkpoint;
# the encoder is moved to the selected device right after loading.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_PATH)
model = RobertaModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_PATH).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
"""
Check the model architecture.
""" 
# Bare expression as the last line of the cell, so the notebook displays the module's repr.
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [5]:
"""
Loading the datasets
"""

# Prefix prepended to every dataset file name below.
# NOTE(review): "..." looks like a placeholder; set it to the real data folder (with a trailing slash).
DATA_PATH = "..."

# GPT-4o datasets (CSV files).
df_gpt4o_wiki = pd.read_csv(f"{DATA_PATH}gpt-4-o-wiki-correct-1500.csv")
df_gpt4o_reddit = pd.read_csv(f"{DATA_PATH}gpt-4-o-reddit-1500.csv")
df_gpt4o_stackex = pd.read_csv(f"{DATA_PATH}gpt-4-o-stackexchange-1500.csv")

# GPT-3 datasets (JSON Lines files); only the first 1500 rows of each are kept.
df_gpt3_wiki = pd.read_json(f"{DATA_PATH}gpt3_davinci_003_wikip.jsonl_pp", lines=True).iloc[:1500]
df_gpt3_reddit = pd.read_json(f"{DATA_PATH}gpt3_davinci_003_reddit.jsonl_pp", lines=True).iloc[:1500]
df_gpt3_stackex = pd.read_json(f"{DATA_PATH}gpt3_davinci_003_300_len.jsonl", lines=True).iloc[:1500]

## Main functions

In [4]:
# Folder prefix for the embedding .npy files written and read by the cells below.
output_folder = "cache/"


def prune_layers(model, layers=(0,), return_weights=True):
    """
    Disable the attention output of entire layers by zeroing the weight matrix of
    `attention.output.dense` in each selected layer (the bias is left untouched).

    The standard method model.prune_heads({...}) of BERT-like models might not work
    with GPU acceleration if an entire layer of heads is removed (pruned), hence
    this weight-zeroing approach.

    Parameters:
        model          --- transformer model to prune (modified in place)
        layers         --- iterable of layer numbers (zero-based) to be pruned;
                           repeated indices are processed only once
        return_weights --- boolean flag, whether to return the original weights
                           (needed to repair the model with restore_layers)

    Returns:
        dict {layer number: copy of the original weight tensor} if return_weights
        is true, otherwise None.
    """
    layers_to_remove = list(layers)
    preserved_copies = {}
    print(layers_to_remove)
    with torch.no_grad():
        for j in layers_to_remove:
            if j in preserved_copies:
                # Already zeroed during this call: saving again would overwrite the
                # genuine backup with zeros and make the layer unrecoverable.
                continue
            # If you plan to use other model than BERT/RoBERTa,
            # change the following line according to the architecture of your model
            weight = model.encoder.layer[j].attention.output.dense.weight
            preserved_copies[j] = weight.detach().clone()
            # zero_() gives exact zeros even where a weight is inf/NaN (x * 0.0 would give NaN).
            weight.zero_()

    if return_weights:
        return preserved_copies
    
    
def restore_layers(model, preserved_copies):
    """
    Function to repair a model pruned by function prune_layers( ).

    Each saved weight is copied over the current one, so the call is idempotent:
    restoring twice, or restoring a layer whose weight is not exactly zero,
    still yields the saved values.

    Parameters:
        model            --- transformer model to repair (modified in place)
        preserved_copies --- dictionary {layer number: weight tensor} returned by prune_layers
    """
    with torch.no_grad():
        for j, saved_weight in preserved_copies.items():
            # If you plan to use other model than BERT/RoBERTa,
            # change the following line according to the architecture of your model
            model.encoder.layer[j].attention.output.dense.weight.copy_(saved_weight)

In [6]:
def get_avg_pool(series):
    """
    Embed every text in the container <series> and return the embeddings stacked row-wise.

    Each text is whitespace-normalised, tokenized (truncated to at most 510 tokens) and
    passed through the global `model`; the first element of the model output is averaged
    over its first remaining axis (presumably the token axis of the last hidden state -
    TODO confirm) to give one vector per text.
    """
    pooled_rows = []
    for raw_text in tqdm(series):
        # Collapse every run of whitespace into a single space and trim both ends.
        cleaned = re.sub(r'\s+', ' ', raw_text).strip()
        encoded = tokenizer(cleaned, truncation=True, max_length=510, return_tensors="pt").to(device)
        with torch.no_grad():
            first_output = model(**encoded)[0]

        # A single text is encoded per call, so index 0 selects its only batch entry.
        text_states = first_output[0]
        pooled_rows.append(text_states.mean(dim=0).cpu().numpy())

    return np.vstack(pooled_rows)

### baseline --- no pruning

In [8]:
# Baseline (unpruned model) embeddings for the GPT-4o datasets.
# For each domain the "gold_completion" column is embedded first and saved under the
# "human4o" name, then "gen_completion" is embedded and saved under the "gpt4o" name.

# Wikipedia
cls_lst_w1, cls_lst_w2 = (
    get_avg_pool(df_gpt4o_wiki[column]) for column in ("gold_completion", "gen_completion")
)
np.save(f'{output_folder}avg_roberta_human4o_wiki_none.npy', cls_lst_w1)
np.save(f'{output_folder}avg_roberta_gpt4o_wiki_none.npy', cls_lst_w2)

# Reddit
cls_lst_r1, cls_lst_r2 = (
    get_avg_pool(df_gpt4o_reddit[column]) for column in ("gold_completion", "gen_completion")
)
np.save(f'{output_folder}avg_roberta_human4o_reddit_none.npy', cls_lst_r1)
np.save(f'{output_folder}avg_roberta_gpt4o_reddit_none.npy', cls_lst_r2)

# StackExchange
cls_lst_s1, cls_lst_s2 = (
    get_avg_pool(df_gpt4o_stackex[column]) for column in ("gold_completion", "gen_completion")
)
np.save(f'{output_folder}avg_roberta_human4o_stackexchange_none.npy', cls_lst_s1)
np.save(f'{output_folder}avg_roberta_gpt4o_stackexchange_none.npy', cls_lst_s2)

100%|██████████| 1500/1500 [00:38<00:00, 39.28it/s]
100%|██████████| 1500/1500 [00:33<00:00, 44.54it/s]
100%|██████████| 1500/1500 [00:18<00:00, 81.42it/s]
100%|██████████| 1500/1500 [00:31<00:00, 47.50it/s]
100%|██████████| 1500/1500 [00:34<00:00, 43.30it/s]
100%|██████████| 1500/1500 [00:40<00:00, 37.34it/s]


In [9]:
# Baseline (unpruned model) embeddings for the GPT-3 datasets.
# For each domain the "gold_completion" column is embedded first and saved under the
# "human3" name, then "gen_completion" is embedded and saved under the "gpt3" name.

# Wikipedia
cls_lst_w1, cls_lst_w2 = (
    get_avg_pool(df_gpt3_wiki[column]) for column in ("gold_completion", "gen_completion")
)
np.save(f'{output_folder}avg_roberta_human3_wiki_none.npy', cls_lst_w1)
np.save(f'{output_folder}avg_roberta_gpt3_wiki_none.npy', cls_lst_w2)

# Reddit
cls_lst_r1, cls_lst_r2 = (
    get_avg_pool(df_gpt3_reddit[column]) for column in ("gold_completion", "gen_completion")
)
np.save(f'{output_folder}avg_roberta_human3_reddit_none.npy', cls_lst_r1)
np.save(f'{output_folder}avg_roberta_gpt3_reddit_none.npy', cls_lst_r2)

# StackExchange
cls_lst_s1, cls_lst_s2 = (
    get_avg_pool(df_gpt3_stackex[column]) for column in ("gold_completion", "gen_completion")
)
np.save(f'{output_folder}avg_roberta_human3_stackexchange_none.npy', cls_lst_s1)
np.save(f'{output_folder}avg_roberta_gpt3_stackexchange_none.npy', cls_lst_s2)

100%|██████████| 1500/1500 [00:35<00:00, 42.35it/s]
100%|██████████| 1500/1500 [00:20<00:00, 74.04it/s]
100%|██████████| 1500/1500 [00:20<00:00, 73.79it/s]
100%|██████████| 1500/1500 [00:22<00:00, 66.43it/s]
100%|██████████| 1500/1500 [00:36<00:00, 41.19it/s]
100%|██████████| 1500/1500 [00:28<00:00, 53.26it/s]


### Pruning applied

In [35]:
# For every encoder layer in turn: zero its attention output weights, recompute and save the
# embeddings of all twelve text collections under the suffix "_l<i>", then put the weights back.
# The layer count is read from the model instead of being hard-coded.
for i in range(len(model.encoder.layer)):
    print("Proceesing layer #" + str(i) + ":")
    preserved_copies = prune_layers(model, [i])

    try:
        # --- GPT-3 datasets ---
        cls_lst_w1 = get_avg_pool(df_gpt3_wiki["gold_completion"])
        cls_lst_w2 = get_avg_pool(df_gpt3_wiki["gen_completion"])
        np.save(output_folder + 'avg_roberta_human3_wiki_l' + str(i) + '.npy', cls_lst_w1)
        np.save(output_folder + 'avg_roberta_gpt3_wiki_l' + str(i) + '.npy', cls_lst_w2)

        cls_lst_r1 = get_avg_pool(df_gpt3_reddit["gold_completion"])
        cls_lst_r2 = get_avg_pool(df_gpt3_reddit["gen_completion"])
        np.save(output_folder + 'avg_roberta_human3_reddit_l' + str(i) + '.npy', cls_lst_r1)
        np.save(output_folder + 'avg_roberta_gpt3_reddit_l' + str(i) + '.npy', cls_lst_r2)

        cls_lst_s1 = get_avg_pool(df_gpt3_stackex["gold_completion"])
        cls_lst_s2 = get_avg_pool(df_gpt3_stackex["gen_completion"])
        np.save(output_folder + 'avg_roberta_human3_stackexchange_l' + str(i) + '.npy', cls_lst_s1)
        np.save(output_folder + 'avg_roberta_gpt3_stackexchange_l' + str(i) + '.npy', cls_lst_s2)

        # --- GPT-4o datasets ---
        cls_lst_w1 = get_avg_pool(df_gpt4o_wiki["gold_completion"])
        cls_lst_w2 = get_avg_pool(df_gpt4o_wiki["gen_completion"])
        np.save(output_folder + 'avg_roberta_human4o_wiki_l' + str(i) + '.npy', cls_lst_w1)
        np.save(output_folder + 'avg_roberta_gpt4o_wiki_l' + str(i) + '.npy', cls_lst_w2)

        cls_lst_r1 = get_avg_pool(df_gpt4o_reddit["gold_completion"])
        cls_lst_r2 = get_avg_pool(df_gpt4o_reddit["gen_completion"])
        np.save(output_folder + 'avg_roberta_human4o_reddit_l' + str(i) + '.npy', cls_lst_r1)
        np.save(output_folder + 'avg_roberta_gpt4o_reddit_l' + str(i) + '.npy', cls_lst_r2)

        cls_lst_s1 = get_avg_pool(df_gpt4o_stackex["gold_completion"])
        cls_lst_s2 = get_avg_pool(df_gpt4o_stackex["gen_completion"])
        np.save(output_folder + 'avg_roberta_human4o_stackexchange_l' + str(i) + '.npy', cls_lst_s1)
        np.save(output_folder + 'avg_roberta_gpt4o_stackexchange_l' + str(i) + '.npy', cls_lst_s2)
    finally:
        # Always undo the pruning, even when embedding or saving fails or the cell is
        # interrupted; otherwise the global model would stay pruned for all later cells.
        restore_layers(model, preserved_copies)
    print('\n\n')

Proceesing layer #1:
[1]


100%|██████████| 1500/1500 [00:36<00:00, 41.59it/s]
100%|██████████| 1500/1500 [00:17<00:00, 84.04it/s]
100%|██████████| 1500/1500 [00:20<00:00, 74.80it/s]
100%|██████████| 1500/1500 [00:26<00:00, 57.05it/s]
100%|██████████| 1500/1500 [00:35<00:00, 41.98it/s]
100%|██████████| 1500/1500 [00:29<00:00, 51.61it/s]





Proceesing layer #2:
[2]


100%|██████████| 1500/1500 [00:34<00:00, 43.68it/s]
100%|██████████| 1500/1500 [00:19<00:00, 75.29it/s]
100%|██████████| 1500/1500 [00:20<00:00, 73.70it/s]
100%|██████████| 1500/1500 [00:22<00:00, 65.25it/s]
100%|██████████| 1500/1500 [00:37<00:00, 40.07it/s]
100%|██████████| 1500/1500 [00:25<00:00, 59.85it/s]





Proceesing layer #3:
[3]


100%|██████████| 1500/1500 [00:36<00:00, 40.70it/s]
100%|██████████| 1500/1500 [00:19<00:00, 76.08it/s]
100%|██████████| 1500/1500 [00:18<00:00, 82.65it/s] 
100%|██████████| 1500/1500 [00:25<00:00, 58.11it/s]
100%|██████████| 1500/1500 [00:35<00:00, 42.53it/s]
100%|██████████| 1500/1500 [00:24<00:00, 60.42it/s]





Proceesing layer #4:
[4]


100%|██████████| 1500/1500 [00:36<00:00, 40.83it/s]
100%|██████████| 1500/1500 [00:18<00:00, 80.63it/s] 
100%|██████████| 1500/1500 [00:19<00:00, 76.18it/s]
100%|██████████| 1500/1500 [00:26<00:00, 56.70it/s]
100%|██████████| 1500/1500 [00:33<00:00, 44.69it/s]
100%|██████████| 1500/1500 [00:27<00:00, 54.50it/s]





Proceesing layer #5:
[5]


100%|██████████| 1500/1500 [00:35<00:00, 41.93it/s]
100%|██████████| 1500/1500 [00:18<00:00, 79.52it/s]
100%|██████████| 1500/1500 [00:20<00:00, 73.62it/s]
100%|██████████| 1500/1500 [00:24<00:00, 62.18it/s]
100%|██████████| 1500/1500 [00:36<00:00, 41.39it/s]
100%|██████████| 1500/1500 [00:27<00:00, 54.37it/s]





Proceesing layer #6:
[6]


100%|██████████| 1500/1500 [00:35<00:00, 42.48it/s]
100%|██████████| 1500/1500 [00:20<00:00, 73.96it/s]
100%|██████████| 1500/1500 [00:19<00:00, 78.29it/s] 
100%|██████████| 1500/1500 [00:21<00:00, 68.43it/s]
100%|██████████| 1500/1500 [00:35<00:00, 41.68it/s]
100%|██████████| 1500/1500 [00:26<00:00, 56.14it/s]





Proceesing layer #7:
[7]


100%|██████████| 1500/1500 [00:36<00:00, 40.63it/s]
100%|██████████| 1500/1500 [00:19<00:00, 75.64it/s]
100%|██████████| 1500/1500 [00:17<00:00, 87.83it/s] 
100%|██████████| 1500/1500 [00:25<00:00, 59.45it/s]
100%|██████████| 1500/1500 [00:36<00:00, 41.39it/s]
100%|██████████| 1500/1500 [00:24<00:00, 60.05it/s]





Proceesing layer #8:
[8]


100%|██████████| 1500/1500 [00:37<00:00, 40.24it/s]
100%|██████████| 1500/1500 [00:16<00:00, 89.61it/s]
100%|██████████| 1500/1500 [00:20<00:00, 74.93it/s]
100%|██████████| 1500/1500 [00:25<00:00, 59.83it/s]
100%|██████████| 1500/1500 [00:34<00:00, 42.95it/s]
100%|██████████| 1500/1500 [00:28<00:00, 52.54it/s]





Proceesing layer #9:
[9]


100%|██████████| 1500/1500 [00:34<00:00, 43.09it/s]
100%|██████████| 1500/1500 [00:19<00:00, 75.81it/s]
100%|██████████| 1500/1500 [00:20<00:00, 73.25it/s] 
100%|██████████| 1500/1500 [00:25<00:00, 59.61it/s]
100%|██████████| 1500/1500 [00:37<00:00, 40.34it/s]
100%|██████████| 1500/1500 [00:27<00:00, 54.75it/s]





Proceesing layer #10:
[10]


100%|██████████| 1500/1500 [00:38<00:00, 38.50it/s]
100%|██████████| 1500/1500 [00:28<00:00, 53.21it/s]
100%|██████████| 1500/1500 [00:27<00:00, 54.09it/s]
100%|██████████| 1500/1500 [00:30<00:00, 49.70it/s]
100%|██████████| 1500/1500 [00:42<00:00, 35.55it/s]
100%|██████████| 1500/1500 [00:32<00:00, 46.63it/s]





Proceesing layer #11:
[11]


100%|██████████| 1500/1500 [00:41<00:00, 36.49it/s]
100%|██████████| 1500/1500 [00:29<00:00, 51.38it/s]
100%|██████████| 1500/1500 [00:25<00:00, 59.39it/s]
100%|██████████| 1500/1500 [00:32<00:00, 46.30it/s]
100%|██████████| 1500/1500 [00:40<00:00, 36.69it/s]
100%|██████████| 1500/1500 [00:30<00:00, 48.65it/s]









### print the results

In [7]:
def _load_embedding_pair(folder, model_name, suffix, domain, config_name):
    """Load the saved (human, generated) embedding matrices of one generator/domain pair."""
    human = np.load("{}avg_{}_human{}_{}_{}.npy".format(folder, model_name, suffix, domain, config_name))
    generated = np.load("{}avg_{}_gpt{}_{}_{}.npy".format(folder, model_name, suffix, domain, config_name))
    return human, generated


def _stack_with_labels(human, generated):
    """Stack human rows above generated rows and label them 0 and 1 respectively."""
    X = np.vstack([human, generated])
    y = np.zeros(len(X))
    # Label by the actual number of human rows, so unequal class sizes are still labelled correctly.
    y[len(human):] = 1
    return X, y


def print_crossdomen_results(model_name="roberta", config_name="none", C=1):
    """
    Print a train/test accuracy table for human-vs-generated text classification.

    For every generator ("3", "4o") and domain (wiki, reddit, stackexchange) a logistic
    regression is fitted on the first 1300 rows of the saved human and generated embeddings
    and scored on the remaining rows of every generator/domain pair. One table row is
    printed per training set, followed by three averages:
        Cross-domain       --- same generator, different domain (12 cells)
        Cross-model        --- different generator, same domain (6 cells)
        Cross-domain&model --- different generator, different domain (12 cells)

    Parameters:
        model_name  --- model tag used in the .npy file names
        config_name --- configuration tag used in the .npy file names ("none", "l0", ...)
        C           --- inverse regularisation strength passed to LogisticRegression

    Files are read from the global `output_folder`.
    """
    avg_in = 0
    avg_out = 0
    avg_3 = 0

    domains = ["wiki", "reddit", "stackexchange"]
    line_prefixes = [" GPT-3  ", " GPT-4o "]
    suffixes = ["3", "4o"]
    n_train = 1300  # rows [0, n_train) are used for training, the rest for validation

    # Read every file once up front instead of re-reading it for each table cell.
    embeddings = {
        (suffix, domain): _load_embedding_pair(output_folder, model_name, suffix, domain, config_name)
        for suffix in suffixes
        for domain in domains
    }
    validation_sets = {
        key: _stack_with_labels(human[n_train:], generated[n_train:])
        for key, (human, generated) in embeddings.items()
    }

    print(' ' * 21 + "        GPT-3     " + "        GPT-4o    ")
    print(' ' * 21 + "  Wiki Redd. Stac." * 2)
    for line_prefix, suffix_t in zip(line_prefixes, suffixes):
        for train in domains:
            print(line_prefix + train + ' ' * (13 - len(train)), end=" ")

            human_train, generated_train = embeddings[(suffix_t, train)]
            X_train, y_train = _stack_with_labels(human_train[:n_train], generated_train[:n_train])
            clf = LogisticRegression(max_iter=1000, C=C).fit(X_train, y_train)

            for suffix_v in suffixes:
                for valid in domains:
                    X_valid, y_val = validation_sets[(suffix_v, valid)]
                    # Score once and reuse the value for both the table and the averages.
                    score = clf.score(X_valid, y_val)

                    print(format(score, '.3f'), end=" ")
                    if suffix_v != suffix_t:
                        if train == valid:
                            avg_out += score / 6.0
                        else:
                            avg_3 += score / 12.0
                    else:
                        # Same generator and same domain is in-distribution and is not averaged.
                        if train != valid:
                            avg_in += score / 12.0
            print("")
    print("Cross-domain:", avg_in, "; Cross-model: ", avg_out, "; Cross-domain&model: ", avg_3, "\n")

In [8]:
# Results for the embeddings saved with the "none" configuration suffix (no layer pruned).
print("All layers intact. Baseline RoBERTa Cross-domain accuracy: ")    
print_crossdomen_results(model_name="roberta", config_name="none", C=1)

All layers intact. Baseline RoBERTa Cross-domain accuracy: 
                             GPT-3             GPT-4o    
                       Wiki Redd. Stac.  Wiki Redd. Stac.
 GPT-3  wiki          0.988 0.593 0.965 0.500 0.323 0.495 
 GPT-3  reddit        0.507 1.000 0.973 0.775 1.000 0.828 
 GPT-3  stackexchange 0.938 0.820 0.995 0.915 0.780 0.578 
 GPT-4o wiki          0.527 0.848 0.802 1.000 0.980 0.907 
 GPT-4o reddit        0.297 0.993 0.920 0.650 0.998 0.968 
 GPT-4o stackexchange 0.733 0.770 0.667 0.890 0.953 1.000 
Cross-domain: 0.8452083333333333 ; Cross-model:  0.7108333333333333 ; Cross-domain&model:  0.7070833333333334 



In [10]:
# Results for the embeddings saved with suffix "l0", i.e. loop index i = 0 of the pruning cell.
# NOTE(review): the heading says "layer#1" - presumably it counts layers from 1 while the
# file suffix is zero-based; confirm.
print("RoBERTa, heads from layer#1 is prunned; Cross-domain accuracy: ")    
print_crossdomen_results(model_name="roberta", config_name="l0", C=1)

RoBERTa, heads from layer#1 is prunned; Cross-domain accuracy: 
                             GPT-3             GPT-4o    
                       Wiki Redd. Stac.  Wiki Redd. Stac.
 GPT-3  wiki          0.988 0.593 0.958 0.530 0.335 0.485 
 GPT-3  reddit        0.598 1.000 0.980 0.875 1.000 0.772 
 GPT-3  stackexchange 0.950 0.762 0.990 0.927 0.715 0.608 
 GPT-4o wiki          0.545 0.935 0.850 1.000 0.985 0.902 
 GPT-4o reddit        0.390 0.988 0.915 0.750 0.993 0.963 
 GPT-4o stackexchange 0.795 0.750 0.693 0.877 0.858 1.000 
Cross-domain: 0.8479166666666667 ; Cross-model:  0.7270833333333333 ; Cross-domain&model:  0.72875 



In [11]:
# Results for the embeddings saved with suffix "l3", i.e. loop index i = 3 of the pruning cell.
# NOTE(review): the heading says "layer#4" - presumably it counts layers from 1 while the
# file suffix is zero-based; confirm.
print("RoBERTa, heads from layer#4 is prunned; Cross-domain accuracy: ")    
print_crossdomen_results(model_name="roberta", config_name="l3", C=1)

RoBERTa, heads from layer#4 is prunned; Cross-domain accuracy: 
                             GPT-3             GPT-4o    
                       Wiki Redd. Stac.  Wiki Redd. Stac.
 GPT-3  wiki          0.985 0.580 0.960 0.512 0.355 0.492 
 GPT-3  reddit        0.522 1.000 0.973 0.787 1.000 0.838 
 GPT-3  stackexchange 0.932 0.850 0.993 0.917 0.812 0.632 
 GPT-4o wiki          0.532 0.875 0.772 1.000 0.980 0.907 
 GPT-4o reddit        0.325 0.985 0.905 0.672 0.995 0.963 
 GPT-4o stackexchange 0.745 0.818 0.675 0.887 0.955 0.998 
Cross-domain: 0.8485416666666667 ; Cross-model:  0.7229166666666667 ; Cross-domain&model:  0.7202083333333333 

