In [1]:
import numpy as np
import plotly.express as px
import pandas as pd

import torch
from torch import nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers import AutoModel

from transformers import pipeline
from transformers import DebertaV2Tokenizer, DebertaV2Model, DebertaV2ForQuestionAnswering

from datasets import load_dataset
from evaluate import evaluator
from transformers import AutoModelForSequenceClassification, pipeline

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [47]:
# test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
# test = load_dataset("yhavinga/ccmatrix", "de-en", streaming=True)["train"].take(10)
test = load_dataset("j0hngou/ccmatrix_de-en")
print([x["de"] for x in test["train"]["translation"]][:10])

Using custom data configuration j0hngou--ccmatrix_de-en-1f42eeb206fbff2d
Found cached dataset parquet (/home/christopherkang/.cache/huggingface/datasets/j0hngou___parquet/j0hngou--ccmatrix_de-en-1f42eeb206fbff2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 594.94it/s]


['Er wurde 2004 unter strengen Auflagen freigelassen.', 'Gibt es eine ideale Form?', 'Sie sind das Hauptwerk des "Naumburger Meisters", eines namentlich unbekannten, wohl aus Frankreich stammenden und dort ausgebildeten Steinbildhauers und Architekten und seines Werktrupps.', '„Ich werde dir nun zum Ausgleich für alles, was du mir gesagt hast, ein paar Informationen geben.', 'Sasha setzt sich sehr dafür ein, dass nicht-binäre Geschlechter anerkannt werden, und hatte sogar eine Online-Petition gestartet, um die Aufmerksamkeit von Präsident Obama auf das Thema zu lenken.', 'Geschäftsagilität wird durch das einfache, modulare Design und eine Vielzahl nützlicher Merkmale sichergestellt, die dazu beitragen, die Bereitstellungszeit auf wenige Stunden oder sogar nur Minuten zu reduzieren.', 'Die Damen entscheiden, mit wem sie sich paaren.', '1) Die Eigentümerstruktur japanischer Unternehmen hat sich von einem geschlossenen, Insider-basierten System zu einem offenen und wettbewerbsorientierten

## GPT2-XL

[Perplexity](https://huggingface.co/docs/transformers/perplexity)

In [10]:
torch.cuda.empty_cache()
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
# model_id = "gpt2-large"
# model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

# model = AutoModelForCausalLM.from_pretrained("gpt2-xl").to(device)
# tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")

tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M").to(device)

# parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]
# parameters_to_prune = [(m[1], "weight") for m in filter(lambda nm: hasattr(nm[1], "weight") and "mlp.c_fc" in nm[0], model.named_modules())]
# prune.global_unstructured(
#     parameters_to_prune,
#     pruning_method=prune.L1Unstructured,
#     amount=0.95,
# )

from datasets import load_dataset

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

import torch
from tqdm import tqdm

# max_length = model.config.n_positions
max_lengh = 1024
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over input tokens.
        # Multiply it with trg_len to get the summation instead of average.
        # We will take average over all the tokens to get the true average
        # in the last step of this example.
        neg_log_likelihood = outputs.loss * trg_len

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

del model

Found cached dataset wikitext (/home/christopherkang/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
Token indices sequence length is longer than the specified maximum sequence length for this model (331586 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 646/648 [01:00<00:00, 10.69it/s]


In [11]:
ppl

tensor(8096.9810, device='cuda:0')

In [None]:
## GPT2_XL_results = [14.7877, 14.7825, 36.9467, 4670.9092, OOM, OOM, OOM]
## FB_results = []

## evaluating M2M-100

In [30]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M").to(device)


In [35]:
parameters_to_prune = [m[1] for m in model.named_modules() if "fc" in m[0]]
parameters_to_prune

[Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_features=1024, bias=True),
 Linear(in_features=1024, out_features=4096, bias=True),
 Linear(in_features=4096, out_f

In [18]:
ll = [m for m in model.named_modules() if "fc" in m[0] and m[1].out_features==8192]

In [2]:
torch.cuda.empty_cache()
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"

# tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M").to(device)

tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_1.2B")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_1.2B").to(device)

# parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]
# parameters_to_prune = [(m[1], "weight") for m in filter(lambda nm: hasattr(nm[1], "weight") and "mlp.c_fc" in nm[0], model.named_modules())]

parameters_to_prune = [(m[1], "weight") for m in model.named_modules() if "fc" in m[0] and m[1].out_features==8192]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.9,
)

from datasets import load_dataset
import evaluate

# test = load_dataset("yhavinga/ccmatrix", "de-en", split="train", streaming=True).take(1000)
# 
test = load_dataset("j0hngou/ccmatrix_de-en", split="train")[:300]
tokenizer.src_lang = "de"

de_samples = [x["de"] for x in test["translation"]]
en_samples = [x["en"] for x in test["translation"]]

# encodings = tokenizer("\n\n".join([x["de"] for x in test["translation"]]), return_tensors="pt")
# out_encodings = tokenizer("\n\n".join([x["en"] for x in test["translation"]]), return_tensors="pt")

import torch
from tqdm import tqdm

bleu_out = []

all_translations = []

bleu = evaluate.load("bleu")

for idx in tqdm(range(len(de_samples))):
    tokenized = tokenizer(de_samples[idx], return_tensors="pt").to(device)
    outputs = model.generate(**tokenized, forced_bos_token_id=tokenizer.get_lang_id("en"))
    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    all_translations.append(predictions[0])
    
    if (idx % 10 == 0):
        print (predictions)
        print(en_samples[idx])
    
results = bleu.compute(predictions=all_translations, references=en_samples)
del model

Using custom data configuration j0hngou--ccmatrix_de-en-1f42eeb206fbff2d
Found cached dataset parquet (/home/christopherkang/.cache/huggingface/datasets/j0hngou___parquet/j0hngou--ccmatrix_de-en-1f42eeb206fbff2d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
  0%|          | 1/300 [00:01<06:06,  1.23s/it]

['G G G G G']
He was released in 2006 under stringent conditions.


  4%|▍         | 12/300 [00:03<00:50,  5.65it/s]

['I I I I I I I I']
Let’s look at both options and why going through a company is currently the best way to extract Bitcoins for profit.


  7%|▋         | 22/300 [00:04<00:35,  7.87it/s]

['I I I']
I’m familiar with birds.


 11%|█         | 32/300 [00:05<00:43,  6.10it/s]

['I I I I I I I I I I I']
Of course, planning is thinking through how to get where you want to go and taking precautions to ensure you don’t wind up where you don’t want to go.


 14%|█▍        | 42/300 [00:07<00:40,  6.38it/s]

['I I I I I I I']
39 At the end of the two months, she came back to her father.


 17%|█▋        | 52/300 [00:09<00:41,  6.04it/s]

['I I I I']
“So my doctors want me to continue to rest my voice.


 21%|██        | 62/300 [00:10<00:34,  6.87it/s]

['C C C C C C C C']
This particular dimension creates this element much more intensely than any other dimension physically focused.


 24%|██▍       | 72/300 [00:12<00:35,  6.44it/s]

['I I I I I I I']
The PAL photo ID card is valid for five years and shows the machine categories in which the operator has been trained.


 27%|██▋       | 82/300 [00:13<00:30,  7.16it/s]

['']
there are empty bottles


 31%|███       | 92/300 [00:15<00:34,  6.09it/s]

['I I I I I I I I']
Christopher Nolan seems incapable of directing a bad film.


 34%|███▍      | 102/300 [00:17<00:27,  7.13it/s]

['I I I I I I I']
With 2012 well on its way, many business owners are looking to maximize their business potential this year.


 37%|███▋      | 112/300 [00:18<00:32,  5.70it/s]

['I I I I I I I I']
Determination of a threat to the peace breach of the peace or act of aggression under Article 39 of the Charter


 41%|████      | 122/300 [00:20<00:30,  5.82it/s]

['I I I I I I I']
The infoBoard web application "HR Permissions" gives you an overview of all vacation requests and all requests for absenteeism of the employees in your company, which were previously submitted in "IB HRRequests".


 44%|████▍     | 132/300 [00:22<00:30,  5.42it/s]

['I I I I I I I I']
Her attitude towards me was far from the best as she admired the previous President and she was on his side.


 47%|████▋     | 142/300 [00:24<00:23,  6.63it/s]

['I I I I I I I']
“The long-term vision is, once we’ve been able to analyze all the data from CAMP, we would have a series of panels,” said David G. Amaral, one of the researchers.


 51%|█████     | 152/300 [00:25<00:24,  6.06it/s]

['I I I I I']
No one learns faster than those who teach.


 54%|█████▎    | 161/300 [00:27<00:20,  6.89it/s]

['I I I I I I']
The data protection guidelines are an integral part of these terms of use and regulate the collection, storage and use of your personal data.


 57%|█████▋    | 171/300 [00:28<00:18,  7.16it/s]

['I I I I I I']
This is a taste of what I mean by "tortured".


 60%|██████    | 181/300 [00:30<00:21,  5.53it/s]

['I I I I I I I']
But I can absolutely tell you, in the NSA world defense wins.


 64%|██████▍   | 192/300 [00:32<00:17,  6.30it/s]

['I I I I I I']
Mairo was there, washing her hair in a lavatory.


 67%|██████▋   | 202/300 [00:34<00:16,  5.79it/s]

['I I I I I I I']
Instead, you must utilize Instagram proxies and proportional strategies so as to earn money with the social network.


 70%|███████   | 211/300 [00:35<00:14,  6.35it/s]

['I I I I I I I']
And they add: "OMNINET's product portfolio is both innovative and mature.


 74%|███████▍  | 222/300 [00:37<00:12,  6.24it/s]

['']
Have an enthusiastic attitude


 77%|███████▋  | 232/300 [00:38<00:09,  6.91it/s]

['K K']
Royal Dutch Shell: claims for oil pollution in Nigeria


 81%|████████  | 242/300 [00:40<00:09,  6.41it/s]

['I I I I I I I I']
Scalping is highly delicate trading, that almost can’t be formally characterized.


 84%|████████▎ | 251/300 [00:41<00:08,  5.61it/s]

['I I I I I I I I']
Battling on this video slot, you will get an incredible dose of excitement, as the victory always brings great pleasure


 87%|████████▋ | 262/300 [00:43<00:05,  6.46it/s]

['I I I I']
Not all diseases are easily seen.


 91%|█████████ | 272/300 [00:45<00:03,  7.52it/s]

['I I I I I I']
Nel-Peters is from the South African coastal community of Sedgefield in the Western Cape province.


 94%|█████████▎| 281/300 [00:46<00:02,  7.25it/s]

['I I I I I']
The 2008 financial crisis put them out of business.


 97%|█████████▋| 292/300 [00:51<00:05,  1.33it/s]

['..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................']
… the more I like it.


100%|██████████| 300/300 [00:52<00:00,  5.71it/s]


In [22]:
t_idx = 3
de_samples[t_idx], all_translations[t_idx], en_samples[t_idx]

('„Ich werde dir nun zum Ausgleich für alles, was du mir gesagt hast, ein paar Informationen geben.',
 '“I’m going to give you some information to compensate for everything you told me.',
 '“I am giving you some information now, in return for all that you have given me.')

In [3]:
results

{'bleu': 0.0,
 'precisions': [0.011693548387096775, 0.0, 0.0, 0.0],
 'brevity_penalty': 0.31920061590833076,
 'length_ratio': 0.46686746987951805,
 'translation_length': 2480,
 'reference_length': 5312}

## BERT

In [7]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.99,
)

NUM_TRIALS = 300

# dataset = load_dataset("squad_v2")
dataset = load_dataset("squad")
all_outputs = []
all_answers = []
for idx in tqdm(range(NUM_TRIALS)):
    question, text = dataset["validation"][idx]["question"], dataset["validation"][idx]["context"]
    
    inputs = tokenizer(question, text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    
    
    {'': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}
    all_outputs.append({
        "prediction_text": tokenizer.decode(predict_answer_tokens, skip_special_tokens=True),
        "id": dataset["validation"][idx]["id"],
        "no_answer_probability": 0.,
    })
    all_answers.append({
        "answers": dataset["validation"][idx]["answers"],
        "id": dataset["validation"][idx]["id"],
    })


Found cached dataset squad (/home/christopherkang/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 996.75it/s]
100%|██████████| 300/300 [01:19<00:00,  3.79it/s]


In [10]:
all_outputs

[{'prediction_text': '',
  'id': '56be4db0acb8001400a502ec',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be4db0acb8001400a502ed',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be4db0acb8001400a502ee',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be4db0acb8001400a502ef',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be4db0acb8001400a502f0',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be8e613aeaaa14008c90d1',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be8e613aeaaa14008c90d2',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56be8e613aeaaa14008c90d3',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56bea9923aeaaa14008c91b9',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56bea9923aeaaa14008c91ba',
  'no_answer_probability': 0.0},
 {'prediction_text': '',
  'id': '56bea9923aeaaa14008c91bb',

In [8]:
from evaluate import load

squad_v2_ = load("squad_v2")
results = squad_v2_.compute(predictions=all_outputs, references=all_answers)

In [9]:
results

{'exact': 0.0,
 'f1': 0.0,
 'total': 300,
 'HasAns_exact': 0.0,
 'HasAns_f1': 0.0,
 'HasAns_total': 300,
 'best_exact': 0.0,
 'best_exact_thresh': 0.0,
 'best_f1': 0.0,
 'best_f1_thresh': 0.0}

In [59]:
from evaluate import load
squad_metric = load("squad_v2")
predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}, {'prediction_text': 'Beyonce', 'id': '56d2051ce7d4791d0090260b', 'no_answer_probability': 0.},  {'prediction_text': 'climate change', 'id': '5733b5344776f419006610e1', 'no_answer_probability': 0.}]
references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}, {'answers': {'answer_start': [233], 'text': ['Beyoncé and Bruno Mars']}, 'id': '56d2051ce7d4791d0090260b'}, {'answers': {'answer_start': [891], 'text': ['climate change']}, 'id': '5733b5344776f419006610e1'}]
results = squad_metric.compute(predictions=predictions, references=references)
results
{'exact': 66.66666666666667, 'f1': 66.66666666666667, 'total': 3, 'HasAns_exact': 66.66666666666667, 'HasAns_f1': 66.66666666666667, 'HasAns_total': 3, 'best_exact': 66.66666666666667, 'best_exact_thresh': 0.0, 'best_f1': 66.66666666666667, 'best_f1_thresh': 0.0}

{'exact': 66.66666666666667,
 'f1': 66.66666666666667,
 'total': 3,
 'HasAns_exact': 66.66666666666667,
 'HasAns_f1': 66.66666666666667,
 'HasAns_total': 3,
 'best_exact': 66.66666666666667,
 'best_exact_thresh': 0.0,
 'best_f1': 66.66666666666667,
 'best_f1_thresh': 0.0}

## Twitter 

[TwHIN-BERT](https://huggingface.co/Twitter/twhin-bert-large?text=The+goal+of+life+is+%3Cmask%3E.)

[Paper](https://arxiv.org/pdf/2209.07562.pdf)

benchmark -- 

In [7]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('Twitter/twhin-bert-large')
model = BertForMaskedLM.from_pretrained('Twitter/twhin-bert-large')
inputs = tokenizer("I'm using TwHIN-BERT! #TwHIN-BERT #NLP", return_tensors="pt")
outputs = model(**inputs)


Downloading:  68%|██████▊   | 1.53G/2.25G [03:34<01:40, 7.12MB/s]


In [8]:
tokenizer.decode(outputs)

TypeError: argument 'ids': 'dict' object cannot be converted to 'Sequence'

In [18]:
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("Twitter/twhin-bert-large")

# model = AutoModelForMaskedLM.from_pretrained("Twitter/twhin-bert-large")



from transformers import BertTokenizer, BertForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = BertForMaskedLM.from_pretrained("bert-large-cased")

inputs = tokenizer("The [MASK] is falling!", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'city'

In [14]:
mask_token_index

tensor([], dtype=torch.int64)

## DeBERTa

[Q&A](https://huggingface.co/docs/transformers/tasks/question_answering)

[Q&A eval](https://huggingface.co/course/chapter7/7?fw=tf)

-- had problems just evaluating the model, wasn't satisfied with performance, so switching to a lower parameter model. 

In [26]:
from transformers import DebertaV2Tokenizer, DebertaV2ForQuestionAnswering
import torch

# tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xxlarge")
tokenizer = DebertaV2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-deberta-v2")
model = DebertaV2ForQuestionAnswering.from_pretrained("microsoft/deberta-v2-xxlarge", force_download=True)

dataset = load_dataset("squad")
print(dataset["train"][0])


t_idx = 1
# question, text = dataset["train"][t_idx]["question"], dataset["train"][t_idx]["context"]
# print(dataset["train"][t_idx]["answers"])
question, text = "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
print(tokenizer.decode(predict_answer_tokens))

Downloading: 100%|██████████| 633/633 [00:00<00:00, 1.45MB/s]
Some weights of the model checkpoint at microsoft/deberta-v2-xxlarge were not used when initializing DebertaV2ForQuestionAnswering: ['deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForQuestionAnswering were not initialized from 

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}



In [31]:
from transformers import DebertaV2Tokenizer, DebertaV2ForMaskedLM
import torch

# tokenizer = DebertaV2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-deberta-v2")
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xxlarge")
# model = DebertaV2ForMaskedLM.from_pretrained("hf-internal-testing/tiny-random-deberta-v2")
model = DebertaV2ForMaskedLM.from_pretrained("microsoft/deberta-v2-xxlarge", force_download=True)

inputs = tokenizer("The capital of France is [MASK].", return_tensors = "pt")
# inputs = tokenizer("I like to put [MASK] and mustard on my hot dog.", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading: 100%|██████████| 633/633 [00:00<00:00, 1.51MB/s]
Some weights of the model checkpoint at microsoft/deberta-v2-xxlarge were not used when initializing DebertaV2ForMaskedLM: ['deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas

'LTO'

In [23]:
answer_start_index

tensor(127)

In [11]:
from transformers import AutoModelForQuestionAnswering, DebertaTokenizerFast
model = AutoModelForQuestionAnswering.from_pretrained("microsoft/deberta-v2-xxlarge", )

def preprocess_validation_examples(examples, max_length=100, stride=50):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

## taken from HF

raw_datasets = load_dataset("squad")
small_eval_set = raw_datasets["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

# tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)


eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping", "token_type_ids"])
eval_set_for_model.set_format("numpy")

batch = {k: eval_set_for_model[k] for k in eval_set_for_model.column_names}
print(batch)

# outputs = model(**batch)
outputs = model(np.expand_dims(batch["input_ids"][0]), batch["attention_mask"][0])

import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})


Some weights of the model checkpoint at microsoft/deberta-v2-xxlarge were not used when initializing DebertaV2ForQuestionAnswering: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v2-xxlarge and are n

{'input_ids': array([[    1, 32251,  1485, ...,     4,   287,     2],
       [    1, 32251,  1485, ...,  1582,  2616,     2],
       [    1, 32251,  1485, ...,     0,     0,     0],
       ...,
       [    1,  2264,    80, ...,     0,     0,     0],
       [    1, 12375,  2308, ...,     0,     0,     0],
       [    1, 12375, 27724, ...,     4,     2,     0]]), 'attention_mask': array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 0]])}





TypeError: _expand_dims_dispatcher() missing 1 required positional argument: 'axis'

In [10]:
print(batch["attention_mask"][0])

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [22]:
[**batch]

SyntaxError: invalid syntax (<ipython-input-22-8ed7ae7a61c1>, line 1)

In [None]:
import evaluate

metric = evaluate.load("squad")
performance = metric.compute(predictions=predicted_answers, references=theoretical_answers)

## everything else

In [None]:
params = model.state_dict()

In [None]:
size = params["transformer.h.46.attn.bias"].reshape((-1)).size()[0]
sum(np.isclose(params["transformer.h.46.attn.bias"].reshape((-1)), np.zeros(size)))

In [None]:
def assess_sparsity(model):
    params = model.state_dict()
    out = pd.DataFrame(index = params.keys(), columns=["type", "weights", "zero", "%"])

    for key, value in params.items():
    # print(key)
        flattened = value.reshape((-1))
        size = flattened.size()[0]

        type_of_arr = str(key).split(".")[-1]

        out.loc[key, "type"] = type_of_arr
        out.loc[key, "weights"] = size
        out.loc[key, "zero"] = np.count_nonzero(np.isclose(flattened, np.zeros(size), atol=1e-2))

    out["%"] = out["zero"] / out["weights"]
    return out

  ## for weights in model
  ## sparsity["layer_name"] = sparsity_fn(weights)
  ## return sparsity

In [None]:
xl_sparsity = assess_sparsity(model)

In [None]:
xl_sparsity

## Models
We are looking for three HuggingFace models all with >1B parameters. For each of the categories:
* E-o: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://huggingface.co/microsoft/deberta-v2-xxlarge)
* D-o: [GPT2-XL](https://huggingface.co/gpt2-xl)
* E-D: [Facebook / M2M100](https://huggingface.co/facebook/m2m100_418M)

In [None]:
xl_sparsity_2 = assess_sparsity(model)

In [None]:
xl_sparsity_2

prune

In [None]:
parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.1,
)

In [None]:
tokenizer.push_to_hub("gpt2-xl-10")
model.push_to_hub("gpt2-xl-10")

In [None]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

generator(
    "My name is Julien and I like to", )

In [11]:
LIST_OF_MODELS = ["gpt2-xl", "microsoft/deberta-v2-xxlarge", "facebook/m2m100_418M"]
def produce_model(name):
    if name == "gpt2-xl":
        tokenizer = AutoTokenizer.from_pretrained("gpt2-xl", force_download=True)
        model = AutoModelForCausalLM.from_pretrained("gpt2-xl", force_download=True)
        
        return tokenizer, model
        
    elif name == "microsoft/deberta-v2-xxlarge":
        model = AutoModel.from_pretrained("microsoft/deberta-v2-xxlarge", force_download=True)
        return _, model
    
    elif name == "facebook/m2m100_418M":
        tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M", force_download=True)
        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M", force_download=True)

        return tokenizer, model
    
    assert 1 == 0, "Unknown name!"

In [9]:
for sparsity in np.array([10, 50, 90, 95, 99]) / 100:
    print(sparsity)

0.1
0.5
0.9
0.95
0.99


In [None]:
SPARSITY = np.array([10, 50, 90, 95, 99]) / 100
# SPARSITY = np.array([99]) / 100
MODEL = LIST_OF_MODELS[0]

for sparsity in SPARSITY:
    tokenizer, model = produce_model(MODEL)
    
    parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]

    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=sparsity,
    )
    
    repo_header = MODEL.split("/")[-1]
    
    generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

    print(generator("My name is Julien and I like to", ))
    
    if tokenizer:
        tokenizer.push_to_hub(f"b_{repo_header}_{int(sparsity * 100)}")

    model.push_to_hub(f"b_{repo_header}_{int(sparsity * 100)}")
    
    del tokenizer
    del model
    del parameters_to_prune
    
    print(f"{sparsity} done!")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "My name is Julien and I like to write on stories that are interesting for me, like I have several stories that I'm reading right now where there are a lot of girls who are doing the same thing as me, but nobody was interested in"}]
0.1 done!


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My name is Julien and I like to English Author Author User Apprentice Senior Teacher Independent\n Woman One Other Year Writer Year Woman Year Woman Woman World Word Author Other Teacher Teacher Teacher Self Teacher Teacher Teacher Teacher Teacher Teacher Teacher Teacher U Teacher Writer\n Year Teacher'}]
0.5 done!


In [None]:
## grab model
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")

## prune
parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.4,
)

## test
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

print(generator("My name is Julien and I like to", ))

In [8]:
tokenizer.push_to_hub("test_b")
model.push_to_hub("test_b")

CommitInfo(commit_url='https://huggingface.co/ctkang/test_b/commit/2472028eb83ca95a7478d7282addfd3bfd53b5e6', commit_message='Upload model', commit_description='', oid='2472028eb83ca95a7478d7282addfd3bfd53b5e6', pr_url=None, pr_revision=None, pr_num=None)

## Evaluation
The following sections can be used to evaluate the models

In [27]:
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
# [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]

[GPT2LMHeadModel(
   (transformer): GPT2Model(
     (wte): Embedding(50257, 1600)
     (wpe): Embedding(1024, 1600)
     (drop): Dropout(p=0.1, inplace=False)
     (h): ModuleList(
       (0): GPT2Block(
         (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
         (attn): GPT2Attention(
           (c_attn): Conv1D()
           (c_proj): Conv1D()
           (attn_dropout): Dropout(p=0.1, inplace=False)
           (resid_dropout): Dropout(p=0.1, inplace=False)
         )
         (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
         (mlp): GPT2MLP(
           (c_fc): Conv1D()
           (c_proj): Conv1D()
           (act): NewGELUActivation()
           (dropout): Dropout(p=0.1, inplace=False)
         )
       )
       (1): GPT2Block(
         (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
         (attn): GPT2Attention(
           (c_attn): Conv1D()
           (c_proj): Conv1D()
           (attn_dropout): Dropout(p=0.1, inplace=Fals

In [43]:
[name for (name, module) in model.named_modules()]

['',
 'transformer',
 'transformer.wte',
 'transformer.wpe',
 'transformer.drop',
 'transformer.h',
 'transformer.h.0',
 'transformer.h.0.ln_1',
 'transformer.h.0.attn',
 'transformer.h.0.attn.c_attn',
 'transformer.h.0.attn.c_proj',
 'transformer.h.0.attn.attn_dropout',
 'transformer.h.0.attn.resid_dropout',
 'transformer.h.0.ln_2',
 'transformer.h.0.mlp',
 'transformer.h.0.mlp.c_fc',
 'transformer.h.0.mlp.c_proj',
 'transformer.h.0.mlp.act',
 'transformer.h.0.mlp.dropout',
 'transformer.h.1',
 'transformer.h.1.ln_1',
 'transformer.h.1.attn',
 'transformer.h.1.attn.c_attn',
 'transformer.h.1.attn.c_proj',
 'transformer.h.1.attn.attn_dropout',
 'transformer.h.1.attn.resid_dropout',
 'transformer.h.1.ln_2',
 'transformer.h.1.mlp',
 'transformer.h.1.mlp.c_fc',
 'transformer.h.1.mlp.c_proj',
 'transformer.h.1.mlp.act',
 'transformer.h.1.mlp.dropout',
 'transformer.h.2',
 'transformer.h.2.ln_1',
 'transformer.h.2.attn',
 'transformer.h.2.attn.c_attn',
 'transformer.h.2.attn.c_proj',
 'tran

In [62]:
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
xx = [(m[1], "weight") for m in filter(lambda nm: hasattr(nm[1], "weight") and "mlp" in nm[0], model.named_modules())]

In [64]:
del model

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.66 GiB (GPU 0; 39.59 GiB total capacity; 26.08 GiB already allocated; 2.37 GiB free; 36.01 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [70]:
# del model
# model = AutoModelForCausalLM.from_pretrained("gpt2-xl").to(device)
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 6            |        cudaMalloc retries: 8         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   31322 MB |   39610 MB |   18772 GB |   18741 GB |
|       from large pool |   31315 MB |   39547 MB |   18639 GB |   18609 GB |
|       from small pool |       7 MB |     108 MB |     132 GB |     132 GB |
|---------------------------------------------------------------------------|
| Active memory         |   31322 MB |   39610 MB |   18772 GB |   18741 GB |
|       from large pool |   31315 MB |   39547 MB |   18639 GB |   18609 GB |
|       from small pool |       7 MB |     108 MB |     132 GB |     132 GB |
|---------------------------------------------------------------

In [3]:
ppl

tensor(14.7911, device='cuda:0')

### GPT2-XL

In [4]:
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
task_evaluator = evaluator("text-classification")

MODEL = "gpt2-xl"

tokenizer, model = produce_model(MODEL)

## pruning
parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=sparsity,
)

## evaluation
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

print(generator("My name is Julien and I like to", ))

model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb")
tokenizer = AutoTokenizer.from_pretrained("lvwerra/distilbert-imdb")

eval_results = task_evaluator.compute(
    model_or_pipeline=model,
    tokenizer=tokenizer,
    data=data,
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
)

del tokenizer
del model
del parameters_to_prune

Found cached dataset imdb (/home/christopherkang/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Loading cached shuffled indices for dataset at /home/christopherkang/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b23cfeb68a931a8d.arrow


### DeBERTa

In [2]:
data = load_dataset("squad", split="validation").shuffle(seed=42).select(range(50))
task_evaluator = evaluator("question-answering")

# MODEL = "gpt2-xl"

# tokenizer, model = produce_model(MODEL)

# ## pruning
# parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]

# prune.global_unstructured(
#     parameters_to_prune,
#     pruning_method=prune.L1Unstructured,
#     amount=sparsity,
# )

# ## evaluation
# generator = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

# print(generator("My name is Julien and I like to", ))


tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
# model = DebertaV2Model.from_pretrained("microsoft/deberta-v2-xlarge")
model = DebertaV2ForQuestionAnswering.from_pretrained("microsoft/deberta-v2-xlarge")

eval_results = task_evaluator.compute(
    model_or_pipeline=model,
    tokenizer=tokenizer,
    data=data
)

del tokenizer
del model
# del parameters_to_prune

Found cached dataset squad (/home/christopherkang/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Loading cached shuffled indices for dataset at /home/christopherkang/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-c13da8c69afb5b41.arrow
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v2-xlarge were not used when initializing DebertaV2ForQuestionAnswering: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model trained on another task o

In [3]:
eval_results

{'exact_match': 0.0,
 'f1': 6.4492063492063485,
 'total_time_in_seconds': 13.478586913988693,
 'samples_per_second': 3.7095876829719976,
 'latency_in_seconds': 0.2695717382797738}

In [28]:
## sample code from Madison
debert = DebertaV2Model.from_pretrained("microsoft/deberta-v2-xlarge")
debert_tok = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
inputs = debert_tok("A step by step recipe to make bolognese pasta:", return_tensors="pt")
outputs = debert(**inputs)
last_hidden_states = outputs.last_hidden_state
debert_tok.decode(outputs)

Some weights of the model checkpoint at microsoft/deberta-v2-xlarge were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ValueError: invalid literal for int() with base 10: 'last_hidden_state'

Found cached dataset imdb (/home/christopherkang/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Loading cached shuffled indices for dataset at /home/christopherkang/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b23cfeb68a931a8d.arrow


Exception: Impossible to guess which tokenizer to use. Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer.

BaseModelOutput(last_hidden_state=tensor([[[-0.6520, -5.2399, -0.7639,  ..., -0.3622,  1.1090, -0.5299],
         [-0.5814,  1.3496,  0.2630,  ...,  0.1930,  1.5334, -0.6259],
         [-0.2318,  2.7247, -0.2569,  ...,  0.4096,  1.1944,  0.3863],
         ...,
         [-0.1746, -0.1909, -0.1213,  ..., -0.0068,  0.2581,  0.5316],
         [-0.8192, -1.7572,  0.2999,  ...,  0.9059, -0.2985, -0.5376],
         [-1.1186, -4.5928, -0.8893,  ..., -0.1412,  1.2751, -0.4949]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [21]:
eval_results

{'exact_match': 0.0,
 'f1': 7.0,
 'total_time_in_seconds': 22.872378915082663,
 'samples_per_second': 0.43720856659145896,
 'latency_in_seconds': 2.287237891508266}

In [None]:
LIST_OF_MODELS = ["gpt2-xl", "microsoft/deberta-v2-xxlarge", "facebook/m2m100_418M"]


## garbage

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ctkang/b_gpt2-xl_10")
model = AutoModelForCausalLM.from_pretrained("ctkang/b_gpt2-xl_10")

Downloading: 100%|██████████| 258/258 [00:00<00:00, 244kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 11.1MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 5.43MB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 15.6MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 98.6kB/s]
Downloading: 100%|██████████| 975/975 [00:00<00:00, 401kB/s]
Downloading: 100%|██████████| 75.5k/75.5k [00:00<00:00, 2.28MB/s]
Downloading: 100%|██████████| 10.0G/10.0G [02:07<00:00, 78.7MB/s]
Downloading: 100%|██████████| 3.15G/3.15G [00:41<00:00, 76.0MB/s]
Some weights of the model checkpoint at ctkang/b_gpt2-xl_10 were not used when initializing GPT2LMHeadModel: ['transformer.h.37.mlp.c_fc.weight_orig', 'transformer.h.32.attn.c_proj.weight_orig', 'transformer.h.3.attn.c_attn.weight_mask', 'transformer.h.35.attn.c_proj.weight_orig', 'transformer.h.5.mlp.c_fc.weight_orig', 'transformer.h.20.ln_1.weight_mask', 'transformer.h.1.mlp.c_fc.weight_mask', 'transformer.h.10.attn.

In [4]:
tokenizer = AutoTokenizer.from_pretrained("ctkang/test_gpt_xl")
model = AutoModelForCausalLM.from_pretrained("ctkang/test_gpt_xl")

Downloading: 100%|██████████| 258/258 [00:00<00:00, 251kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 7.99MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 5.95MB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 16.2MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 102kB/s]


In [3]:
from transformers import pipeline

generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

generator(
    "My name is Julien and I like to", )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My name is Julien and I like to store integratedTalkfutureousy plasticabil skirtshesis scientSelf rupt skirts REG integrated hangingroximatelyNitIVERS BearsReally peasantsusk reuniteduskStock � hitterspe Draw diplomat hitters occasionallyifty neededhesis hanging Grahamprofessionalauntlet Null'}]

In [5]:
parameters_to_prune = [(m, "weight") for m in filter(lambda m: hasattr(m, "weight"), model.modules())]

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.10,
)

In [8]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

generator("My name is Julien and I like to", )

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My name is Julien and I like to make funny games. As an avid fan of all things comic book, I have a lot of ideas for various characters, story branches, etc., but that takes time and money away from making the games I'}]

In [None]:
tokenizer.push_to_hub("test_gpt_xl")
model.push_to_hub("test_gpt_xl")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ctkang/test_gpt_xl")
model = AutoModelForCausalLM.from_pretrained("ctkang/test_gpt_xl")

In [None]:
from transformers import pipeline
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

generator(
    "My name is Julien and I like to", )