In [3]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-1.13.1-cp39-cp39-manylinux1_x86_64.whl (887.4 MB)
[K     |████████████████████████████████| 887.4 MB 57 kB/s  eta 0:00:0111     |█████████████████▋              | 487.2 MB 108.2 MB/s eta 0:00:04     |███████████████████             | 528.4 MB 28.3 MB/s eta 0:00:13     |███████████████████▏            | 531.5 MB 28.3 MB/s eta 0:00:13     |███████████████████▉            | 550.5 MB 28.3 MB/s eta 0:00:12
Collecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[K     |████████████████████████████████| 317.1 MB 109 kB/s eta 0:00:0101��████████              | 177.4 MB 107 kB/s eta 0:21:44
[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[K     |████████████████████████████████| 21.0 MB 140.4 MB/s eta 0:00:01
[?25hColl

In [26]:
import datasets
import transformers
import os
import pickle
import random

import pandas as pd
import datasets
from datasets import Dataset, DatasetDict

In [27]:
with open(os.path.join('../data', 'repos_as_pandas_metainfo_20230126.pkl'), 'rb') as file:
  repos = pickle.load(file)

In [28]:
repos = repos[repos.type_script=='py']

In [29]:
def standardize_types(x):
    if isinstance(x, list):
        return ','.join(x)
    else:
        return x

repos.meta_keywords = repos.meta_keywords.apply(standardize_types)

In [30]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [31]:

batch_size=4  # change to 16 for full training
encoder_max_length=512
decoder_max_length=6

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["code_script"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["meta_keywords"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch


In [32]:
repos['key_len'] = repos.meta_keywords.apply(len)
repos['code_len'] = repos.code_script.apply(len)

repos = repos[repos['code_len'] >5]
repos = repos[repos['key_len'] > 5]

In [33]:
repos['repo_name'] = repos.q_name.str.split('/', expand=True)[0]
repos['code_script'] = repos['code_script'].apply(lambda x: [line.decode() for line in x])
repos['code_script'] = repos['code_script'].apply(lambda x: ' newline '.join(x))

In [34]:
ids = list(range(repos.repo_name.nunique()))
train_idx  = random.sample(ids, 100)
test_idx = [id for id in ids if id not in train_idx]

train_repos = repos['repo_name'].unique()[train_idx]
test_repos = repos['repo_name'].unique()[test_idx]

train_data = Dataset.from_pandas(repos.loc[repos.repo_name.isin(train_repos),['code_script', 'meta_keywords']])
test_data = Dataset.from_pandas(repos.loc[repos.repo_name.isin(test_repos),['code_script', 'meta_keywords']])

In [35]:
val_data = test_data

In [36]:
# only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING
train_data = train_data.select(range(32))

train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["code_script", "meta_keywords"]
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)


# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
val_data = val_data.select(range(16))

val_data = val_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["code_script", "meta_keywords"]
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
   

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [37]:
from transformers import EncoderDecoderModel

bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("microsoft/codebert-base", "bert-base-uncased")

# set special tokens
bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
bert2bert.config.eos_token_id = tokenizer.eos_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size
bert2bert.config.max_length = 142
bert2bert.config.min_length = 56
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.9.crossattention.self.value.bias', 'bert.encoder.layer.8.crossattention.self.value.bias', 'bert.encoder.layer.10.crossattention.output.dense.bias', 'bert.encoder.layer.11.crossattention.output.LayerNorm.weight', 'bert.e

In [38]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [41]:
# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge1_precision": round(rouge_output.precision, 4),
        "rouge1_recall": round(rouge_output.recall, 4),
        "rouge1_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [42]:

# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=1000,  # set to 1000 for full training
    save_steps=16,  # set to 500 for full training
    eval_steps=4,  # set to 8000 for full training
    warmup_steps=2000,  # set to 2000 for full training
    max_steps=16, # delete for full training
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True, 
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=bert2bert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 101,
  "early_stopping": true,
  "eos_token_id": 102,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}



KeyboardInterrupt: 

In [1]:
import datasets
from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_pretrained("./checkpoint-16")
model.to("cuda")


EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [19]:
# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
test_data = test_data.select(range(16))

batch_size = 16  # change to 64 for full evaluation

# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["code_script"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["code_script"])

pred_str = results["pred"]
label_str = results["meta_keywords"]

rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

  0%|          | 0/1 [00:00<?, ?ba/s]



Score(precision=0.0, recall=0.0, fmeasure=0.0)


In [20]:
pred_str

['the the the of of of part part part section part part of of section section section part section section sections section section line section section segment section section piece section section portion part part piece piece piece part section piece piece section segment segment segment part part per per per part part segment segment section piece part part bit bit bit section section bit bit part part parts part part portion section section per per piece piece bit bit piece piece per per bit bit bits bit bit of of bit bit little little little bit bit way way way bit bitininin bit bit cut bit bit tip bit bit line bit bit,,,inin section sectioncococoinin,, way way section section cut bit section part piece bit',
 'the the the of of of lower lower lower higher higher higher lower lower upper lower higher lower higher upper higher lower upper higher higher upper upper lower upper upper higher upper lower lower high higher higher high lower lower below higher higher above higher higher