In [None]:
!pip install transformers[sentencepiece] datasets

In [2]:
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")

Downloading and preparing dataset json/huggingface-course--codeparrot-ds-train to /root/.cache/huggingface/datasets/json/huggingface-course--codeparrot-ds-train-40ca73f561bc3fca/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/huggingface-course--codeparrot-ds-train-40ca73f561bc3fca/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


In [3]:
raw_datasets = ds_train.train_test_split(train_size=0.03, seed=20)
raw_datasets.pop("test")
raw_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 16380
    })
    test: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 1821
    })
})

In [4]:
raw_datasets["validation"] = raw_datasets.pop("test")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 16380
    })
    validation: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 1821
    })
})

In [5]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

REPO_NAME: radical-experiments/AIMES-Experience
PATH: old/synapse_integration_testing/cleaned_data/experiments/plot.py
COPIES: 1
SIZE: 1132
CONTENT: import csv
import sys
import matplotlib.pyplot as plt


if sys.argv[1] == 'TTC':
    filename = 'TTC.csv'
elif sys.argv[1] == 'Tq':
    filename = 'Tq.csv'
elif sys.argv[1] == 'Tx':
    filename = 'Tx
LICENSE: mit


In [6]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Downloading:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/771k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Input IDs length: 27
Input chunk lengths: [128, 128, 128, 51, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 3]
Chunk mapping: [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 450339
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 49121
    })
})

In [8]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [9]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [12]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="gpt_code_gen",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

Using amp half precision backend


In [13]:
trainer.train()

***** Running training *****
  Num examples = 450339
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 3518


Step,Training Loss,Validation Loss
1000,3.164,2.716587
2000,2.2102,2.150288
3000,1.8594,1.891417


Saving model checkpoint to gpt_code_gen/checkpoint-500
Configuration saved in gpt_code_gen/checkpoint-500/config.json
Model weights saved in gpt_code_gen/checkpoint-500/pytorch_model.bin
tokenizer config file saved in gpt_code_gen/checkpoint-500/tokenizer_config.json
Special tokens file saved in gpt_code_gen/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 49121
  Batch size = 32
Saving model checkpoint to gpt_code_gen/checkpoint-1000
Configuration saved in gpt_code_gen/checkpoint-1000/config.json
Model weights saved in gpt_code_gen/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in gpt_code_gen/checkpoint-1000/tokenizer_config.json
Special tokens file saved in gpt_code_gen/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to gpt_code_gen/checkpoint-1500
Configuration saved in gpt_code_gen/checkpoint-1500/config.json
Model weights saved in gpt_code_gen/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in gpt_

TrainOutput(global_step=3518, training_loss=2.7161174804531356, metrics={'train_runtime': 17297.6961, 'train_samples_per_second': 52.069, 'train_steps_per_second': 0.203, 'total_flos': 5.8832709894144e+16, 'train_loss': 2.7161174804531356, 'epoch': 2.0})

In [14]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 49121
  Batch size = 32


{'eval_loss': 1.863879680633545,
 'eval_runtime': 311.2194,
 'eval_samples_per_second': 157.834,
 'eval_steps_per_second': 4.935,
 'epoch': 2.0}

In [19]:
trainer.save_model()

Saving model checkpoint to gpt_code_gen
Configuration saved in gpt_code_gen/config.json
Model weights saved in gpt_code_gen/pytorch_model.bin
tokenizer config file saved in gpt_code_gen/tokenizer_config.json
Special tokens file saved in gpt_code_gen/special_tokens_map.json


In [None]:
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation", model="gpt_code_gen", device=0
)

In [27]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
m = x + y + width # make sure it is the
