In [63]:
# !pip install torchdata==0.5.1

In [64]:
#!pip install datasets

In [65]:
#!pip install transformers

In [66]:
#!pip install accelerate

In [None]:
# !python -m spacy download en_core_web_md

In [6]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F

import random, math, time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# making our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda


In [7]:
torch.cuda.get_device_name(0)

'Tesla T4'

In [8]:
torch.__version__

'1.13.1+cu116'

In [9]:
torchtext.__version__

'0.14.1'

# 1. Loading Data

In [10]:
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [11]:
filters = ["numpy", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

print(
    any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)
)

True False


In [12]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [13]:
from datasets import load_dataset, DatasetDict

# using "github jupyter code to text" dataset

ds_train = load_dataset("codeparrot/github-jupyter-code-to-text", split="train")
ds_valid = load_dataset("codeparrot/github-jupyter-code-to-text", split="test")

raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(5000)),
        "valid": ds_valid.shuffle().select(range(500))
    }
)

raw_datasets

Downloading readme:   0%|          | 0.00/857 [00:00<?, ?B/s]

Downloading and preparing dataset parquet/codeparrot--github-jupyter-code-to-text to /root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.9M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.




DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'license', 'content'],
        num_rows: 5000
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'license', 'content'],
        num_rows: 500
    })
})

In [14]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

REPO_NAME: judithfan/graphcomm
PATH: experiments/recog/preprocess_sketches.ipynb
LICENSE: mit
CONTENT: upload_dir = './sketch'

import boto
runThis = 0
if runThis:
    conn = boto.connect_s3()
    b = conn.create_bucket('sketchpad_basic_pilot2_sketches')
    all_files = [i for i in os.listdir(upload_di


# 2. Preprocessing

In [15]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Input IDs length: 25
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 87]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [16]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 130650
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 12463
    })
})

# 3. Modeling

In [34]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# adding the EOS token as PAD token to avoid warnings
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [35]:
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [36]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

Keyword has not single token: plt
Keyword has not single token: predict
Keyword has not single token:  plt
Keyword has not single token:  pd
Keyword has not single token: testtest


## Loss

In [37]:
from torch.nn import CrossEntropyLoss
import torch

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # shifting so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # calculating per-token loss
    loss_fct = CrossEntropyLoss(reduce=False) #change to reduction=None
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # resizing and calculating average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # calculating and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # calculating weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

## DataLoaders

In [38]:
from torch.utils.data.dataloader import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader  = DataLoader(tokenized_datasets["valid"], batch_size=32)

## Optimizer

In [39]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [40]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
            outputs.loss = outputs.loss.reshape(1)
        losses.append(accelerator.gather(outputs.loss))        
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [41]:
# model = GPT2LMHeadModel(config)

In [42]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

## Accelerator

In [43]:
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision='fp16')

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [44]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

## Repository

In [28]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [29]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "codeparrot-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'SakibBinAlam/codeparrot-accelerate'

In [30]:
# os.environ["TOKENIZERS_PARALLELISM"] = "true"

output_dir = "codeparrot-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/SakibBinAlam/codeparrot-accelerate into local empty directory.


## Greedy Search

In [48]:
# encoding context, the generation is conditioned on
input_ids = tokenizer.encode('import tensorflow', return_tensors='pt').to(device)

# generating text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
import tensorflow.core.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow.TensorFlow


## Beam Search

In [49]:
# activating beam search and early_stopping
beam_output = model.generate(
    input_ids,  
    max_length=50, 
    num_beams=5, 
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
import tensorflow

import tensorflow

import tensorflow

import tensorflow

import tensorflow

import tensorflow

import tensorflow

import tensorflow

import tens


In [50]:
# setting no_repeat_ngram_size to 2
beam_output = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
import tensorflow.core.TensorFlow.

In this example, we will use the following code to create a class that will be used in the next step of the tutorial:
.class("tensorflow") class Tensor


In [51]:
# setting return_num_sequences > 1
beam_outputs = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    num_return_sequences=5, 
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: import tensorflow.core.TensorFlow.

In this example, we will use the following code to create a class that will be used in the next step of the tutorial:
.class("tensorflow") class Tensor
1: import tensorflow.core.TensorFlow.

In this example, we will use the following code to create a class that will be used in the next step of the tutorial:
.class("tensorflow").class(
2: import tensorflow.core.TensorFlow.

In this example, we will use the following code to create a class that will be used in the next step of the tutorial:
.class("tensorflow-tutorial")
3: import tensorflow.core.TensorFlow.

In this example, we will use the following code to create a class that will be used in the next step of the tutorial:
.class("tensorflow").class('
4: import tensorflow.core.TensorFlow.

In this example, we will use the following code to create a class that will be used in the next step of

# 4. Training

In [52]:
evaluate()

(8.684147834777832, 5908.50341796875)

In [53]:
num_train_epochs

1

In [54]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

  0%|          | 0/4083 [00:00<?, ?it/s]



{'steps': 12, 'loss/train': 74.33778381347656}
{'steps': 24, 'loss/train': 71.8944320678711}
{'steps': 37, 'loss/train': 61.7232666015625}
{'steps': 49, 'loss/train': 56.108734130859375}
{'steps': 62, 'loss/train': 54.38740921020508}
{'steps': 74, 'loss/train': 49.46559143066406}
{'steps': 87, 'loss/train': 52.197227478027344}
{'steps': 99, 'loss/train': 48.116241455078125}
{'steps': 112, 'loss/train': 45.33609390258789}
{'steps': 124, 'loss/train': 41.40361022949219}
{'steps': 137, 'loss/train': 40.932159423828125}
{'steps': 149, 'loss/train': 38.43894577026367}
{'steps': 162, 'loss/train': 40.00958251953125}
{'steps': 174, 'loss/train': 42.479736328125}
{'steps': 187, 'loss/train': 44.37135696411133}
{'steps': 199, 'loss/train': 43.79956817626953}
{'steps': 212, 'loss/train': 40.84946060180664}
{'steps': 224, 'loss/train': 37.814544677734375}
{'steps': 237, 'loss/train': 33.93517303466797}
{'steps': 249, 'loss/train': 34.02598571777344}
{'steps': 262, 'loss/train': 33.65303039550781}

In [55]:
# saving config
model.config.to_json_file("config.json")

In [56]:
model.save_pretrained('SakibBinAlam/codeparrot-accelerate')

# 5. Testing/Inference

In [57]:
import torch
from transformers import pipeline

pipe = pipeline("text-generation", max_length=100, pad_token_id=0, eos_token_id=0, model="SakibBinAlam/codeparrot-accelerate", tokenizer=tokenizer)

In [59]:
code = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(code, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
p(asnp/�class Strl�ou jump�$For = circ sur Curart P custom� km��ivelyuss Pages tart/�o`asons` Police� cent�$Qu tartiredartom Blvelampleasub/�o` partners` Police


In [60]:
code = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(code, num_return_sequences=1)[0]["generated_text"])


# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
ice random� thlfior- Pas sur t mon g� P pers`ustom`ince over�class Strl�ou jump�ionub`pe`mon)ack- May ap where- investigter�� weeks mon�onyill pers =ily gack S


In [61]:
code = """
# import tensorflow
import tensorflow
"""
print(pipe(code, num_return_sequences=1)[0]["generated_text"])


# import tensorflow
import tensorflow
#import twb0ack`ort#lfried- tvel Sodeameource sometas mov t toofer S t South;�class Strl�ou jump� used Corbyn/ FBI`read Al approaching advertised��Y� approaching advertised/ow` meaningay�z� approaching advertised/ow` meaning)Y/Tr*��ony,")Z*�onyall psychology save listerMPe[\
