In [None]:
!pip install datasets transformers evaluate

In [None]:
!pip install tqdm

In [None]:
from datasets import load_dataset

# Open the requested CodeParrot split with streaming=True.
split = "train"
dataset_id = f"transformersbook/codeparrot-{split}"
data = load_dataset(dataset_id, split=split, streaming=True)


In [None]:
def any_keyword_in_string(keywords, string):
  """Report whether at least one entry of `keywords` satisfies `keyword in string`.

  Args:
    keywords: iterable of candidate keywords.
    string: text to search.

  Returns:
    True on the first keyword found in `string`, False if none (or no keywords).
  """
  return any(keyword in string for keyword in keywords)

In [None]:
# Try the helper on one line that contains none of the keywords and one that does.
filters = ["sklearn", "pandas", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

matches_1 = any_keyword_in_string(filters, example_1)
matches_2 = any_keyword_in_string(filters, example_2)
print(matches_1, matches_2)

In [None]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
  """Collect every sample whose "content" field contains any of `filters`.

  Args:
    dataset: iterable of dict-like samples, each providing a "content" entry.
    filters: keywords handed to `any_keyword_in_string`.

  Returns:
    A `Dataset` built from the per-column lists of all matching samples.
  """
  filtered_dict = defaultdict(list)
  total = 0
  for sample in tqdm(iter(dataset)):
    total += 1
    if any_keyword_in_string(filters, sample["content"]):
      # Accumulate column-wise so the result can be fed to Dataset.from_dict.
      for k, v in sample.items():
        filtered_dict[k].append(v)
  kept = len(filtered_dict["content"])
  # An empty stream leaves total at 0; report 0% instead of dividing by zero.
  ratio = kept / total if total else 0.0
  print(f"{ratio:.2%} of data after filtering.")
  return Dataset.from_dict(filtered_dict)

In [None]:
# Library names used as filter keywords.
filters = [
    "sklearn",
    "pandas",
    "matplotlib",
    "seaborn",
]
# The streaming filter call is left disabled here:
# filtered_data = filter_streaming_dataset(data, filters)

The filtered dataset is already available on the Hub, so the filtering step above is skipped and the data is loaded directly below.

In [None]:
from datasets import load_dataset, DatasetDict

# Load the train and validation splits from the Hub.
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Fixed seed so the sampled subsets are identical on every Restart & Run All.
SHUFFLE_SEED = 42

# Work on a shuffled 50,000 / 500 example subset of each split.
raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle(seed=SHUFFLE_SEED).select(range(50000)),
        "valid": ds_valid.shuffle(seed=SHUFFLE_SEED).select(range(500)),
    }
)

raw_datasets

In [None]:
from transformers import AutoTokenizer

# Token budget passed as max_length when tokenizing below.
context_length = 128

tokenizer_checkpoint = "huggingface-course/code-search-net-tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize the first two training files, asking for overflow chunks and per-chunk lengths.
sample_texts = raw_datasets["train"][:2]["content"]
outputs = tokenizer(
    sample_texts,
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

In [None]:
# Show how many chunks were produced, their lengths, and which sample each came from.
print(
    f"Input IDs length: {len(outputs['input_ids'])}",
    f"Input chunk lengths: {(outputs['length'])}",
    f"Chunk mapping: {outputs['overflow_to_sample_mapping']}",
    sep="\n",
)

In [None]:
def tokenize(elements):
  """Tokenize a batch and keep only the chunks that are exactly `context_length` long.

  Args:
    elements: batch mapping with a "content" entry holding the texts.

  Returns:
    Dict with a single "input_ids" list containing the retained chunks.
  """
  encoded = tokenizer(
      elements["content"],
      truncation=True,
      max_length=context_length,
      return_overflowing_tokens=True,
      return_length=True,
  )
  # Chunks whose reported length differs from context_length are dropped.
  full_chunks = [
      ids
      for length, ids in zip(encoded["length"], encoded["input_ids"])
      if length == context_length
  ]
  return {"input_ids": full_chunks}

# Apply tokenize in batches and drop all of the original columns from the result.
original_columns = raw_datasets["train"].column_names
tokenized_dataset = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=original_columns,
)

tokenized_dataset

In [None]:
# tokenized_dataset.save_to_disk("tokenized_dataset")

In [None]:
# from datasets import load_from_disk

# tokenized_dataset = load_from_disk("tokenized_dataset")
# tokenized_dataset

In [None]:
from transformers import AutoConfig, TFGPT2LMHeadModel

# Start from the "gpt2" configuration and override the fields tied to our tokenizer.
# NOTE(review): recent transformers releases appear to size GPT-2 position
# embeddings from n_positions rather than n_ctx — confirm n_ctx has the intended effect.
config_overrides = dict(
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
config = AutoConfig.from_pretrained("gpt2", **config_overrides)

# Build the model from the config alone (no from_pretrained call is made for the model).
model = TFGPT2LMHeadModel(config)

In [None]:
# Call the model once on its own dummy inputs before requesting the summary.
# presumably this builds the weights so summary() can report them — TODO confirm
model(model.dummy_inputs)
model.summary()

In [None]:
from transformers import  DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False disables masked-language-modelling in the collator;
# return_tensors="tf" asks for TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

In [None]:
# Both splits share the collator and batch size; only shuffling differs.
shared_kwargs = {"collate_fn": data_collator, "batch_size": 32}

tf_train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"], shuffle=True, **shared_kwargs
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_dataset["valid"], shuffle=False, **shared_kwargs
)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# presumably required by the PushToHubCallback used during training — TODO confirm
notebook_login()

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# One pass over the training dataset: the schedule length is the number of batches.
num_train_steps = len(tf_train_dataset)
# create_optimizer returns the optimizer together with its learning-rate schedule.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=100,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile(); only the optimizer is supplied.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the policy is set after the model was created and compiled; Keras
# documents the global policy as applying to layers created afterwards — confirm
# that this line actually takes effect for `model`.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback configured with the "codeparrot-ds" output directory and our tokenizer.
callback = PushToHubCallback(output_dir="codeparrot-ds", tokenizer=tokenizer)

# Train with the callback attached, validating on tf_eval_dataset; no epochs argument is passed.
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])

Cloning https://huggingface.co/Thabet/codeparrot-ds into local empty directory.


 1669/42582 [>.............................] - ETA: 13:31:25 - loss: 5.4548