# Install and import required libraries

In [2]:
!pip install accelerate transformers einops datasets peft bitsandbytes --upgrade



In [3]:
import torch

In [4]:
import os
from datasets import load_dataset, DatasetDict, Dataset

# On-disk cache location for the downloaded datasets.
local_dataset_dir = "local_datasets"
# When True, a small synthetic corpus is used instead of the real data.
test = True

if test:
    # Synthetic corpus: three training strings repeated 500 times and
    # two validation strings repeated 50 times.
    test_data = {
        "train": {
            "content": ["Sample text 1" * 60, "Sample text 2" * 50, "Sample text 3"] * 500,
        },
        "valid": {
            "content": ["Sample text 4", "Sample text 5" * 70] * 50,
        },
    }

    # Wrap each split's columns in a Dataset and bundle them together.
    raw_datasets = DatasetDict(
        {split: Dataset.from_dict(columns) for split, columns in test_data.items()}
    )
    print("use test data")
elif os.path.exists(local_dataset_dir):
    # A cached copy exists locally: read it back instead of downloading.
    raw_datasets = DatasetDict.load_from_disk(local_dataset_dir)
    print("Datasets loaded from the local directory.")
else:
    # No cache yet: fetch both splits from the Hugging Face Hub.
    ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
    ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

    raw_datasets = DatasetDict({"train": ds_train, "valid": ds_valid})

    # Persist the download so later runs take the cached branch above.
    raw_datasets.save_to_disk(local_dataset_dir)
    print("Datasets saved to the local directory.")

# `raw_datasets` now holds the "train" and "valid" splits, whichever branch ran.


use test data


# Test on a small (~1%) subset of the data

In [5]:
# Carve out a small subset of each split for quick experiments.
# The sizes are absolute row counts (the "1%" in the names is nominal).
# Each count is clamped to the split length so `select` never receives
# out-of-range indices when a split is smaller than the requested size.
n_train_rows = min(500, len(raw_datasets["train"]))  # original note: "5000 & 500"
n_valid_rows = min(50, len(raw_datasets["valid"]))

ds_train_1percent = raw_datasets["train"].select(range(n_train_rows))
ds_valid_1percent = raw_datasets["valid"].select(range(n_valid_rows))

# Bundle the two subsets under the same split names as `raw_datasets`.
raw_datasets_1percent = DatasetDict(
    {
        "train": ds_train_1percent,
        "valid": ds_valid_1percent,
    }
)


# Tokenize data

In [6]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Tokenize the first two training documents. Text longer than
# `context_length` is split into several chunks rather than being dropped.
sample_texts = raw_datasets["train"][:2]["content"]
outputs = tokenizer(
    sample_texts,
    max_length=context_length,
    truncation=True,
    return_length=True,
    return_overflowing_tokens=True,
)

# Report the number of chunks, each chunk's length, and its source document.
n_chunks = len(outputs["input_ids"])
print(f"Input IDs length: {n_chunks}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Input IDs length: 4
Input chunk lengths: [128, 52, 128, 22]
Chunk mapping: [0, 0, 1, 1]


In [7]:
def tokenize(element):
    """Tokenize a batch of documents into fixed-length chunks.

    The texts in ``element["content"]`` are passed to the module-level
    ``tokenizer`` with truncation and overflow enabled, so one document may
    yield several chunks. Only chunks whose reported length equals the
    module-level ``context_length`` are kept; shorter ones are discarded.

    Args:
        element: Batch mapping with a ``"content"`` entry holding the texts.

    Returns:
        A dict ``{"input_ids": [...]}`` with the token-id lists of the
        full-length chunks (possibly empty).
    """
    encoded = tokenizer(
        element["content"],
        max_length=context_length,
        truncation=True,
        return_length=True,
        return_overflowing_tokens=True,
    )
    # Keep only the chunks that fill the whole context window.
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}


# Tokenize both splits in batches. The raw columns are removed so that only
# the `input_ids` produced by `tokenize` remain.
raw_column_names = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets_1percent.map(
    tokenize,
    batched=True,
    remove_columns=raw_column_names,
)
tokenized_datasets

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 334
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 25
    })
})

In [8]:
# Sanity check: `tokenize` keeps only chunks of exactly `context_length` tokens.
first_chunk = tokenized_datasets["train"][0]["input_ids"]
len(first_chunk)

128

# Initialize GPT-2 model

In [25]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

local_model_dir = 'codeparrot-ds'

# The Trainer uses this same directory as its `output_dir`, so the directory
# can exist without containing a saved model. Look for the `config.json`
# that `save_pretrained` writes instead of testing for the directory alone.
has_saved_model = os.path.isfile(os.path.join(local_model_dir, "config.json"))

if has_saved_model:
    # Resume from the previously saved weights and configuration.
    model = GPT2LMHeadModel.from_pretrained(local_model_dir)
    config = AutoConfig.from_pretrained(local_model_dir)
    print('load model =D')
else:
    # Start from the GPT-2 architecture with freshly initialised weights,
    # sized to this notebook's tokenizer vocabulary and special tokens.
    config = AutoConfig.from_pretrained(
        "gpt2",
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    model = GPT2LMHeadModel(config)

# Report the total parameter count in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

load model =D
GPT-2 size: 124.2M parameters


# Data collator for causal LM and tokenizer padding

In [26]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False: build batches for causal language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [27]:
# Collate the first five training examples and show the shape of each tensor.
sample_rows = [tokenized_datasets["train"][idx] for idx in range(5)]
out = data_collator(sample_rows)
for key, tensor in out.items():
    print(f"{key} shape: {tensor.shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


# Start Training

In [12]:
# Ask PyTorch to release its cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [28]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters. Checkpoints and logs go to `output_dir`.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=16,  # 48 for gpu in colab
    per_device_eval_batch_size=16,  # 48 for gpu in colab
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=10,
    gradient_accumulation_steps=5,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

# A stray `DataLoaderConfiguration(...)` statement used to follow here. The name
# was never imported (NameError on a fresh kernel) and its result was never
# passed to the Trainer, so it has been removed.


In [None]:
# Run training with the `trainer` built from `args`, the model and the tokenized splits.
trainer.train()

In [15]:
import torch
from transformers import pipeline

# Generate on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the in-memory model and tokenizer in a text-generation pipeline.
# (To try the published checkpoint instead, pass
# model="huggingface-course/codeparrot-ds" and drop the tokenizer argument.)
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [16]:
# Prompt: ask the model to continue with code for a scatter plot.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
completions = pipe(txt, num_return_sequences=1)
print(completions[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
 2Sample text 2Sample text 2Sample text 2Sample text 2


In [17]:
# Prompt: ask the model to continue with code that builds a dataframe.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
completions = pipe(txt, num_return_sequences=1)
print(completions[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
 text 2 text 2Sample text 2Sample textSample text 2Sample text


In [18]:
# Prompt: ask the model to continue with a per-profession mean computation.
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
completions = pipe(txt, num_return_sequences=1)
print(completions[0]["generated_text"])

# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
 text 2Sample text 2


In [19]:
# Prompt: a bare list assignment with no instruction comment.
txt = """\
lst = [2,4,5]
"""
completions = pipe(txt, num_return_sequences=1)
print(completions[0]["generated_text"])

lst = [2,4,5]
 text 2Sample text 2 text 2 text 2Sample text 2Sample text 2 2Sample text 2 chips 2 2Sample text 2 2 2 2 2Sample text 2 suc 2 2 2 2Sample text 2


In [20]:
# Persist the model weights and config to the local model directory.
model.save_pretrained("codeparrot-ds")
# Save the tokenizer alongside the model so the directory can be reloaded
# on its own (for example by a pipeline pointed at this path).
tokenizer.save_pretrained("codeparrot-ds")

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


In [21]:
#!huggingface-cli login


In [22]:
from huggingface_hub import notebook_login

# Set to True to log in and upload the model in the cells that follow.
hf_upload = False

if hf_upload:
    # Prompts for Hugging Face credentials inside the notebook.
    notebook_login()

In [23]:
from huggingface_hub import Repository, HfApi

if hf_upload:
    # Client object for talking to the Hugging Face Hub.
    api = HfApi()

    # Local directory holding the saved model.
    model_dir = "codeparrot-ds"

    # Hub account name and the name to publish the model under.
    # NOTE(review): `username` and `model_dir` are assigned but not used in
    # this cell; the repo id below is just `model_name` -- confirm intent.
    username = "WKLI22"
    model_name = "codeparrot-ds"

    # Create the repository on the Hub; exist_ok=True tolerates an existing one.
    repo_url = api.create_repo(
        repo_id=model_name,
        token=api.token,
        exist_ok=True,
    )



In [24]:

if hf_upload:
    # Upload the in-memory model to the Hub repository named `model_name`.
    # `model_name` is only defined when the repo-creation cell ran with
    # hf_upload=True, so the same flag guards this call.
    # NOTE(review): only the model is pushed here; the tokenizer is not -- confirm intent.
    model.push_to_hub(model_name, config=config)