In [21]:
!pip install datasets

from datasets import load_dataset
dataset = load_dataset("codeparrot/github-code", streaming=True, split="train")
#dataset = load_dataset("codeparrot/github-code", streaming=True, split="train")




In [30]:
# Install necessary libraries
!pip install datasets transformers

# Import required libraries
from datasets import load_dataset
import re
from transformers import AutoTokenizer

# Step 1: Load the dataset
dataset = load_dataset("codeparrot/github-code", streaming=True, split="train")

# Step 2: Filter out non-Python files (e.g., README, tests)
def filter_code_files(example):
    # Only keep Python code (filter by the 'language' field)
    return example['language'] == 'Python'

# Filter the dataset to keep only Python files
filtered_dataset = dataset.filter(filter_code_files)

# Step 3: Extract functions and classes from the code
def extract_functions_and_classes(code):
    if isinstance(code, list):  # If the code is a list, concatenate the list into a single string
        code = ' '.join(code)  # Ensure code is a single string

    # Extract functions using regex
    function_pattern = r"def\s+(\w+)\(.*\):"
    functions = re.findall(function_pattern, code)

    # Extract classes using regex
    class_pattern = r"class\s+(\w+)\(.*\):"
    classes = re.findall(class_pattern, code)

    # Ensure both functions and classes are lists, even if empty
    functions = functions if functions else []
    classes = classes if classes else []

    # Ensure that both functions and classes return empty lists if they are missing
    return {'functions': functions, 'classes': classes}

# Step 4: Normalize variable names in the code for better learning
def normalize_variable_names(code):
    # Replace variable names with generic names
    variable_pattern = r"\b\w+\b"
    normalized_code = re.sub(variable_pattern, lambda m: f"var{hash(m.group(0)) % 1000}", code)
    return normalized_code

# Step 5: Tokenize the data using a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot-small")

# Set the pad_token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Using eos_token as the pad_token

def tokenize_code(example):
    return tokenizer(example['code'], truncation=True, padding='max_length', max_length=512)

# Step 6: Process only the first 1000 examples in a streaming manner
tokenized_data = []
example_count = 0
for example in filtered_dataset:
    if example_count == 1000:
        break  # Stop once we have processed 1000 examples

    # Extract functions and classes
    processed_example = extract_functions_and_classes(example['code'])

    # Normalize the code
    normalized_code = normalize_variable_names(example['code'])

    # Tokenize the normalized code
    tokenized_example = tokenize_code({'code': normalized_code})

    # Add the processed data to the list
    tokenized_data.append({
        **processed_example,
        **tokenized_example
    })

    example_count += 1

# Access and print the first entry in the tokenized dataset
print(tokenized_data[0])

# You can now proceed with model training using tokenized_data.


{'functions': ['__init__', '__init__', '__get__', '__set__', '__delete__', 'get_attname', 'contribute_to_class', 'fix_init_kwarg', 'formfield', 'get_internal_type', 'to_python', 'get_prep_value', 'formfield', 'validate', '_get_choices'], 'classes': ['TemplateField', 'JSONDescriptor', 'JSONField', 'SlugMultipleChoiceField'], 'input_ids': [1391, 1400, 2729, 7736, 2729, 31485, 2729, 7957, 199, 1391, 1400, 2729, 7736, 14, 1391, 30923, 14, 1391, 31230, 2729, 31485, 2729, 32749, 199, 1391, 1400, 2729, 7736, 14, 1391, 30923, 14, 1391, 29114, 2729, 31485, 2729, 6686, 199, 1391, 1400, 2729, 7736, 14, 1391, 11551, 2729, 31485, 2729, 24509, 199, 1391, 1400, 2729, 7736, 14, 1391, 23, 2036, 2729, 31485, 2729, 19301, 2729, 10806, 2729, 27696, 199, 1391, 1400, 2729, 7736, 14, 1391, 23, 2036, 14, 1391, 29527, 2729, 31485, 2729, 3781, 199, 1391, 1400, 2729, 7736, 14, 1391, 23, 2036, 14, 1391, 14271, 2729, 31485, 2729, 15128, 2729, 10806, 2729, 31505, 199, 199, 1391, 1400, 2729, 14520, 14, 1391, 7957, 1

In [32]:
from datasets import Dataset, DatasetDict

# Step 1: Convert tokenized data to a Dataset object
# Convert the list of tokenized examples into a Dataset object
tokenized_dataset = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in tokenized_data],
    'attention_mask': [example['attention_mask'] for example in tokenized_data],
    'functions': [example['functions'] for example in tokenized_data],
    'classes': [example['classes'] for example in tokenized_data],
})

# Step 2: Split the data into training and validation sets
# You can use the `train_test_split` method to split the dataset
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)  # 90% for training, 10% for validation
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# Optional: Further split the validation set into test and validation sets (80/20 split)
test_val_split = val_dataset.train_test_split(test_size=0.2)  # 80% for validation, 20% for test
val_dataset = test_val_split['train']
test_dataset = test_val_split['test']

# Create a DatasetDict to organize the datasets for training, validation, and test
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

# Inspect the split dataset
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'functions', 'classes'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'functions', 'classes'],
        num_rows: 80
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'functions', 'classes'],
        num_rows: 20
    })
})


In [33]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Step 1: Load the pre-trained model (e.g., codeparrot-small)
model = AutoModelForCausalLM.from_pretrained("codeparrot/codeparrot-small")

# Step 2: Load the tokenizer (already done previously, just confirming)
tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot-small")


config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/457M [00:00<?, ?B/s]

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset

# Step 1: Convert tokenized data to a Dataset object (as already done in previous steps)
tokenized_dataset = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in tokenized_data],
    'attention_mask': [example['attention_mask'] for example in tokenized_data],
    'functions': [example['functions'] for example in tokenized_data],
    'classes': [example['classes'] for example in tokenized_data],
})

# Step 2: Ensure `labels` are the same as `input_ids` for language modeling tasks
def add_labels(example):
    example['labels'] = example['input_ids']  # Set labels equal to input_ids for causal language modeling
    return example

# Apply this transformation to the dataset
tokenized_dataset = tokenized_dataset.map(add_labels)

# Step 3: Split the dataset (if not done already)
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# Step 4: Configure the DataCollator (handles padding and labels)
tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot-small")

# Set the pad_token to eos_token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to False since we're doing causal language modeling (next-token prediction)
)

# Step 5: Load the model
model = AutoModelForCausalLM.from_pretrained("codeparrot/codeparrot-small")

# Step 6: Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",            # where to save the model and results
    eval_strategy="epoch",             # evaluation strategy
    learning_rate=5e-5,                # learning rate
    per_device_train_batch_size=4,     # batch size for training
    per_device_eval_batch_size=4,      # batch size for evaluation
    num_train_epochs=3,                # number of training epochs
    weight_decay=0.01,                 # weight decay for regularization
    logging_dir='./logs',              # directory for logging
    logging_steps=10,
    save_steps=500,                    # save model checkpoint every 500 steps
    save_total_limit=2,                # save only the last 2 model checkpoints
    report_to="none",                  # Disable W&B logging
)

# Step 7: Initialize the Trainer
trainer = Trainer(
    model=model,                       # the model to train
    args=training_args,                # training arguments
    train_dataset=train_dataset,       # training dataset
    eval_dataset=val_dataset,          # validation dataset
    data_collator=data_collator,       # data collator for padding and handling labels
)

# Step 8: Train the model
trainer.train()


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
