# RUN THE BELOW BLOCK ONCE TO SET UP FUNCTIONS

In [25]:
# IMPORTS
from datasets import load_dataset
from datasets.iterable_dataset import IterableDataset
from Modules.Code2Code.Extracontent.code2doc import Code2DocModule
import requests
import ciso8601
from typing import Dict
import time
import torch
import json
from datetime import datetime

GITHUB_DATASET_URI = "codeparrot/github-code"
ANNOTATED_DATASET_URI = 'michaelnath/annotated-code-functions-base'
repo_to_features_mapping = dict()
def construct_feature_set(code_entry):
    features = dict();
    user_repo_name = code_entry.split('/')
    owner = user_repo_name[0]
    repo = "".join(user_repo_name[1:])
    header = {"Authorization": 'token ' + <INSERT_TOKEN_STRING_HERE>}
    repo_response = requests.get(f"https://api.github.com/repos/{owner}/{repo}", headers=header)
    content = json.loads(repo_response.text)
    features["num_stars"] = content.get("stargazers_count", 0)
    features["num_forks"] = content.get("forks_count", 0)
    features["num_watchers"] = content.get("watchers_count", 0)
    features["num_open_issues"] = content.get("open_issues_count", 0)
    parsed_datetime = ciso8601.parse_datetime(content.get("created_at", datetime.now().isoformat()))
    timestamp = time.mktime(parsed_datetime.timetuple())
    features["created_at"] = timestamp
    return [features["num_stars"], features["num_forks"], features["num_watchers"], features["num_open_issues"], features["created_at"]]
# THE BELOW IS FOR PYTHON FILES (LEVERAGES INDENTATION RULES)
def construct_list_of_functions(raw_code):
    lines = raw_code.split('\n')
    start = -1
    functions = []
    begin_considering_function_termination = False
    amnt_tabs = 0
    for i in range(len(lines)):
        # disregard empty lines (prune trailing whitespace later)
        if (start != -1 and len(lines[i]) > 0):
            amnt_tabs_new = len(lines[i].rstrip()) - len(lines[i].strip())
            if amnt_tabs_new <= amnt_tabs and begin_considering_function_termination:
                functions.append(("\n".join(lines[start:i])).strip())
                start = -1
                begin_considering_function_termination = False
        if lines[i].lstrip().startswith(("def ", "async def ")):
            start = i
            amnt_tabs = len(lines[i].rstrip()) - len(lines[i].strip())
        if start != -1 and not begin_considering_function_termination and ":" in lines[i] and ")" in lines[i]:
            begin_considering_function_termination = True 
    return functions

def augment_code_entry(entries):
        entries["functions"] = []
        entries["reputation_features"] = []
        PAD_WORD = "BLEH"
        PAD_LENGTH = 512
        for i in range(len(entries["code"])):
            functions = construct_list_of_functions(entries["code"][i])
            entries["functions"].append(functions + [PAD_WORD] * max(0, PAD_LENGTH - len(functions)))
            if repo_to_features_mapping.get(entries["repo_name"][i], None) == None:
                repo_to_features_mapping[entries["repo_name"][i]] = construct_feature_set(entries["repo_name"][i])
            entries["reputation_features"] += [repo_to_features_mapping[entries["repo_name"][i]]]
        entries["reputation_features"] = torch.Tensor(entries["reputation_features"]).view(len(entries["functions"]), 5)
        return entries

The below code will create a dictionary `dicty` that can then be converted into a Hugging Face Dataset object. Adjust `BATCH_SIZE` to get more efficient processing,
**but** be warned that increasing batch size too much may cause CUDA memory issues. 

In [29]:
from torch.utils.data import DataLoader
import numpy as np
ds = load_dataset("codeparrot/github-code", split="train", streaming=True, languages=["Python"])
BATCH_SIZE = 2
ds=ds.map(augment_code_entry, batched=True, batch_size=BATCH_SIZE, remove_columns=["code", "license", "size", "language"])

dataloader = DataLoader(ds, batch_size=2, num_workers=2)

dicty= dict()
dicty["function"] = []
dicty["repo_name"] = []
dicty["path"] = []
dicty["features"] = []

DESIRED_NUM_FUNCTIONS = 300


for _, batch in enumerate(dataloader):
    features = batch["reputation_features"]
    functions = np.array(batch["functions"]).reshape(BATCH_SIZE, -1)
    indices = torch.where(torch.all(features > 0, axis=1))[0]
    for index in indices:
        actual_functions = list(functions[index][functions[index] != "BLEH"])
        dicty["function"] += actual_functions
        dicty["repo_name"] += [batch["repo_name"][index]] * len(actual_functions)
        dicty["path"] += [batch["path"][index]] * len(actual_functions) 
        dicty["features"] += [features[index]] * len(actual_functions)
    if len(dicty["function"]) > DESIRED_NUM_FUNCTIONS:
        break

Using custom data configuration Python-all-4b2efe4a27feed92


In [31]:
print(len(dicty["function"]))
# Converting dicty to hugging face dataset
from datasets import Dataset
ds = Dataset.from_dict(dicty)
ds[0]

306


{'function': 'def __init__(self, allow=None, disallow=None, secure=True, *args, **kwargs):\n\t\tsuper(TemplateField, self).__init__(*args, **kwargs)\n\t\tself.validators.append(TemplateValidator(allow, disallow, secure))',
 'repo_name': 'ithinksw/philo',
 'path': 'philo/models/fields/__init__.py',
 'features': [50.0, 12.0, 50.0, 3.0, 1274327296.0]}

In [36]:
# to add a column, you simply invoke the add_column method
ds = ds.add_column("random_column", ["random_string" for _ in range(len(ds))])

In [37]:
print(ds["random_column"])
ds = ds.remove_columns(column_names=["random_column"])

['random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_string', 'random_strin

In [38]:
print(ds.column_names)

['function', 'repo_name', 'path', 'features']


In [None]:
# Uncomment the below code to push to HF. Note that you're going to have to login first by typing `huggingface-cli login` in the shell
# DATASET_NAME = <INSERT_SOME_DESCRIPTIVE_NAME>
# ds.push_to_hub(DATASET_NAME)