In [31]:
# Name of the local folder that will hold the generated .jsonl files.
folder_name = "folder name"

# Language used to select repositories; it is compared against the
# `language` column of the repository list.
df_language = "coding language"

# Hugging Face account and dataset repository the result is pushed to.
hf_username = "username"
hf_repo_name = "repo"

In [None]:
!apt install git-lfs # git extension
!pip install transformers # transformers for viewing codeparrot dataset
!pip install datasets # huggingface datasets
!pip install function-parser # ncooper's function_parser -> parse github repos into docstrings and function code pairs

In [None]:
# Clone the tree-sitter-python grammar sources into ./tree-sitter-python;
# that directory is what Language.build_library compiles further down.
!git clone https://github.com/tree-sitter/tree-sitter-python.git # tree-sitter-python for building language grammar for Python

In [36]:
import os
import pandas as pd

# tqdm to create a progress bar for the parsing loop
from tqdm import tqdm

# tree sitter and function parser imports
import function_parser
from tree_sitter import Language

from function_parser.language_data import LANGUAGE_METADATA
from function_parser.process import DataProcessor
from tree_sitter import Language

# transformers and datasets imports
from datasets import load_dataset, DatasetDict
from huggingface_hub import list_datasets
from huggingface_hub import notebook_login, create_repo

In [None]:
# Retrieve the directory of the installed function_parser package (the first
# entry of its __path__); the compiled grammar library is written there.
parser_path = function_parser.__path__[0]
parser_path

In [39]:
# Compile the cloned tree-sitter-python grammar into a shared library stored
# inside the function_parser package directory.
Language.build_library(
    f'{parser_path}/tree-sitter-languages.so',  # where the library is written
    ['tree-sitter-python'],                     # grammar directories to include
)

True

In [40]:
# Load the grammar for `language` from the compiled library, register it on the
# parser shared by DataProcessor, then create a processor for that language.
language = "python"
grammar_lib = os.path.join(function_parser.__path__[0], "tree-sitter-languages.so")
DataProcessor.PARSER.set_language(Language(grammar_lib, language))
processor = DataProcessor(
    language=language,
    language_parser=LANGUAGE_METADATA[language]["language_parser"],
)

In [59]:
# Load the full list of repositories into a dataframe. Column names are
# supplied explicitly through `names`.
col_names = [
    'repository',
    'stars',
    'language',
]
url = 'https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv'

df_full = pd.read_csv(filepath_or_buffer=url, names=col_names)

# preview the first rows
df_full.head()

Unnamed: 0,repository,stars,language
0,Kavex/GameDev-Resources,1129,
1,pristineio/webrtc-mirror,152,C++
2,emacs-lsp/lsp-ui,635,Emacs Lisp
3,Bwar/CJsonObject,328,C++
4,defshine/cleanblog,108,JavaScript


In [None]:
# Keep only the rows whose `language` column equals the configured df_language.
df = df_full.loc[df_full["language"] == df_language]

# preview the first rows of the filtered frame
df.head()

In [44]:
# Collect the repository names of the filtered frame into an array.
repos = df["repository"].values

# Report the number of repositories that will be parsed.
total = len(repos)
print(total)

6229


In [46]:
# Create the folder for the .jsonl files and record its absolute path.
# - The path is resolved directly, so the working directory is never changed
#   and the built-in `dir` is no longer shadowed.
# - exist_ok=True lets this cell be re-run without raising FileExistsError
#   when the folder was already created by an earlier run.
json_path = os.path.abspath(folder_name)
os.makedirs(json_path, exist_ok=True)

In [None]:
# Parse each repository and write one .jsonl file per repository that still has
# rows after entries with a missing or empty docstring are removed.
for repo in tqdm(repos):
    definitions = processor.process_dee(repo, ext=LANGUAGE_METADATA[language]["ext"])
    pile = pd.DataFrame(definitions)
    if len(pile) == 0:
        continue  # no rows for this repository

    # remove rows whose docstring is missing, then rows whose docstring is ''
    pile = pile.dropna(subset=['docstring'])
    pile = pile[pile.docstring != '']
    if len(pile) == 0:
        continue  # every row lacked a usable docstring

    # "owner/name" becomes "owner__name" so it is usable as a file name
    file_stem = repo.replace('/', '__')
    pile.to_json(f"{json_path}/{file_stem}.jsonl", orient='records', lines=True)

In [None]:
# Build a single dataset (the "train" split) from every .jsonl file written to
# json_path.
the_pile_parsed = load_dataset(
    "json",
    data_files=f"{json_path}/*.jsonl",
    split="train",
)

In [None]:
# display the dataset that was loaded from the .jsonl files
the_pile_parsed

In [None]:
# Count the datasets available on the Hugging Face Hub.
# list_datasets() may return a lazy iterable instead of a list (recent
# huggingface_hub releases), so it is materialized before len() is called.
all_datasets = list(list_datasets())
print(f"Number of datasets on Hub: {len(all_datasets)}")


In [52]:
# Fixed seed so the shuffled splits are identical on every run of the notebook.
split_seed = 42

# 90% train, 10% held out for test + validation
train_testvalid = the_pile_parsed.train_test_split(test_size=0.1, seed=split_seed)
# Split the held-out 10% in half: one half test, one half validation
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=split_seed)
# Gather the three splits into a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [None]:
# display the DatasetDict holding the train / test / valid splits
train_test_valid_dataset

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# ipywidgets is installed first.
# NOTE(review): presumably notebook_login() renders its prompt as a widget — confirm.
!pip install ipywidgets
notebook_login()

In [None]:
# Create the dataset repository on the Hugging Face Hub.
# - `repo_id` replaces the `name` keyword, which current huggingface_hub
#   releases no longer accept; the id is the same "<username>/<repo>" string
#   that is used when the dataset is pushed.
# - exist_ok=True keeps a re-run from failing when the repository already exists.
repo = create_repo(
    repo_id=f'{hf_username}/{hf_repo_name}',
    repo_type="dataset",
    exist_ok=True,
)
repo

In [None]:
# Push the train / test / valid DatasetDict to the "<username>/<repo>" dataset
# repository on the Hugging Face Hub.
train_test_valid_dataset.push_to_hub(repo_id=f'{hf_username}/{hf_repo_name}')

In [None]:
# Download the newly pushed dataset from the Hub to verify the upload.
read = load_dataset(f'{hf_username}/{hf_repo_name}')

In [None]:
# Check that the dataset was pushed correctly by displaying what was read back.
read