In [1]:
import os

import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained tokenizer registered under the "gpt2" identifier.
# Nothing in the data-assembly cells below uses it; the tokenization section
# further down creates its own instance with the same call.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
# Dataset identifiers handed to load_dataset() in the next cell, one per
# Python-code corpus that goes into the combined dataset.
code_1 = "espejelomar/code_search_net_python_10000_examples"
code_2 = "flytech/python-codes-25k"
code_3 = "iamtarun/python_code_instructions_18k_alpaca"

In [4]:
# Fixed seed for the subsampling shuffles below. An unseeded shuffle() draws a
# different subset every time the cell runs, so the saved corpus could never be
# regenerated exactly.
SHUFFLE_SEED = 2406

# Corpora loaded without a `split` argument; later cells read their "train" split.
ds_1 = load_dataset(code_1)
ds_2 = load_dataset(code_2)
ds_3 = load_dataset(code_3)

# Fixed-size random subsamples of the codeparrot data: 10,000 rows from the
# train split and 3,000 rows from the validation split.
ds_4 = (
    load_dataset("huggingface-course/codeparrot-ds-train", split="train")
    .shuffle(seed=SHUFFLE_SEED)
    .select(range(10000))
)
ds_5 = (
    load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
    .shuffle(seed=SHUFFLE_SEED)
    .select(range(3000))
)

In [5]:
def _columns_except(column_names, keep):
    """Return every entry of `column_names` that is not `keep`, in the original order."""
    return [name for name in column_names if name != keep]


# Strip each dataset down to the single column that holds the source code.
# For the split-keyed datasets the column list is taken from the "train" split.
ds_1 = ds_1.remove_columns(_columns_except(ds_1['train'].column_names, 'whole_func_string'))
ds_2 = ds_2.remove_columns(_columns_except(ds_2['train'].column_names, 'output'))
ds_3 = ds_3.remove_columns(_columns_except(ds_3['train'].column_names, 'output'))
ds_4 = ds_4.remove_columns(_columns_except(ds_4.column_names, 'content'))
ds_5 = ds_5.remove_columns(_columns_except(ds_5.column_names, 'content'))

In [6]:
# Convert every dataset to a pandas DataFrame. The first three are keyed by
# split, so only their "train" split is converted; the last two are converted
# directly.
df_1, df_2, df_3 = (
    split_dict['train'].to_pandas() for split_dict in (ds_1, ds_2, ds_3)
)
df_4, df_5 = (subset.to_pandas() for subset in (ds_4, ds_5))

In [7]:
# Show (rows, columns) for each of the five frames as one tuple.
tuple(frame.shape for frame in (df_1, df_2, df_3, df_4, df_5))

((10000, 1), (49626, 1), (18612, 1), (10000, 1), (3000, 1))

In [8]:
# Give every frame the same single column name, "code", so the frames line up
# when they are concatenated. rename() is used in its returning form and the
# result re-bound, instead of mutating the frames with inplace=True: the cell
# stays safe to re-run and no frame is modified behind an earlier reference.
df_1 = df_1.rename(columns={"whole_func_string": "code"})
df_2 = df_2.rename(columns={"output": "code"})
df_3 = df_3.rename(columns={"output": "code"})
df_4 = df_4.rename(columns={"content": "code"})
df_5 = df_5.rename(columns={"content": "code"})

In [9]:
# Stack the five frames row-wise into one corpus; ignore_index discards the
# per-frame indices and numbers the combined rows from 0.
frames = [df_1, df_2, df_3, df_4, df_5]
df = pd.concat(frames, ignore_index=True)

In [10]:
# Persist the combined corpus (without the index column) for the tokenization
# section. to_csv() does not create missing parent directories, so make sure
# "data/" exists first; exist_ok makes this a no-op when it is already there.
os.makedirs("data", exist_ok=True)
df.to_csv("data/data.csv", index=False)

## Tokenize

In [11]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import random

In [12]:
# Pretrained "gpt2" tokenizer; tokenize() below reads this module-level name
# both for encoding and for the end-of-sequence token string.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [13]:
# Reload the combined corpus from the CSV written at the end of the previous
# section, so this section can start from the saved file.
# NOTE(review): read_csv's default NA parsing presumably turns an empty (or
# "NA"-like) snippet into NaN, which tokenize() could not concatenate with a
# string — confirm the file contains no such rows.
df = pd.read_csv("data/data.csv")

In [14]:
def tokenize(code):
    """Encode one code snippet, terminated by the tokenizer's EOS token.

    Args:
        code: Source text of a single snippet (a ``str``).

    Returns:
        A NumPy array holding the token ids that the module-level ``tokenizer``
        produces for ``code`` with ``tokenizer.eos_token`` appended.
    """
    # The EOS marker is appended as text, so it is encoded along with the snippet.
    terminated = code + tokenizer.eos_token
    return np.array(tokenizer.encode(terminated))

In [15]:
# Reproducible 90/10 split over row positions: seed the stdlib RNG, sample 90%
# of the index for training, and use the remainder for testing.
random.seed(2406)
idx = df.index
train_idxs = set(random.sample(list(idx), k=int(len(idx)*0.9)))
test_idxs = set(idx).difference(train_idxs)

# Sort the positions so the row order (and therefore the order of documents in
# the token streams) is explicit rather than whatever order a set iterates in.
train_df = df.iloc[sorted(train_idxs)]
test_df = df.iloc[sorted(test_idxs)]


def _tokenize_rows(frame):
    """Tokenize the first column of every row in `frame` into one flat 1-D token array.

    Each row's array is collected and joined with a single np.concatenate call.
    Extending a Python list with the arrays instead would box every token id as
    a separate Python object, which is very memory-hungry at this corpus size.
    """
    chunks = [tokenize(code) for code in frame.iloc[:, 0]]
    if not chunks:
        # np.concatenate rejects an empty sequence; return an empty id array instead.
        return np.empty(0, dtype=np.int64)
    return np.concatenate(chunks)


# Flat token streams. Later cells call len() and np.array() on these names;
# both work on the arrays returned here.
train_data = _tokenize_rows(train_df)
test_data = _tokenize_rows(test_df)

Token indices sequence length is longer than the specified maximum sequence length for this model (1087 > 1024). Running this sequence through the model will result in indexing errors


In [16]:
# Report the total number of tokens in the train and test streams.
print(len(train_data), len(test_data))

82529957 9760431


In [17]:
# Materialise both token streams as NumPy arrays so they can be saved with np.save.
train_data, test_data = (np.array(stream) for stream in (train_data, test_data))

In [18]:
# Write the train and test token arrays to .npy files under data/.
for out_path, token_array in (('data/train.npy', train_data), ('data/test.npy', test_data)):
    np.save(out_path, arr=token_array)