# Dataset Processing

In [1]:
import json
import re

# Load the raw mapping of label -> list of repository strings.
with open("all_data.json", "r") as f:
    data = json.load(f)

# A valid entry is exactly "user/repo": one slash with a non-empty run of
# word characters, dots or hyphens on each side.
# NOTE: the dot has to live inside a character class (or be escaped). A bare
# "." matches any character, including "/", which would let full URLs and
# multi-segment paths through the filter; "+" (not "*") rejects empty parts.
REPO_PATTERN = re.compile(r"[\w.-]+/[\w.-]+")

labels = list(data.keys())
for label in labels:
    # Strip trailing slashes first so "user/repo/" is normalised and kept
    # instead of being rejected by the pattern check.
    stripped = (repo.rstrip("/") for repo in data[label])
    data[label] = [repo for repo in stripped if REPO_PATTERN.fullmatch(repo)]

# Serialise before reopening the file so a serialisation error cannot leave
# a truncated all_data.json behind, then overwrite it with the cleaned data.
data_str = json.dumps(data, indent=4)
with open("all_data.json", "w") as f:
    f.write(data_str)

In [2]:
import pickle

# Reload the label -> repositories mapping from disk.
with open("all_data.json", "r") as json_file:
    data = json.load(json_file)

# Invert the mapping into repo -> label. If a repository is listed under
# several labels, the label iterated last is the one that is kept.
REPOS = {
    repo: label
    for label, repos in data.items()
    for repo in repos
}

# Report how many distinct repositories there are, then save them to REPOS.pkl
print(len(REPOS))
with open("REPOS.pkl", "wb") as pkl_file:
    pickle.dump(REPOS, pkl_file)

464


In [3]:
# The training split was prepared earlier and stored in REPOS_train.pkl;
# load it back so the remaining repositories can be derived from it.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load pickles produced by this project.
with open("REPOS_train.pkl", "rb") as train_file:
    REPOS_train = pickle.load(train_file)

print(len(REPOS_train))

315


In [4]:
REPOS_keys = REPOS.keys()
REPOS_train_keys = REPOS_train.keys()

# The rest of the data: every repository that is not in the training split.
# Filter REPOS in its own insertion order instead of taking a set difference:
# iteration order of a set of strings changes between interpreter runs (hash
# randomisation), which would shuffle the input handed to the seeded
# train_test_split and make the validation/test split non-reproducible.
difference = [repo for repo in REPOS_keys if repo not in REPOS_train_keys]
print(len(difference))

# repo -> label for the held-out repositories, in the same stable order.
REPOS_rest = {repo: REPOS[repo] for repo in difference}

149


In [5]:
from sklearn.model_selection import train_test_split

# Separate the held-out repositories (X) from their labels (Y); both lists
# follow the iteration order of REPOS_rest, so they stay aligned.
X = list(REPOS_rest.keys())
Y = list(REPOS_rest.values())

# 70% of the held-out data becomes the validation set, 30% the test set.
x_validation, x_test, y_validation, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)
print(len(x_validation))
print(len(x_test))

104
45


In [6]:
# Re-pair every repository with its label for each of the two splits.
REPOS_test = dict(zip(x_test, y_test))
REPOS_validation = dict(zip(x_validation, y_validation))

# Persist each split to its own pickle file.
with open("./REPOS_test.pkl", "wb") as test_file:
    pickle.dump(REPOS_test, test_file)

with open("./REPOS_validation.pkl", "wb") as validation_file:
    pickle.dump(REPOS_validation, validation_file)

In [7]:
# Sanity check: read the training split back from disk and report its size.
with open("./REPOS_train.pkl", "rb") as train_file:
    REPOS_train_check = pickle.load(train_file)

print(len(REPOS_train_check))

315


In [8]:
# Sanity check: read the test split back from disk and report its size.
with open("./REPOS_test.pkl", "rb") as test_file:
    REPOS_test_check = pickle.load(test_file)

print(len(REPOS_test_check))

45


In [9]:
# Sanity check: read the validation split back from disk and report its size.
with open("./REPOS_validation.pkl", "rb") as validation_file:
    REPOS_validation_check = pickle.load(validation_file)

print(len(REPOS_validation_check))

104
