<a href="https://colab.research.google.com/github/WaliMuhammadAhmad/CODEX-Dataset/blob/main/Eval4Deepseek_Coder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tarfile
import logging
import shutil
import json
import os

In [2]:
from google.colab import userdata
from huggingface_hub import login
from huggingface_hub import Repository
from huggingface_hub import create_repo

### Define Global Variables

In [19]:
folder = 'eval'  # this name will be use in all notebook
files = 0     # count the number of files
count = 1     # count for the folders
files_per_folder = 10000  # files per folder. hugging face repo uses Git LFS by default but still make issue for more then 10k files in a folder
context_type = "FM_FC_MS_FF"  # Choose context type from: FM, FM_FC, FM_FC_CO, FM_FC_MS, FM_FC_MS_FF

# Path
file = f'/content/drive/MyDrive/Colab Notebooks/Dataset/{folder}.tar.bz2'
input_path = f'/content/{folder}/'
output_path = '/content/Eval4Deepseek-Coder'

### Define Functions

In [14]:
def preprocess_dataset(input_folder, output_folder, context_type):
    contexts = {
        "FM": "src_fm",
        "FM_FC": "src_fm_fc",
        "FM_FC_CO": "src_fm_fc_co",
        "FM_FC_MS": "src_fm_fc_ms",
        "FM_FC_MS_FF": "src_fm_fc_ms_ff"
    }

    if context_type not in contexts:
        raise ValueError("Invalid context type. Choose from: FM, FM_FC, FM_FC_CO, FM_FC_MS, FM_FC_MS_FF.")

    context_key = contexts[context_type]

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for subdir, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith('_corpus.json'):
                input_path = os.path.join(subdir, file)
                try:
                    with open(input_path, 'r') as infile:
                        data = json.load(infile)

                        if context_key in data and 'target' in data:
                            instruction = (f"Generate a unit test case for the following Java method: {data[context_key]}")
                            output = (f"The unit test case for the given Java method is: {data['target']}")
                            transform_data = {
                                'instruction': instruction,
                                'output': output
                            }

                            # Determine the output path
                            relative_path = os.path.relpath(input_path, input_folder)
                            output_path = os.path.join(output_folder, relative_path)

                            # Ensure the output directory exists
                            os.makedirs(os.path.dirname(output_path), exist_ok=True)

                            # Write the transformed data to the output path
                            with open(output_path, 'w') as outfile:
                                json.dump(transform_data, outfile, indent=4)
                except Exception as e:
                    logger.error(f"Error processing file {input_path}: {e}")

print(f"Preprocessing complete on {input_path} @ {output_path}")

Preprocessing complete on /content/eval/ @ /content/Eval4Deepseek-Coder


In [33]:
def count_files(directory=dir):
  try:
      entries = os.listdir(directory)
      files = [entry for entry in entries if os.path.isfile(os.path.join(directory, entry))]
      print(f"Number of files in '{directory}': {len(files)}")
  except Exception as e:
      logger.error(f"Error: {e}")

### Start

In [6]:
shutil.rmtree('/content/sample_data')

In [7]:
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(filename="logs.log", filemode="w", format=log_format, level=logging.WARNING)

logger = logging.getLogger(__name__)
file_handler = logging.FileHandler('logs.log')
file_handler.setLevel(logging.WARNING)
formatter = logging.Formatter(log_format)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [8]:
with tarfile.open(file, 'r:bz2') as tar:
  tar.extractall('/content/')

print(f"Extracted {file} @ {input_path}.")

Extracted /content/drive/MyDrive/Colab Notebooks/Dataset/eval.tar.bz2 @ /content/eval/.


In [9]:
folders = os.listdir(input_path)

for f in folders:
  files +=count_files(f'{input_path}/{f}')

print(f"Total files in {input_path} : {files}")

Total files in /content/eval/ : 78534


In [10]:
userdata.get('huggingface')

'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'

In [11]:
login("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",add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
create_repo("CodexAI/Eval4Deepseek-Coder", repo_type="dataset", private=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RepoUrl('https://huggingface.co/datasets/CodexAI/Eval4Deepseek-Coder', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CodexAI/Eval4Deepseek-Coder')

In [13]:
repo_url='https://huggingface.co/datasets/CodexAI/Eval4Deepseek-Coder'
repo_id = 'CodexAI/Eval4Deepseek-Coder'

repo = Repository(local_dir=output_path, clone_from=repo_url)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/datasets/CodexAI/Eval4Deepseek-Coder into local empty directory.


In [15]:
preprocess_dataset(input_path, output_path, context_type)

In [34]:
folders = os.listdir(output_path)
files = 0

for f in folders:
  if not os.path.isdir(f):
    count_files(f'{output_path}/{f}')

Number of files in '/content/Eval4Deepseek-Coder/43142077': 2
Number of files in '/content/Eval4Deepseek-Coder/858685': 561
Number of files in '/content/Eval4Deepseek-Coder/178486098': 8
Number of files in '/content/Eval4Deepseek-Coder/87820447': 232
Number of files in '/content/Eval4Deepseek-Coder/35230855': 33
Number of files in '/content/Eval4Deepseek-Coder/58107893': 386
Number of files in '/content/Eval4Deepseek-Coder/64902062': 1
Number of files in '/content/Eval4Deepseek-Coder/198452291': 3
Number of files in '/content/Eval4Deepseek-Coder/47121388': 32
Number of files in '/content/Eval4Deepseek-Coder/243312367': 2
Number of files in '/content/Eval4Deepseek-Coder/230760808': 5
Number of files in '/content/Eval4Deepseek-Coder/4808693': 13
Number of files in '/content/Eval4Deepseek-Coder/38732107': 178
Number of files in '/content/Eval4Deepseek-Coder/66657283': 2
Number of files in '/content/Eval4Deepseek-Coder/150133859': 478
Number of files in '/content/Eval4Deepseek-Coder/498125

ERROR:__main__:Error: [Errno 20] Not a directory: '/content/Eval4Deepseek-Coder/.gitattributes'


Number of files in '/content/Eval4Deepseek-Coder/32808027': 178
Number of files in '/content/Eval4Deepseek-Coder/29127570': 22
Number of files in '/content/Eval4Deepseek-Coder/30702818': 163
Number of files in '/content/Eval4Deepseek-Coder/88765628': 14
Number of files in '/content/Eval4Deepseek-Coder/276256': 1
Number of files in '/content/Eval4Deepseek-Coder/64314375': 98
Number of files in '/content/Eval4Deepseek-Coder/20916592': 18
Number of files in '/content/Eval4Deepseek-Coder/95094994': 4
Number of files in '/content/Eval4Deepseek-Coder/123195802': 14
Number of files in '/content/Eval4Deepseek-Coder/95005676': 286
Number of files in '/content/Eval4Deepseek-Coder/26163430': 1166
Number of files in '/content/Eval4Deepseek-Coder/7102335': 16
Number of files in '/content/Eval4Deepseek-Coder/166056144': 590
Number of files in '/content/Eval4Deepseek-Coder/34198359': 3
Number of files in '/content/Eval4Deepseek-Coder/104750793': 183
Number of files in '/content/Eval4Deepseek-Coder/16

### Push to hugging Face

push to hugging face repo using legacy git method, this takes tons of time but immune to connection speed

In [35]:
repo.git_pull()

In [36]:
repo.push_to_hub(commit_message=f"{folder} uploaded")

To https://huggingface.co/datasets/CodexAI/Eval4Deepseek-Coder
   0d7acc9..5f000b5  main -> main

   0d7acc9..5f000b5  main -> main



'https://huggingface.co/datasets/CodexAI/Eval4Deepseek-Coder/commit/5f000b52991f292fcfacc160ae8184aebf403af0'

Alternatively, use HFApi to push dataset to hugging face, this is faster then legacy git commands but requires consistant connection speed

In [None]:
from huggingface_hub import HfApi
api = HfApi()

In [None]:
os.chdir(output_path)

In [None]:
api.upload_folder(
  folder_path=output_path,
  repo_id=repo_id,
  repo_type="dataset",
)

### Release the Space

In [37]:
shutil.rmtree(input_path)
shutil.rmtree(output_path)

In [38]:
os.remove('/content/logs.log')

In [39]:
from google.colab import drive
drive.flush_and_unmount()

### All set!