<a href="https://colab.research.google.com/github/WaliMuhammadAhmad/CODEX-Dataset/blob/main/Test4Deepseek_Coder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tarfile
import logging
import shutil
import json
import os

In [2]:
from google.colab import userdata
from huggingface_hub import login
from huggingface_hub import Repository
from huggingface_hub import create_repo

### Define Global Variables

In [3]:
folder = 'test'  # this name will be use in all notebook
files = 0     # count the number of files
context_type = "FM_FC_MS_FF"  # Choose context type from: FM, FM_FC, FM_FC_CO, FM_FC_MS, FM_FC_MS_FF

# Path
file = f'/content/drive/MyDrive/Colab Notebooks/Dataset/{folder}.tar.bz2'
input_path = f'/content/{folder}/'
output_path = '/content/Test4Deepseek-Coder'

### Define Functions

In [8]:
def preprocess_dataset(input_folder, output_folder, context_type):
    contexts = {
        "FM": "src_fm",
        "FM_FC": "src_fm_fc",
        "FM_FC_CO": "src_fm_fc_co",
        "FM_FC_MS": "src_fm_fc_ms",
        "FM_FC_MS_FF": "src_fm_fc_ms_ff"
    }

    if context_type not in contexts:
        raise ValueError("Invalid context type. Choose from: FM, FM_FC, FM_FC_CO, FM_FC_MS, FM_FC_MS_FF.")

    context_key = contexts[context_type]

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for subdir, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith('_corpus.json'):
                input_path = os.path.join(subdir, file)
                try:
                    with open(input_path, 'r') as infile:
                        data = json.load(infile)

                        if context_key in data and 'target' in data:
                            instruction = (f"Generate a unit test case for the following Java method: {data[context_key]}")
                            output = (f"The unit test case for the given Java method is: {data['target']}")
                            transform_data = {
                                'instruction': instruction,
                                'output': output
                            }

                            # Determine the output path
                            relative_path = os.path.relpath(input_path, input_folder)
                            output_path = os.path.join(output_folder, relative_path)

                            # Ensure the output directory exists
                            os.makedirs(os.path.dirname(output_path), exist_ok=True)

                            # Write the transformed data to the output path
                            with open(output_path, 'w') as outfile:
                                json.dump(transform_data, outfile, indent=4)
                except Exception as e:
                    logger.error(f"Error processing files @ {input_path}: {e}")

In [7]:
def count_files(directory=dir):
  try:
      entries = os.listdir(directory)
      files = [entry for entry in entries if os.path.isfile(os.path.join(directory, entry))]
      print(f"Number of files in '{directory}': {len(files)}")
  except Exception as e:
      logger.error(f"Error: {e}")

### Start

In [9]:
shutil.rmtree('/content/sample_data')

In [10]:
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(filename="logs.log", filemode="w", format=log_format, level=logging.WARNING)

logger = logging.getLogger(__name__)
file_handler = logging.FileHandler('logs.log')
file_handler.setLevel(logging.WARNING)
formatter = logging.Formatter(log_format)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [11]:
with tarfile.open(file, 'r:bz2') as tar:
  tar.extractall('/content/')

print(f"Extracted {file} @ {input_path}")

Extracted /content/drive/MyDrive/Colab Notebooks/Dataset/test.tar.bz2 @ /content/test/.


In [13]:
folders = os.listdir(input_path)

for f in folders:
  count_files(f'{input_path}/{f}')

Number of files in '/content/test//104078932': 2
Number of files in '/content/test//66622989': 67
Number of files in '/content/test//58892507': 23
Number of files in '/content/test//27030629': 24
Number of files in '/content/test//14117014': 9
Number of files in '/content/test//1467919': 2
Number of files in '/content/test//24095382': 1
Number of files in '/content/test//11585818': 29
Number of files in '/content/test//3865446': 54
Number of files in '/content/test//95635499': 150
Number of files in '/content/test//66060256': 4
Number of files in '/content/test//15870646': 11
Number of files in '/content/test//112703947': 4
Number of files in '/content/test//53297769': 6
Number of files in '/content/test//12925113': 7
Number of files in '/content/test//234031371': 4
Number of files in '/content/test//90016934': 96
Number of files in '/content/test//21894018': 21
Number of files in '/content/test//6639766': 1
Number of files in '/content/test//95081481': 2
Number of files in '/content/t

In [14]:
userdata.get('huggingface')

'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'

In [15]:
login("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",add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [16]:
create_repo("CodexAI/Test4Deepseek-Coder", repo_type="dataset", private=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RepoUrl('https://huggingface.co/datasets/CodexAI/Test4Deepseek-Coder', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CodexAI/Test4Deepseek-Coder')

In [17]:
repo_url='https://huggingface.co/datasets/CodexAI/Test4Deepseek-Coder'
repo_id = 'CodexAI/Eval4Deepseek-Coder'

repo = Repository(local_dir=output_path, clone_from=repo_url)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/datasets/CodexAI/Test4Deepseek-Coder into local empty directory.


In [18]:
preprocess_dataset(input_path, output_path, context_type)

In [19]:
folders = os.listdir(output_path)
files = 0

for f in folders:
  if not os.path.isdir(f):
    count_files(f'{output_path}/{f}')

Number of files in '/content/Test4Deepseek-Coder/104078932': 2
Number of files in '/content/Test4Deepseek-Coder/66622989': 67
Number of files in '/content/Test4Deepseek-Coder/58892507': 23
Number of files in '/content/Test4Deepseek-Coder/27030629': 24
Number of files in '/content/Test4Deepseek-Coder/14117014': 9
Number of files in '/content/Test4Deepseek-Coder/1467919': 2
Number of files in '/content/Test4Deepseek-Coder/24095382': 1
Number of files in '/content/Test4Deepseek-Coder/11585818': 29
Number of files in '/content/Test4Deepseek-Coder/3865446': 54
Number of files in '/content/Test4Deepseek-Coder/95635499': 150
Number of files in '/content/Test4Deepseek-Coder/66060256': 4
Number of files in '/content/Test4Deepseek-Coder/15870646': 11
Number of files in '/content/Test4Deepseek-Coder/112703947': 4
Number of files in '/content/Test4Deepseek-Coder/53297769': 6
Number of files in '/content/Test4Deepseek-Coder/12925113': 7
Number of files in '/content/Test4Deepseek-Coder/234031371': 4

ERROR:__main__:Error: [Errno 20] Not a directory: '/content/Test4Deepseek-Coder/.gitattributes'


Number of files in '/content/Test4Deepseek-Coder/51269282': 1
Number of files in '/content/Test4Deepseek-Coder/1045976': 87
Number of files in '/content/Test4Deepseek-Coder/49475567': 226
Number of files in '/content/Test4Deepseek-Coder/42032884': 85
Number of files in '/content/Test4Deepseek-Coder/8103494': 9
Number of files in '/content/Test4Deepseek-Coder/183214896': 3
Number of files in '/content/Test4Deepseek-Coder/53577396': 14
Number of files in '/content/Test4Deepseek-Coder/52850323': 2
Number of files in '/content/Test4Deepseek-Coder/11614244': 65
Number of files in '/content/Test4Deepseek-Coder/156416177': 184
Number of files in '/content/Test4Deepseek-Coder/33015857': 168
Number of files in '/content/Test4Deepseek-Coder/75960287': 78
Number of files in '/content/Test4Deepseek-Coder/171415041': 6
Number of files in '/content/Test4Deepseek-Coder/117949407': 5
Number of files in '/content/Test4Deepseek-Coder/129463208': 3
Number of files in '/content/Test4Deepseek-Coder/7579313

### Push to hugging Face

push to hugging face repo using legacy git method, this takes tons of time but immune to connection speed

In [20]:
repo.git_pull()

In [21]:
repo.push_to_hub(commit_message=f"{folder} uploaded")

To https://huggingface.co/datasets/CodexAI/Test4Deepseek-Coder
   3d15ee9..e72cf91  main -> main

   3d15ee9..e72cf91  main -> main



'https://huggingface.co/datasets/CodexAI/Test4Deepseek-Coder/commit/e72cf91ff40d5bb2c3d8348d084efc26dd4169f5'

Alternatively, use HFApi to push dataset to hugging face, this is faster then legacy git commands but requires consistant connection speed

In [None]:
from huggingface_hub import HfApi
api = HfApi()

In [None]:
os.chdir(output_path)

In [None]:
api.upload_folder(
  folder_path=output_path,
  repo_id=repo_id,
  repo_type="dataset",
)

### Release the Space

In [22]:
shutil.rmtree(input_path)
shutil.rmtree(output_path)

In [23]:
os.remove('/content/logs.log')

In [None]:
from google.colab import drive
drive.flush_and_unmount()

### All set!