<a href="https://colab.research.google.com/github/WaliMuhammadAhmad/CODEX-Dataset/blob/main/Train4Deepseek_Coder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tarfile
import logging
import shutil
import json
import os

In [None]:
from google.colab import userdata
from huggingface_hub import login
from huggingface_hub import Repository
from huggingface_hub import create_repo

### Define Global Variables

In [None]:
# Dataset split to process; the archive name and the extraction directory derive from it.
folder = 'train'

# Source-context variant used to build the prompt.
# One of: FM, FM_FC, FM_FC_CO, FM_FC_MS, FM_FC_MS_FF
context_type = "FM_FC_MS_FF"

# File counter, initialised to zero.
files = 0

# Locations: local output directory, extracted input tree, compressed archive on Drive.
output_path = '/content/Train4Deepseek-Coder'
input_path = f'/content/{folder}/'
file = f'/content/drive/MyDrive/Colab Notebooks/Dataset/{folder}.tar.bz2'

### Define Functions

In [None]:
def preprocess_dataset(input_folder, output_folder, context_type):
    """Convert ``*_corpus.json`` records into instruction/output pairs.

    Walks ``input_folder`` recursively. Every file whose name ends with
    ``_corpus.json`` is parsed as JSON; when the record is an object holding
    both the field selected by ``context_type`` and a ``target`` field, a JSON
    file with the keys ``instruction`` and ``output`` is written to the same
    relative path under ``output_folder``. Records missing either field are
    skipped silently. A file that fails to read, parse or write is logged and
    skipped so that one bad file does not abort the whole run.

    Args:
        input_folder: Root directory of the dataset to read.
        output_folder: Root directory for the transformed files (created if
            it does not exist).
        context_type: One of ``FM``, ``FM_FC``, ``FM_FC_CO``, ``FM_FC_MS``,
            ``FM_FC_MS_FF``.

    Returns:
        int: Number of transformed files written.

    Raises:
        ValueError: If ``context_type`` is not one of the supported values.
    """
    contexts = {
        "FM": "src_fm",
        "FM_FC": "src_fm_fc",
        "FM_FC_CO": "src_fm_fc_co",
        "FM_FC_MS": "src_fm_fc_ms",
        "FM_FC_MS_FF": "src_fm_fc_ms_ff"
    }

    if context_type not in contexts:
        raise ValueError("Invalid context type. Choose from: FM, FM_FC, FM_FC_CO, FM_FC_MS, FM_FC_MS_FF.")

    context_key = contexts[context_type]

    # Resolve the logger here so the function does not depend on a global
    # defined in a later cell; in the notebook this is the same logger object.
    log = logging.getLogger(__name__)

    os.makedirs(output_folder, exist_ok=True)

    written = 0
    for subdir, _, filenames in os.walk(input_folder):
        for filename in filenames:
            if not filename.endswith('_corpus.json'):
                continue

            source_path = os.path.join(subdir, filename)
            try:
                # Explicit UTF-8 so the result does not depend on the locale.
                with open(source_path, 'r', encoding='utf-8') as infile:
                    data = json.load(infile)

                # Only JSON objects carrying both fields can be transformed.
                if not isinstance(data, dict):
                    continue
                if context_key not in data or 'target' not in data:
                    continue

                transform_data = {
                    'instruction': f"Generate a unit test case for the following Java method: {data[context_key]}",
                    'output': f"The unit test case for the given Java method is: {data['target']}"
                }

                # Mirror the input layout under the output folder.
                relative_path = os.path.relpath(source_path, input_folder)
                target_path = os.path.join(output_folder, relative_path)
                os.makedirs(os.path.dirname(target_path), exist_ok=True)

                with open(target_path, 'w', encoding='utf-8') as outfile:
                    json.dump(transform_data, outfile, indent=4)
                written += 1
            except Exception as e:
                # Deliberately broad: best-effort processing over many files.
                log.error(f"Error processing files @ {source_path}: {e}")

    return written

In [None]:
def count_files(directory='.'):
    """Print and return the number of regular files directly inside ``directory``.

    Sub-directories are neither descended into nor counted.

    Args:
        directory: Path of the directory to inspect. Defaults to the current
            working directory.

    Returns:
        int | None: The file count, or ``None`` when the directory could not
        be listed (the error is logged instead of raised).
    """
    try:
        entries = os.listdir(directory)
        file_count = sum(
            1 for entry in entries
            if os.path.isfile(os.path.join(directory, entry))
        )
    except Exception as e:
        # Best-effort: report the problem and let the caller's loop continue.
        logging.getLogger(__name__).error(f"Error: {e}")
        return None

    print(f"Number of files in '{directory}': {file_count}")
    return file_count

### Start

In [None]:
# Delete /content/sample_data. ignore_errors=True keeps the cell re-runnable:
# after the first run the directory no longer exists.
shutil.rmtree('/content/sample_data', ignore_errors=True)

In [None]:
log_format = "%(asctime)s - %(levelname)s - %(message)s"

# One FileHandler on this logger is the only writer to logs.log. The previous
# logging.basicConfig(filename="logs.log", ...) call either did nothing (when the
# root logger already has handlers) or attached a second handler to the same
# file, which wrote every record twice.
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

# Drop handlers left over from an earlier run of this cell so records are not duplicated.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler('logs.log', mode='w')  # 'w' starts a fresh log each run
file_handler.setLevel(logging.WARNING)
formatter = logging.Formatter(log_format)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [None]:
# Unpack the dataset archive under /content/.
with tarfile.open(file, 'r:bz2') as tar:
    # The 'data' extraction filter rejects members with absolute paths, links that
    # escape the destination, and special files. It is only present on Python
    # versions that ship tarfile extraction filters, so fall back to the plain
    # call where it is unavailable.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall('/content/', filter='data')
    else:
        tar.extractall('/content/')

print(f"Extracted {file} @ {input_path}")

Extracted /content/drive/MyDrive/Colab Notebooks/Dataset/train.tar.bz2 @ /content/train/


In [None]:
# Report the number of files in each sub-directory of the extracted dataset.
folders = os.listdir(input_path)

for name in folders:
    # os.path.join avoids the doubled slash that string concatenation produces
    # when input_path already ends with '/'.
    folder_path = os.path.join(input_path, name)
    # count_files lists a directory, so skip anything that is not one.
    if os.path.isdir(folder_path):
        count_files(folder_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Number of files in '/content/train//3742298': 5
Number of files in '/content/train//108706261': 1
Number of files in '/content/train//35814909': 30
Number of files in '/content/train//40763113': 5
Number of files in '/content/train//115411826': 58
Number of files in '/content/train//57461213': 259
Number of files in '/content/train//33119850': 2
Number of files in '/content/train//102328857': 13
Number of files in '/content/train//114670580': 2
Number of files in '/content/train//47311500': 8
Number of files in '/content/train//2090979': 1
Number of files in '/content/train//169246391': 40
Number of files in '/content/train//64025178': 20
Number of files in '/content/train//78753723': 3
Number of files in '/content/train//14496551': 22
Number of files in '/content/train//10522120': 8293
Number of files in '/content/train//16389681': 17
Number of files in '/content/train//57263576': 26
Number of files in '/content/train//9

In [None]:
# Verify the 'huggingface' Colab secret is readable WITHOUT echoing it: a bare
# expression at the end of a cell is rendered into the saved notebook output,
# which would publish the token.
assert userdata.get('huggingface'), "Colab secret 'huggingface' is empty"

'hf_***REDACTED***'

In [None]:
# Read the token from Colab secrets at run time; never paste it into the notebook,
# where it would be committed and shared together with the code.
login(token=userdata.get('huggingface'), add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# exist_ok=True makes the cell re-runnable: without it the call raises once the
# repository has already been created.
create_repo("CodexAI/Train4Deepseek-Coder", repo_type="dataset", private=True, exist_ok=True)

RepoUrl('https://huggingface.co/datasets/CodexAI/Train4Deepseek-Coder', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CodexAI/Train4Deepseek-Coder')

In [None]:
# Identify the dataset repository on the Hub, by id and by URL.
repo_id = 'CodexAI/Train4Deepseek-Coder'
repo_url = 'https://huggingface.co/datasets/CodexAI/Train4Deepseek-Coder'

# Clone the repository into output_path.
repo = Repository(clone_from=repo_url, local_dir=output_path)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/datasets/CodexAI/Train4Deepseek-Coder into local empty directory.


In [None]:
# Transform the extracted dataset into the cloned repository directory.
preprocess_dataset(
    input_folder=input_path,
    output_folder=output_path,
    context_type=context_type,
)

In [None]:
folders = os.listdir(output_path)
files = 0  # kept from the original cell; nothing visible in this notebook reads it afterwards

for f in folders:
    # Test the full path: a bare entry name is resolved against the current working
    # directory, not output_path, so the previous check never matched. Only
    # directories are counted, which skips regular files such as .gitattributes;
    # hidden directories are skipped as well.
    entry_path = os.path.join(output_path, f)
    if os.path.isdir(entry_path) and not f.startswith('.'):
        count_files(entry_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Number of files in '/content/Train4Deepseek-Coder/9580819': 46
Number of files in '/content/Train4Deepseek-Coder/200642013': 1
Number of files in '/content/Train4Deepseek-Coder/5244445': 45
Number of files in '/content/Train4Deepseek-Coder/11480369': 4
Number of files in '/content/Train4Deepseek-Coder/3337772': 588
Number of files in '/content/Train4Deepseek-Coder/43923787': 45
Number of files in '/content/Train4Deepseek-Coder/92120027': 63
Number of files in '/content/Train4Deepseek-Coder/74704643': 8
Number of files in '/content/Train4Deepseek-Coder/67114675': 28
Number of files in '/content/Train4Deepseek-Coder/82398873': 9
Number of files in '/content/Train4Deepseek-Coder/57191077': 4
Number of files in '/content/Train4Deepseek-Coder/2588654': 38
Number of files in '/content/Train4Deepseek-Coder/42678647': 15
Number of files in '/content/Train4Deepseek-Coder/6358188': 2667
Number of files in '/content/Train4Deepseek-C

ERROR:__main__:Error: [Errno 20] Not a directory: '/content/Train4Deepseek-Coder/.gitattributes'



Number of files in '/content/Train4Deepseek-Coder/134143184': 424
Number of files in '/content/Train4Deepseek-Coder/33766200': 1
Number of files in '/content/Train4Deepseek-Coder/4182801': 177
Number of files in '/content/Train4Deepseek-Coder/2988721': 4
Number of files in '/content/Train4Deepseek-Coder/114619810': 316
Number of files in '/content/Train4Deepseek-Coder/129428289': 45
Number of files in '/content/Train4Deepseek-Coder/25916153': 2
Number of files in '/content/Train4Deepseek-Coder/128535513': 129
Number of files in '/content/Train4Deepseek-Coder/18106767': 1
Number of files in '/content/Train4Deepseek-Coder/27785546': 145
Number of files in '/content/Train4Deepseek-Coder/122968674': 2
Number of files in '/content/Train4Deepseek-Coder/186642789': 4
Number of files in '/content/Train4Deepseek-Coder/39875602': 2
Number of files in '/content/Train4Deepseek-Coder/60701545': 341
Number of files in '/content/Train4Deepseek-Coder/8015168': 9
Number of files in '/content/Train4Dee

### Push to Hugging Face

Push to the Hugging Face repo using the legacy git method. This takes a long time but is not sensitive to connection speed.

In [None]:
# Pull from the remote into the local clone before pushing.
repo.git_pull()

In [None]:
# Push the local clone to the Hub; the commit message is "<folder> uploaded".
repo.push_to_hub(commit_message=f"{folder} uploaded")

To https://huggingface.co/datasets/CodexAI/Train4Deepseek-Coder
   1d3face..1bfd771  main -> main

   1d3face..1bfd771  main -> main



'https://huggingface.co/datasets/CodexAI/Train4Deepseek-Coder/commit/1bfd7714c019a013f9d880a224a0a05ff6ba9afa'

Alternatively, use `HfApi` to push the dataset to Hugging Face. This is faster than the legacy git commands but requires a consistent connection speed.

In [None]:
from huggingface_hub import HfApi
# Client object used by the upload_folder call in a later cell.
api = HfApi()

In [None]:
# Make the output directory the current working directory.
# NOTE(review): upload_folder is given folder_path=output_path, which is an absolute
# path, so this chdir looks unnecessary; a later cell also deletes output_path while
# it is still the working directory — confirm before removing.
os.chdir(output_path)

In [None]:
# Upload the contents of output_path to the dataset repository.
api.upload_folder(
    repo_type="dataset",
    repo_id=repo_id,
    folder_path=output_path,
)

### Release the Space

In [None]:
# Delete the extracted input tree first, then the output directory.
for stale_dir in (input_path, output_path):
    shutil.rmtree(stale_dir)

In [None]:
# Flush and close every logging handler first so logs.log is no longer held open,
# then delete it only if it exists (keeps the cell safe to re-run).
logging.shutdown()
if os.path.exists('/content/logs.log'):
    os.remove('/content/logs.log')

In [None]:
from google.colab import drive
# Flush pending writes and unmount Google Drive.
drive.flush_and_unmount()

### All set!