In [1]:
import sys
import os

# Assumes the notebook is run from a directory such as /path/to/projects/notebooks,
# so that the Text2CAD directory is a sibling of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../Text2CAD'))
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)


In [2]:
from datasets import load_dataset, DatasetDict
import json
from CadSeqProc.cad_sequence import CADSequence
from CadSeqProc.utility.utils import normalize_pc, chamfer_dist
from CadSeqProc.utility.utils import (create_path_with_time,ensure_dir)

In [3]:
# Load the dataset identified by this repo id via `datasets.load_dataset`.
# Later cells iterate over its splits and index them by name (e.g. 'train_en').
dataset = load_dataset("TruongSinhAI/DEEPCAD-Text2Json-EnVi")

## Text cleaning and preprocessing

In [1]:
import re
def clean_text(example):
    """Normalize the text stored in ``example['output']``.

    Steps applied, in order:
      1. Remove runs of three or more ``-``, ``=`` or ``#`` characters.
      2. Collapse newlines and any other whitespace runs into single spaces,
         then strip leading/trailing whitespace.
      3. Round every ``digits.digits`` literal to 4 decimal places.

    Args:
        example: Mapping holding a string under the ``'output'`` key.

    Returns:
        The same ``example`` object, with ``'output'`` replaced in place.
    """
    # (pattern, replacement) pairs, applied sequentially.
    substitutions = (
        (r'[-=#]{3,}', ''),
        (r'\n+', ' '),
        (r'\s+', ' '),
    )

    def process(text):
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text.strip()

    def round_numbers_in_text(text, decimal_places=4):
        # Only literals containing a decimal point are matched; bare integers
        # and any leading sign are left exactly as they were.
        return re.sub(
            r"\d+\.\d+",
            lambda found: str(round(float(found.group()), decimal_places)),
            text,
        )

    cleaned = process(example['output'])
    example['output'] = round_numbers_in_text(cleaned)
    return example

## Remove 'description' and 'final_shape' because they are redundant

In [16]:
def optimize(old):
    """Parse a JSON string and strip the redundant fields from it.

    Removes the top-level ``'final_shape'`` key and, for every value under
    ``'parts'``, its ``'description'`` key. Missing keys are ignored.

    Args:
        old: JSON text to parse.

    Returns:
        The parsed object with those keys removed.
    """
    parsed = json.loads(old)
    parsed.pop('final_shape', None)

    # Walk every part and drop its 'description' key when present;
    # an absent 'parts' entry simply yields nothing to iterate over.
    for part_entry in parsed.get('parts', {}).values():
        part_entry.pop('description', None)

    return parsed


In [26]:
def apply_optimize(example):
    """Rewrite ``example['output']`` as the optimized record, serialized as JSON.

    The value is passed through ``optimize`` and stored back as a JSON string.
    ``json.dumps`` is used instead of ``str()`` because ``str(dict)`` yields a
    Python repr (single quotes, ``True``/``False``/``None``) that ``json.loads``
    cannot read back, which also made re-running the map step fail.

    Args:
        example: Mapping holding a JSON string under the ``'output'`` key.

    Returns:
        The same ``example`` object, with ``'output'`` replaced in place.
    """
    # ensure_ascii=False keeps any non-ASCII text readable rather than \u-escaped.
    example['output'] = json.dumps(optimize(example['output']), ensure_ascii=False)
    return example

In [27]:
# Run apply_optimize over every split and rebuild the DatasetDict from the results.
dataset = DatasetDict({
    split_name: split_data.map(apply_optimize)
    for split_name, split_data in dataset.items()
})

Map:   0%|          | 0/155783 [00:00<?, ? examples/s]

Map:   0%|          | 0/155783 [00:00<?, ? examples/s]

Map:   0%|          | 0/8814 [00:00<?, ? examples/s]

Map:   0%|          | 0/8814 [00:00<?, ? examples/s]

Map:   0%|          | 0/7924 [00:00<?, ? examples/s]

Map:   0%|          | 0/7924 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# from huggingface_hub import HfApi

# repo_id = "TruongSinhAI/DEEPCAD-Text2Json-EnVi"
# token = "token"  

# dataset.push_to_hub(repo_id, private=False, token=token)

## Check the validity of the output

In [None]:
import ast

from tqdm import tqdm


def _parse_output(raw):
    """Parse a stored ``output`` string into a Python object.

    Strict JSON is tried first. If that fails, the text is parsed as a Python
    literal, which covers strings written with ``str(dict)`` (single quotes,
    ``True``/``False``/``None``). Blindly replacing ``'`` with ``"`` would
    corrupt any string value that contains an apostrophe.

    Raises:
        ValueError, SyntaxError: if the text is neither JSON nor a Python literal.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return ast.literal_eval(raw)


# For each listed split, try to build a CAD model from every sample and
# collect the uids of the samples that fail.
errors = {}
for split in ['train_en', 'val_en', 'test_en']:
    errors[split] = []
    for sample in tqdm(dataset[split], desc=split):
        uid = sample['uid']
        output = sample['output']
        try:
            CADSequence.from_minimal_json(_parse_output(output)).create_cad_model()
        except Exception:
            # Best-effort scan: any parse or build failure marks the sample as
            # invalid instead of aborting the whole run.
            errors[split].append(uid)


In [None]:
# Report how many samples were recorded as failing in each split.
for split, failed_uids in errors.items():
    print(split, ' : ', len(failed_uids))