In [1]:
import json
import re
import os
from datasets import load_dataset
from tqdm import tqdm
import google as genai
from huggingface_hub import login
from dotenv import load_dotenv

In [2]:
from datasets import load_dataset, Dataset, DatasetDict 
import math
import pandas as pd
import math
import os
from huggingface_hub import create_repo, upload_folder
import json 
from google import genai
from google.genai import types
from pathlib import Path
import concurrent.futures
from json_repair import repair_json


In [3]:
def process_json_files(folder_path):
    """
    Read every JSON file in a folder and collect the parsed contents in a list.

    Files are visited in sorted filename order so that the returned list is
    reproducible across runs and platforms (``os.listdir`` makes no ordering
    guarantee). Only direct children whose name ends in ``.json``
    (case-insensitive) are considered.

    Args:
        folder_path (str): Path of the folder that holds the JSON files.

    Returns:
        list: The parsed content of every JSON file that could be read, one
              entry per file. An empty list if the folder does not exist or
              contains no readable JSON file. Files that fail to load are
              reported on stdout and skipped.
    """
    all_data = []

    if not os.path.isdir(folder_path):
        print(f"Lỗi: Thư mục '{folder_path}' không tồn tại.")
        return all_data

    # sorted(): deterministic order regardless of the filesystem.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_data.append(json.load(f))
        except json.JSONDecodeError:
            print(f"Lỗi: Không thể giải mã JSON từ file '{file_path}'.")
        except FileNotFoundError:
            # The file can disappear between listing and opening.
            print(f"Lỗi: Không tìm thấy file '{file_path}'.")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Lỗi khi đọc file '{file_path}': {e}")
    return all_data


In [4]:
now = process_json_files('results_not_done4')

In [8]:
class Gemini:
    """Wrapper around a google-genai client that sends batched CAD
    JSON/description pairs and returns the per-pair answers as strings.

    Attributes:
        client: ``genai.Client`` created from the given API key.
        model_name: Model used for the first attempt.
        fallback_model_name: Model used when the first attempt raises.
        generate_content_config: Request configuration built by ``config()``.
    """

    def __init__(self, api_key, model_name="gemini-2.5-flash",
                 fallback_model_name="gemini-2.5-flash-lite-preview-06-17"):
        """Create the client and build the request configuration.

        Args:
            api_key: API key passed to ``genai.Client``.
            model_name: Primary model name.
            fallback_model_name: Model tried once when the primary call fails.
        """
        self.client = genai.Client(api_key=api_key)
        self.model_name = model_name
        self.fallback_model_name = fallback_model_name
        self.config()

    def config(self):
        """Build ``self.generate_content_config``.

        The response is requested as ``application/json`` with a schema of an
        object holding a required ``"result"`` array of strings; the system
        instruction carries the task prompt and a worked example.
        """
        self.generate_content_config = types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=types.Schema(
                type=types.Type.OBJECT,
                required=["result"],
                properties={
                    "result": types.Schema(
                        type=types.Type.ARRAY,
                        items=types.Schema(
                            type=types.Type.STRING,
                        ),
                    ),
                },
            ),
            system_instruction=[
                # NOTE(review): the prompt below is a regular (non-raw) string that is
                # never passed through str.format, so in the text actually sent:
                #   * `\n` in the separator sentence is a real newline,
                #   * `\"` collapses to a plain `"` (the instruction then reads
                #     "USING `"` INSTEAD `"`"),
                #   * the doubled braces in the example JSON are sent literally.
                # Confirm whether that is intended before touching the prompt text.
                types.Part.from_text(text = """
I have a CAD JSON file (used to create CAD) and a description of the JSON, YOU WILL RECIVE MANY JSON-COMPLETTION, Please return into json format for each pairs, each pair separate by "\n======================================".

YOU NEED TO RESPONSE ME BY JSON FORMAT ARRAY.
IN THE VALUE YOU RESPONSE PLEASE USING `\"` INSTEAD `"`
RESPONSE IN VIETNAMESE
FOLLOWING THESE STEPS:

*S1: Remove the excess description immediately after the extrusion description.

**Example:
<description>
Part 1: Three-dimensional rectangular prism with a flat top and bottom. Create a new coordinate system with the following properties: * Euler Angles: (0.0, 0.0, -90.0) * Translation Vector: (0.0, 0.0316, 0.0). Draw a 2D sketch on the XY plane of the coordinate system. Create a face containing one closed loop made up of 4 lines: * Line 1: Start Point (0.0, 0.0), End Point (0.75, 0.0) * Line 2: Start Point (0.75, 0.0), End Point (0.75, 0.6772) * Line 3: Start Point (0.75, 0.6772), End Point (0.0, 0.6772) * Line 4: Start Point (0.0, 0.6772), End Point (0.0, 0.0). Scale the 2D sketch by a factor of 0.75. Transform the scaled 2D sketch into a 3D sketch using the defined coordinate system. Extrude the 3D sketch by 0.0316 units in the positive Z direction. The height of this part is 0.75 units, the width is 0.0316 units, and the length is 0.75 units. This completes the three-dimensional rectangular prism part.
</description>

-> You need to remove the line: "The height of this part is 0.75 units, the width is 0.0316 units, and the length is 0.75 units. This completes the three-dimensional rectangular prism part." Keep the </description> tag.
The output you need to return after removing the excess description would look like:

<description>
Part 1: Three-dimensional rectangular prism with a flat top and bottom. Create a new coordinate system with the following properties: * Euler Angles: (0.0, 0.0, -90.0) * Translation Vector: (0.0, 0.0316, 0.0). Draw a 2D sketch on the XY plane of the coordinate system. Create a face containing one closed loop made up of 4 lines: * Line 1: Start Point (0.0, 0.0), End Point (0.75, 0.0) * Line 2: Start Point (0.75, 0.0), End Point (0.75, 0.6772) * Line 3: Start Point (0.75, 0.6772), End Point (0.0, 0.6772) * Line 4: Start Point (0.0, 0.6772), End Point (0.0, 0.0). Scale the 2D sketch by a factor of 0.75. Transform the scaled 2D sketch into a 3D sketch using the defined coordinate system. Extrude the 3D sketch by 0.0316 units in the positive Z direction.
</description>
**

*S2: Check if the newly created description matches the json. If it does, create "<valid>Yes</valid>", if it doesn't, create "<valid>No</valid>"

*S3: Create a sample reasoning data enclosed in <think> ... </think>. The reasoning data should follow two steps:
Step 1: Reason out the components that will be in the JSON based on the given description.
Step 2: Check the logic, arithmetic correctness, and make corrections (if necessary) from Step 1.

**Example 1 sample:

***Input:
<json> {{"parts": {{"part_1": {{"coordinate_system": {{"Euler Angles": [0.0, 0.0, -90.0], "Translation Vector": [0.0, 0.0316, 0.0]}}, "sketch": {{"face_1": {{"loop_1": {{"line_1": {{"Start Point": [0.0, 0.0], "End Point": [0.75, 0.0]}}, "line_2": {{"Start Point": [0.75, 0.0], "End Point": [0.75, 0.6772]}}, "line_3": {{"Start Point": [0.75, 0.6772], "End Point": [0.0, 0.6772]}}, "line_4": {{"Start Point": [0.0, 0.6772], "End Point": [0.0, 0.0]}}}}}}, "extrusion": {{"extrude_depth_towards_normal": 0.0316, "extrude_depth_opposite_normal": 0.0, "sketch_scale": 0.75, "operation": "NewBodyFeatureOperation"}}}}}}}} </json>
<description> Part 1: Three-dimensional rectangular prism with a flat top and bottom. Create a new coordinate system with the following properties: * Euler Angles: (0.0, 0.0, -90.0) * Translation Vector: (0.0, 0.0316, 0.0). Draw a 2D sketch on the XY plane of the coordinate system. Create a face containing one closed loop made up of 4 lines: * Line 1: Start Point (0.0, 0.0), End Point (0.75, 0.0) * Line 2: Start Point (0.75, 0.0), End Point (0.75, 0.6772) * Line 3: Start Point (0.75, 0.6772), End Point (0.0, 0.6772) * Line 4: Start Point (0.0, 0.6772), End Point (0.0, 0.0). Scale the 2D sketch by a factor of 0.75. Transform the scaled 2D sketch into a 3D sketch using the defined coordinate system. Extrude the 3D sketch by 0.0316 units in the positive Z direction. </description>

***Output:
S1:
<description>
Part 1: Three-dimensional rectangular prism with a flat top and bottom. Create a new coordinate system with the following properties: * Euler Angles: (0.0, 0.0, -90.0) * Translation Vector: (0.0, 0.0316, 0.0). Draw a 2D sketch on the XY plane of the coordinate system. Create a face containing one closed loop made up of 4 lines: * Line 1: Start Point (0.0, 0.0), End Point (0.75, 0.0) * Line 2: Start Point (0.75, 0.0), End Point (0.75, 0.6772) * Line 3: Start Point (0.75, 0.6772), End Point (0.0, 0.6772) * Line 4: Start Point (0.0, 0.6772), End Point (0.0, 0.0). Scale the 2D sketch by a factor of 0.75. Transform the scaled 2D sketch into a 3D sketch using the defined coordinate system. Extrude the 3D sketch by 0.0316 units in the positive Z direction.
</description>
S2:
<valid>Yes</valid>
S3:
<think>
***Step 1: Infer the components that will be in the json based on the provided description:

parts:From the description, we have a single part labeled "part_1" describing a three-dimensional rectangular prism. This will be reflected in the json as "part_1".

part_1: (Rectangular Prism)
-coordinate_system:
--Euler Angles: [0.0, 0.0, -90.0] (Derived from the description: "Create a new coordinate system with the following properties: * Euler Angles: (0.0, 0.0, -90.0)").
--Translation Vector: [0.0, 0.0316, 0.0] (Derived from the description: "Translation Vector: (0.0, 0.0316, 0.0)").
-sketch:
--face_1:
---loop_1:
----line_1: Start [0.0, 0.0], End [0.75, 0.0] (Derived from the description: "Line 1: Start Point (0.0, 0.0), End Point (0.75, 0.0)").
----line_2: Start [0.75, 0.0], End [0.75, 0.6772] (Derived from the description: "Line 2: Start Point (0.75, 0.0), End Point (0.75, 0.6772)").
----line_3: Start [0.75, 0.6772], End [0.0, 0.6772] (Derived from the description: "Line 3: Start Point (0.75, 0.6772), End Point (0.0, 0.6772)").
----line_4: Start [0.0, 0.6772], End [0.0, 0.0] (Derived from the description: "Line 4: Start Point (0.0, 0.6772), End Point (0.0, 0.0)").
-extrusion:
--extrude_depth_towards_normal: 0.0316 (Derived from the description: "Extrude the 3D sketch by 0.0316 units in the positive Z direction").
--extrude_depth_opposite_normal: 0.0 (Not mentioned in the description, but inferred as 0 because no extrusion is required in the opposite direction).
--sketch_scale: 0.75 (Derived from the description: "Scale the 2D sketch by a factor of 0.75").
--operation: "NewBodyFeatureOperation" (Inferred from the description: "Create a 3D model of the rectangular prism").

***Step 2: Check the logic, arithmetic correctness, and make corrections (if necessary) from Step 1:

-Check the lines (line_1, line_2, line_3, line_4):
line_1 (Start [0.0, 0.0], End [0.75, 0.0]) connects correctly with line_2 (Start [0.75, 0.0], End [0.75, 0.6772]).
line_2 (Start [0.75, 0.0], End [0.75, 0.6772]) connects correctly with line_3 (Start [0.75, 0.6772], End [0.0, 0.6772]).
line_3 (Start [0.75, 0.6772], End [0.0, 0.6772]) connects correctly with line_4 (Start [0.0, 0.6772], End [0.0, 0.0]).
line_4 (Start [0.0, 0.6772], End [0.0, 0.0]) connects correctly with line_1 (Start [0.0, 0.0], End [0.75, 0.0]).

--Result: The sketch loop is valid and closed.

-Check the extrusion:
Extrusion of the 3D sketch with a depth of 0.0316 units in the positive Z direction, as specified in the description.
No extrusion is needed in the opposite direction, so extrude_depth_opposite_normal is correctly set to 0.

--Result: The extrusion is correct and matches the requirements.

-Check dimensions consistency:
The values for the Start/End points of the lines are correctly represented and match the 2D sketch described.
The extrusion depth and scale factor are consistent with the description.

--Result: The dimensions are correct and consistent.

-Correction: This inferred data is accurate and does not require further modifications.
</think>
**
"""),
            ],
            # thinking_config=types.ThinkingConfig(thinking_budget=0)
        )

    def extract(self, text):
        """
        Find every span of text that starts at ``<description>`` and ends at
        the nearest following ``</think>``.

        Args:
            text (str): Raw response text.

        Returns:
            list: The matched spans, in order of appearance (tags included).
        """
        # DOTALL so that "." also matches newlines inside a span.
        pattern = r'<description>.*?</think>'
        return re.findall(pattern, text, re.DOTALL)

    def _generate(self, model_name, inputs):
        """Send one request to ``model_name`` and return the extracted spans."""
        response = self.client.models.generate_content(
            model=model_name,
            contents=inputs,
            config=self.generate_content_config,
        )
        return self.extract(response.text)

    def ask_gemini(self, inputs: str, expected: int = 5) -> list[str]:
        """Ask the model about a batch of pairs and return one answer per pair.

        The primary model is tried first. If that attempt raises (request
        error, or ``response.text`` not being a string), the fallback model is
        tried once; an exception from the fallback propagates to the caller.

        Args:
            inputs: The concatenated batch text.
            expected: Number of answers the batch must yield (default 5).

        Returns:
            The extracted answers when exactly ``expected`` were found,
            otherwise a list of ``expected`` empty strings.
        """
        try:
            answers = self._generate(self.model_name, inputs)
        except Exception:
            # `Exception`, not a bare except: Ctrl-C / SystemExit must still
            # be able to stop a long-running worker.
            answers = self._generate(self.fallback_model_name, inputs)

        if len(answers) == expected:
            return answers
        return [''] * expected




In [9]:
from dotenv import load_dotenv
load_dotenv()
def parse_api_keys(raw):
    """Turn a comma-separated string of API keys into a clean list.

    Whitespace around each key is stripped and empty entries (e.g. from a
    trailing comma or a newline at the end of the value) are dropped, so that
    no blank or padded key ends up being used as a credential.

    Args:
        raw: The raw string, or None/"" when the variable is not set.

    Returns:
        list[str]: The non-empty keys, in their original order.
    """
    if not raw:
        return []
    return [key.strip() for key in raw.split(",") if key.strip()]


api_keys_string = os.getenv("API_KEY_LIST")
api_keys = parse_api_keys(api_keys_string)

In [4]:
# Run configuration: which split to read and which row range to process.
# NOTE: this cell reads `api_keys`, so the cell that builds it from
# API_KEY_LIST has to be executed first.
old_split = "range_500_1000_vi"
# First row to process (the commented-out expression is an earlier offset).
start = 0#500  * 5* len(api_keys)*2
# Exclusive upper bound; grows with the number of API keys available.
end = 500  * 5* len(api_keys)  # max_request * batch_size * num_of_accounts

In [5]:
# Load the split named by `old_split` from the "wanhin/cad_hqh1" dataset.
dataset = load_dataset("wanhin/cad_hqh1", split=old_split)
    
# Keep only rows [start, end), clamping `end` to the dataset length so the
# range never runs past the last row.
filtered_dataset = dataset.select(range(start, min(end, len(dataset))))
print(f"Processing {len(filtered_dataset)} samples from index {start} to {min(end, len(dataset))}")

Processing 25000 samples from index 0 to 25000


In [10]:
# One worker per API key, each with its own Gemini client.
n_workers = len(api_keys)
gemini_clients = [Gemini(api_key) for api_key in api_keys]
# Displayed as the cell output.
n_workers

10

In [11]:
def process_data(gemini: "Gemini", data_chunk: "Dataset", worker_id: int, batch_size: int = 5,
                 save_dir: str = "vi_1"):
    """Run one worker: send `data_chunk` to the model batch by batch.

    Each batch is flattened into a single prompt (completion, blank line,
    prompt, separator line for every row), sent through ``gemini.ask_gemini``,
    and the answers are stored under a new ``'reasoning'`` key. Every
    successful batch is also written to disk via ``save_result`` so that
    partial progress survives a crash.

    Args:
        gemini: Client object exposing ``ask_gemini(inputs) -> list[str]``.
        data_chunk: Sliceable table whose slices are dicts of columns with at
            least ``'prompt'`` and ``'completion'`` (the slice is indexed as
            ``batch['prompt'][idx]``).
        worker_id: Identifier used in log messages and output file names.
        batch_size: Number of rows sent per request.
        save_dir: Directory that receives the per-batch JSON files.

    Returns:
        list[dict]: One dict per processed row (original columns plus
        ``'reasoning'``). Rows of batches that raised are omitted.
    """
    results = []
    for i in range(0, len(data_chunk), batch_size):
        batch = data_chunk[i:i + batch_size]
        # The last batch can be shorter than batch_size; iterate over what is
        # really there instead of assuming a fixed count.
        n_rows = len(batch['prompt'])
        inputs = "".join(
            batch['completion'][idx] + '\n\n' + batch['prompt'][idx] + "\n======================================"
            for idx in range(n_rows)
        )

        try:
            response = list(gemini.ask_gemini(inputs))
            # ask_gemini is written around a fixed answer count; pad/trim so
            # every row of this batch gets exactly one (possibly empty) answer
            # and zip() below cannot silently drop rows.
            response = (response + [''] * n_rows)[:n_rows]
            batch['reasoning'] = response
            batch_dicts = [dict(zip(batch, values)) for values in zip(*batch.values())]
        except Exception as e:
            # Best effort: skip the failing batch but leave a trace of it.
            print(f"Worker {worker_id} - Error in batch {i}: {e}")
            continue
        results += batch_dicts
        save_result(batch_dicts, index=f"{worker_id}_{i}", save_dir=save_dir, filename_prefix="part")
    return results


In [12]:
from datasets import Dataset
from datasets import concatenate_datasets

def chunk_data(dataset: "Dataset", n_chunks: int) -> "list[Dataset]":
    """Split `dataset` into `n_chunks` contiguous chunks.

    Every chunk holds ``len(dataset) // n_chunks`` rows; the rows left over by
    the integer division are appended to the last chunk. Each chunk is built
    with ``dataset.select`` so it keeps the source dataset's features instead
    of having them re-inferred from Python objects.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        n_chunks: Number of chunks to produce; must be positive.

    Returns:
        A list of `n_chunks` datasets covering all rows in order.

    Raises:
        ValueError: If `n_chunks` is not positive.
    """
    if n_chunks <= 0:
        raise ValueError("n_chunks must be a positive integer")

    length = len(dataset)
    chunk_size = length // n_chunks

    chunks = []
    for i in range(n_chunks):
        begin = i * chunk_size
        # The last chunk also absorbs the remainder rows.
        stop = length if i == n_chunks - 1 else begin + chunk_size
        chunks.append(dataset.select(range(begin, stop)))
    return chunks

data_chunks = chunk_data(filtered_dataset, n_workers)

In [13]:
data_chunks

[Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 }),
 Dataset({
     features: ['prompt', 'completion'],
     num_rows: 2500
 })]

In [14]:
def save_result(results: list[dict], index: int, save_dir="results", filename_prefix="reasoning"):
    """Dump `results` as pretty-printed UTF-8 JSON.

    The file is written to ``<save_dir>/<filename_prefix>_<index>.json``;
    `save_dir` is created when missing and an existing file is overwritten.
    `index` is only interpolated into the file name, so string identifiers
    work as well as integers.
    """
    target = os.path.join(save_dir, f"{filename_prefix}_{index}.json")
    os.makedirs(save_dir, exist_ok=True)

    with open(target, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(results, handle, ensure_ascii=False, indent=2)


In [15]:
from tqdm import tqdm
# Fan the chunks out to one thread per worker and gather every row they return.
all_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
    # Worker i gets its own client and its own chunk of the dataset.
    futures = [
        executor.submit(process_data, gemini_clients[i], data_chunks[i], i, batch_size=5)
        for i in range(n_workers)
    ]
    # Results are appended in completion order, not in worker order.
    # future.result() re-raises an exception raised inside the worker.
    for future in tqdm(concurrent.futures.as_completed(futures)):
        all_results.extend(future.result())
# Persist the combined rows as results_vi/reasoning_0.json.
save_result(all_results, 0, 'results_vi')

10it [6:54:01, 2484.16s/it]


In [24]:
all_results

[]

In [16]:
len(all_results)

9095

In [None]:
# save_result(all_results, 0000)

In [5]:
# Flatten the per-file contents held in `now` into a single list.
# Assumes every loaded file is itself a list of samples — TODO confirm.
hehe = []
for sample in now:
    hehe.extend(sample)

In [6]:
len(hehe)

5995

In [17]:
# Count the rows whose 'reasoning' field is not the empty string.
c = 0
for sample in all_results:
    if sample['reasoning'] != '':
        c+= 1

In [18]:
c

5305

In [19]:
def extract_tags(text, tag_name):
    """Return the body of the first ``<tag_name>...</tag_name>`` pair in `text`.

    The body may span several lines. ``None`` is returned when the text holds
    no such pair.
    """
    tag_regex = re.compile(f"<{tag_name}>(.*?)</{tag_name}>", re.DOTALL)
    found = tag_regex.findall(text)
    if not found:
        return None
    return found[0]

In [24]:
# Keep only the rows the model marked as valid and rebuild them as
# description / reasoning / completion samples.
new_data = []
invalid_indices = []  # never filled in this cell
c2 = 0  # rows whose <valid> tag is missing or is not "Yes"
c3 = 0  # valid rows lacking a non-empty <description> or <think> body
c4 = 0  # rows with a non-empty 'reasoning' field
for sample in all_results:
    if sample['reasoning'] != '':
        c4 += 1
        response_text = sample['reasoning']
        valid_tag = extract_tags(response_text, "valid")
        if valid_tag and valid_tag.strip() == "Yes":
            # Extract description and reasoning
            description_tag = extract_tags(response_text, "description")
            think_tag = extract_tags(response_text, "think")
            
            if description_tag and think_tag:
                # Create new sample; the tags are re-added around the
                # extracted bodies.
                new_sample = {
                    "description": f"<description>{description_tag}</description>",
                    "reasoning": f"<think>{think_tag}</think>",
                    "completion": sample['completion']
                }
                new_data.append(new_sample)
            else:
                c3 += 1
        else:
            c2 += 1
            

In [25]:
c2


1293

In [26]:
len(new_data)

3997

In [29]:
from datasets import Dataset
    
new_dataset = Dataset.from_list(new_data)

In [None]:
from huggingface_hub import HfApi

repo_id = "TruongSinhAI/cad_reasoning"
# Never keep the access token in the notebook: read it from the environment
# (e.g. set HF_TOKEN in the shell or in the .env file loaded by load_dotenv()).
# A missing variable fails here with a KeyError instead of at upload time.
token = os.environ["HF_TOKEN"]
new_dataset.push_to_hub(repo_id, private=False, token=token, split="range_500_1000_vi_0_50000")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/875 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/TruongSinhAI/cad_reasoning/commit/500c48474aea836d3a5929a195719399a20bdd38', commit_message='Upload dataset', commit_description='', oid='500c48474aea836d3a5929a195719399a20bdd38', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/TruongSinhAI/cad_reasoning', endpoint='https://huggingface.co', repo_type='dataset', repo_id='TruongSinhAI/cad_reasoning'), pr_revision=None, pr_num=None)

In [15]:
data = load_dataset("TruongSinhAI/cad_reasoning")

README.md:   0%|          | 0.00/764 [00:00<?, ?B/s]

(…)0_1000_en_0_25000-00000-of-00001.parquet:   0%|          | 0.00/9.88M [00:00<?, ?B/s]

(…)00_en_25000_50000-00000-of-00001.parquet:   0%|          | 0.00/2.61M [00:00<?, ?B/s]

(…)1000_en_50000_end-00000-of-00001.parquet:   0%|          | 0.00/4.84M [00:00<?, ?B/s]

Generating range_500_1000_en_0_25000 split:   0%|          | 0/6152 [00:00<?, ? examples/s]

Generating range_500_1000_en_25000_50000 split:   0%|          | 0/1632 [00:00<?, ? examples/s]

Generating range_500_1000_en_50000_end split:   0%|          | 0/3007 [00:00<?, ? examples/s]

In [18]:
import pandas as pd
from datasets import concatenate_datasets

# Stack the three English splits into one dataset, in this order.
merged_dataset = concatenate_datasets([
    data['range_500_1000_en_0_25000'],
    data['range_500_1000_en_25000_50000'],
    data['range_500_1000_en_50000_end']
])

# Convert to a pandas DataFrame
df = merged_dataset.to_pandas()



In [20]:
df.to_csv("merged_dataset.csv", index=False)

In [21]:
df2 = pd.read_csv('h.csv')
df2.head(5)

Unnamed: 0,description,reasoning,completion,description_trans,reasoning_trans
0,<description><part\\_1> - Construct a rectangu...,<think>\n***Step 1: Infer the components that ...,"<json>\n{""parts"":{""part_1"":{""coordinate_system...",<description><part\\_1> - Dựng một lăng trụ đứ...,<think>\n***Bước 1: Suy ra các thành phần sẽ c...
1,<description>Part 1: Cylindrical Object with T...,<think>\n***Step 1: Infer the components that ...,"<json>\n{""parts"":{""part_1"":{""coordinate_system...",<description>Phần 1: Đối tượng hình trụ có đầu...,<think>\n***Bước 1: Suy ra các thành phần sẽ c...
2,<description>**Part 1: A rectangular prism wit...,<think>\n***Step 1: Infer the components that ...,"<json>\n{""parts"":{""part_1"":{""coordinate_system...",<description>**Phần 1: Lăng trụ chữ nhật có cạ...,<think>\n***Bước 1: Suy ra các thành phần sẽ c...
3,<description>Part 1: Rectangular Table Top The...,<think>\n***Step 1: Infer the components that ...,"<json>\n{""parts"":{""part_1"":{""coordinate_system...",<description>Phần 1: Mặt bàn hình chữ nhật Phầ...,<think>\n***Bước 1: Suy ra các thành phần sẽ c...
4,<description> **Part 1: Building a rectangular...,<think>\n***Step 1: Infer the components that ...,"<json>\n{""parts"":{""part_1"":{""coordinate_system...",<description> **Phần 1: Xây dựng một lăng trụ ...,<think>\n***Bước 1: Suy ra các thành phần sẽ c...


In [42]:
# Collect the distinct endings of the translated descriptions to see how
# (and whether) their closing tag survived translation.
# NOTE(review): the recorded output lists 10-character endings, so it looks
# like it was produced with a different slice length than the one below.
x = set()
for row in df2['description_trans']:
    
    # x.add(row[row.find('<'):row.find('>')+1])
    x.add(row[-12:])
x

{' trình đùn',
 ' tính năng',
 '. </mô tả>',
 '.0</mô tả>',
 '0]</mô tả>',
 '44</mô tả>',
 '48</mô tả>',
 '5 </mô tả>',
 '7.</mô tả>',
 '75</mô tả>',
 ': Nối thân',
 '\\n</mô tả>',
 'ao tác đùn',
 'c tính nối',
 'g khối mới',
 'h năng cắt',
 'hảo đã đùn',
 'n vật liệu',
 'pháp tuyến',
 'ration"\n,"',
 'scription>',
 't động nối',
 'ớc khi đùn',
 'ủa bộ phận'}

In [44]:
# Rebuild the Vietnamese samples with the original English tag names.
als = []
for _, row in df2.iterrows():
    # The tag names were translated together with the text; map them back.
    description = row['description_trans'].replace('<mô tả>', '<description>')
    description = description.replace('</mô tả>', '</description>')
    # Close the description when its closing tag was lost. Checking the real
    # tag (ignoring trailing whitespace) instead of only the last character
    # avoids a duplicated tag after a trailing newline, still closes text that
    # merely ends in some other ">", and does not fail on an empty string.
    if not description.rstrip().endswith('</description>'):
        description += '</description>'

    als.append({
        "description": description,
        "reasoning": row['reasoning_trans'],
        "completion": row['completion'],
    })


In [48]:
from datasets import Dataset, DatasetDict
ds = Dataset.from_list(als)


In [49]:
ds

Dataset({
    features: ['description', 'reasoning', 'completion'],
    num_rows: 10791
})

In [None]:
from huggingface_hub import HfApi

repo_id = "TruongSinhAI/cad_reasoning"
# Never keep the access token in the notebook: read it from the environment
# (e.g. set HF_TOKEN in the shell or in the .env file loaded by load_dotenv()).
# A missing variable fails here with a KeyError instead of at upload time.
token = os.environ["HF_TOKEN"]
ds.push_to_hub(repo_id, private=False, token=token, split="data_vi")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/TruongSinhAI/cad_reasoning/commit/e2a512d0be63fc629a83148bea531e59223d4e61', commit_message='Upload dataset', commit_description='', oid='e2a512d0be63fc629a83148bea531e59223d4e61', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/TruongSinhAI/cad_reasoning', endpoint='https://huggingface.co', repo_type='dataset', repo_id='TruongSinhAI/cad_reasoning'), pr_revision=None, pr_num=None)