In [57]:
from datasets import load_dataset, Dataset, DatasetDict 
import math
import pandas as pd
import math
import os
from huggingface_hub import create_repo, upload_folder
import json
from google import genai
from google.genai import types
from pathlib import Path
import concurrent.futures


In [58]:
class Gemini:
    def __init__(self, api_key):
        self.client = genai.Client(api_key = api_key)
        self.model_name = "gemini-2.5-flash-preview-04-17"
        self.config()

    def config(self):

        self.generate_content_config = types.GenerateContentConfig(
            response_mime_type="application/json",
            system_instruction=[
                types.Part.from_text(text = """
You will be given multiple English paragraphs.

Your task:
- Translate each paragraph independently into Vietnamese.
- Maintain the paragraph order.
- Return the translations as a JSON array of strings.

Important:
- Escape all special characters properly in the JSON output (e.g., double quotes as \", backslashes as \\) to ensure it can be parsed using JSON.parse().

Format strictly like this (and nothing else):

[
  "Translation of paragraph 1",
  "Translation of paragraph 2",
  "Translation of paragraph 3",
  ...
]

Do not return any explanations. Only return the JSON array.
"""),
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=0)
        )
        
    def ask_gemini(self, paragraphs: list[str]) -> list[str]:
        contents = {
            f'paragraph {i}': paragraphs[i] for i in range(len(paragraphs)) 
        }
        
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=str(contents),
            config=self.generate_content_config,
        )

        try:
            data  = json.loads(response.text)
            if len(data) == len(paragraphs):
                return data
            else:
                return [''] * len(paragraphs)
        except json.JSONDecodeError:
            data = response.text.split('",\n')
            if len(data) == len(paragraphs):
                return data
            raise ValueError("Failed to parse response as JSON. Raw output:\n" + response.text)



In [None]:
api_keys = [
    """
    # Add your Gemini API keys here
    """
]

In [5]:
# dataset = load_dataset("TruongSinhAI/deepcad_prompt_json")

# merged_df = pd.concat([
#     dataset["train"].to_pandas(),
#     dataset["validation"].to_pandas(),
#     dataset["test"].to_pandas()
# ], ignore_index=True)

In [60]:
# # Calculate the split size (same length for each)
# split_size = len(merged_df) // 3

# # Create new train, validation, and test datasets
# new_train_df = merged_df.iloc[:split_size]
# new_val_df = merged_df.iloc[split_size:2*split_size]
# new_test_df = merged_df.iloc[2*split_size:]

# #Convert back to datasets
# new_train_ds = Dataset.from_pandas(new_train_df)
# new_val_ds = Dataset.from_pandas(new_val_df)
# new_test_ds = Dataset.from_pandas(new_test_df)

# new_dataset = DatasetDict({
#     'part_1': new_train_ds,
#     'part_2': new_val_ds,
#     'part_3': new_test_ds
# })

# new_dataset

In [7]:
# data_translated = Dataset.load_from_disk('translated_main')

In [8]:
# uids_ = data_translated['uid']

In [9]:
# data_p23 = concatenate_datasets([new_dataset['part_2'], new_dataset['part_3']])

In [10]:
# not_done = []
# for data in data_p23:
#     if data['uid'] not in uids_:
#         not_done.append(data)


In [11]:
# not_done = Dataset.from_list(not_done)

In [12]:
# not_done

## NLI_DATA

In [61]:
df = pd.read_csv('../text2cad_v1.0.csv')
df.head(5)

Unnamed: 0,uid,abstract,beginner,intermediate,expert,all_level_data,nli_data
0,0035/00359148,A rectangular prism with a flat top and bottom...,The CAD model is a rectangular prism with a cu...,The CAD model consists of a rectangular prism ...,"Create the first part of the CAD model, a rect...",\n<level\\_1>\nCreate the first part of the CA...,\n##### Part 1: Rectangular Prism with a Curve...
1,0035/00354394,"A long, thin, rectangular prism with a slightl...","The CAD model features a long, thin, rectangul...","The CAD model consists of a long, thin, rectan...","Create the first part of the CAD model, a long...",\n<level\\_1>\nCreate the first part of the CA...,"\n**Part 1: Long, Thin, Rectangular Prism with..."
2,0035/00351230,A three-dimensional rectangular prism with a ...,The CAD model represents a solid block with a ...,The CAD model is a solid 3D rectangle with a c...,Construct a three-dimensional rectangular pris...,\n<level\\_1>\nConstruct a three-dimensional r...,\n#### Part 1: Three-Dimensional Rectangular P...
3,0035/00358303,A cylindrical object with a slightly tapered ...,The CAD model features a cylindrical object wi...,The CAD model consists of a cylindrical object...,"Create the first part of the CAD model, a cyli...",\n<level\\_1>\nCreate the first part of the CA...,\n **Part 1: Cylindrical Object with Two Ho...
4,0035/00359942,A cylindrical object with a flat top and bott...,The CAD model is a cylindrical object with a s...,The CAD model consists of a cylindrical object...,Create the base of the cylindrical object by f...,\n<level_1>\nCreate the base of the cylindrica...,\n#### **Construct the base of the cylindrical...


In [None]:
df['expert'][100]

"\n##### Part 1: Long Rectangular Prism\n\nThe first part is a long rectangular prism with a flat top and bottom. The prism is centered horizontally and vertically in the image. The top and bottom faces are parallel to each other and perpendicular to the sides of the prism. The sides of the prism are parallel to each other and perpendicular to the top and bottom faces. The prism is the only object in the image.\n\n1. Create a new coordinate system with the following properties:\n   - Euler Angles: (-90.0, 0.0, -90.0)\n   - Translation Vector: (0.0, 0.0, 0.0)\n\n2. Draw a 2D sketch on the XY plane of the coordinate system.\n\n3. Create a rectangle by drawing four connected lines on the sketch:\n   - Line 1: Starting point (0.0, 0.0), Ending point (0.0312, 0.0)\n   - Line 2: Starting point (0.0312, 0.0), Ending point (0.0312, 0.0625)\n   - Line 3: Starting point (0.0312, 0.0625), Ending point (0.0, 0.0625)\n   - Line 4: Starting point (0.0, 0.0625), Ending point (0.0, 0.0)\n\n4. Scale th

In [67]:
{f'paragraph {i}': data['input'][i] for i in range(36)}

{'paragraph 0': 'C',
 'paragraph 1': 'o',
 'paragraph 2': 'n',
 'paragraph 3': 's',
 'paragraph 4': 't',
 'paragraph 5': 'r',
 'paragraph 6': 'u',
 'paragraph 7': 'c',
 'paragraph 8': 't',
 'paragraph 9': ' ',
 'paragraph 10': 't',
 'paragraph 11': 'h',
 'paragraph 12': 'e',
 'paragraph 13': ' ',
 'paragraph 14': 'f',
 'paragraph 15': 'i',
 'paragraph 16': 'r',
 'paragraph 17': 's',
 'paragraph 18': 't',
 'paragraph 19': ' ',
 'paragraph 20': 'i',
 'paragraph 21': 'n',
 'paragraph 22': 't',
 'paragraph 23': 'e',
 'paragraph 24': 'r',
 'paragraph 25': 'm',
 'paragraph 26': 'e',
 'paragraph 27': 'd',
 'paragraph 28': 'i',
 'paragraph 29': 'a',
 'paragraph 30': 't',
 'paragraph 31': 'e',
 'paragraph 32': ' ',
 'paragraph 33': 'p',
 'paragraph 34': 'a',
 'paragraph 35': 'r'}

In [13]:
def save_result(results: list[dict], index: int, save_dir="results_part3", filename_prefix="translation"):
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{filename_prefix}_{index}.json")

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    
    # print(f"Saved {len(results)} items to {save_path}")


In [46]:
def process_data(gemini: Gemini, data_chunk: list[dict], worker_id: int, batch_size: int = 36):
    results = []
    for i in tqdm(range(0, len(data_chunk), batch_size), desc=f"Worker {worker_id}", position=worker_id):
        batch = data_chunk[i:i + batch_size]
        inputs = batch['input']
        try:
            response = gemini.ask_gemini(inputs)
            batch['input_vi'] = response
            batch_dicts = [dict(zip(batch, values)) for values in zip(*batch.values())]
            results += batch_dicts
        except Exception as e:
            print(f"Worker {worker_id} - Error in batch {i}: {e}")
            continue
        
        # Lưu từng batch
        # save_result(batch_dicts, index=f"{worker_id}_{i}", save_dir="results_not_done", filename_prefix="part")
    return results


In [15]:
n_workers = len(api_keys)
gemini_clients = [Gemini(api_key) for api_key in api_keys]



In [43]:
from datasets import Dataset
from datasets import concatenate_datasets

def chunk_data(dataset: Dataset, n_chunks: int) -> list[Dataset]:
    length = len(dataset)
    chunk_size = length // n_chunks
    chunks = [Dataset.from_dict(dataset[i * chunk_size : (i + 1) * chunk_size]) for i in range(n_chunks)]

    # Xử lý phần dư nếu có
    remainder_start = n_chunks * chunk_size
    if remainder_start < length:
        remainder = dataset.select(range(remainder_start, length))  # đây là Dataset
        chunks[-1] = concatenate_datasets([chunks[-1], remainder])

    return chunks


dataset = xxx#new_dataset['part_1']

data_chunks = chunk_data(dataset, n_workers)

In [44]:
len(data_chunks)

18

In [96]:
# save_result(all_results, 0000, 'part2')

In [45]:
from tqdm import tqdm
all_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
    futures = [
        executor.submit(process_data, gemini_clients[i], data_chunks[i], i, batch_size=20)
        for i in range(n_workers)
    ]
    for future in concurrent.futures.as_completed(futures):
        all_results.extend(future.result())

Worker 0:   0%|                                                                                 | 0/12 [00:00<?, ?it/s]
Worker 1:   0%|                                                                                 | 0/12 [00:00<?, ?it/s][A

Worker 2:   0%|                                                                                 | 0/12 [00:00<?, ?it/s][A[A


Worker 3:   0%|                                                                                 | 0/12 [00:00<?, ?it/s][A[A[A



Worker 4:   0%|                                                                                 | 0/12 [00:00<?, ?it/s][A[A[A[A





Worker 6:   0%|                                                                                 | 0/12 [00:00<?, ?it/s][A[A[A[A[A[A




Worker 5:   0%|                                                                                 | 0/12 [00:00<?, ?it/s][A[A[A[A[A






Worker 7:   0%|                                                             

Worker 2 - Error in batch 180: Failed to parse response as JSON. Raw output:
[
  "Tạo hệ tọa độ mới với góc Euler (0.0, 0.0, -90.0) và vectơ tịnh tiến (0.0, 0.0308, 0.0) cho Part 1, một tấm kim loại hình chữ nhật có lỗ tròn ở giữa.\n\nTrên mặt phẳng được xác định bởi hệ tọa độ, tạo một bản phác thảo bao gồm một mặt (Mặt 1) với năm vòng lặp: Vòng lặp 1 (Hình chữ nhật), Vòng lặp 2 (Hình tròn), Vòng lặp 3 (Hình dạng phức tạp với cung và đường thẳng), Vòng lặp 4 (Hình tròn), và Vòng lặp 5 (Hình tròn).\n\n- Vòng lặp 1 (Hình chữ nhật): Bốn đường thẳng với điểm bắt đầu và điểm kết thúc như sau: [Đường 1: (0.0, 0.0), (0.75, 0.0)], [Đường 2: (0.75, 0.0), (0.75, 0.347)], [Đường 3: (0.75, 0.347), (0.0, 0.347)], [Đường 4: (0.0, 0.347), (0.0, 0.0)]\n- Vòng lặp 2 (Hình tròn): Tâm tại (0.1253, 0.1735), Bán kính 0.0193\n- Vòng lặp 3 (Hình dạng phức tạp với cung và đường thẳng)\n  + Cung 1: Điểm bắt đầu (0.1812, 0.1321), Điểm giữa (0.1869, 0.1184), Điểm kết thúc (0.2005, 0.1128)\n  + Đường 1: Điểm bắt 





Worker 4:  83%|████████████████████████████████████████████████████████████            | 10/12 [06:19<01:21, 40.83s/it][A[A[A[A















Worker 16:  83%|███████████████████████████████████████████████████████████▏           | 10/12 [06:26<01:21, 40.76s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A








Worker 9:  83%|████████████████████████████████████████████████████████████            | 10/12 [06:26<01:20, 40.38s/it][A[A[A[A[A[A[A[A[A
















Worker 17:  83%|███████████████████████████████████████████████████████████▏           | 10/12 [06:30<01:15, 37.65s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A













Worker 14:  83%|███████████████████████████████████████████████████████████▏           | 10/12 [06:31<01:19, 39.81s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A














Worker 15:  75%|██████████████████████████████████████████████████████                  | 9/12 [06:32<02:09, 43.22s/it][A[A[A[A[A[A[A[A[A[A

In [24]:
len(dataset)

57840

In [47]:
len(all_results) 

3976

In [48]:
count_none = 0
xxx = []
success = []
for i in all_results:
    if i['input_vi'] == '':
        count_none += 1
        xxx.append(i)
    success.append(i['uid'])
        # print(i)
        # break
count_none

0

In [56]:
save_result(all_results, 0000, 'part1_2')

In [54]:
not_xxx2 = []
for i in all_results:
    if i['uid'] in not_xxx:
        not_xxx2.append(i)


In [51]:
len(not_xxx)

3996

In [37]:
not_xxx += xxx

In [55]:
len(not_xxx2)

0

In [42]:
xxx = Dataset.from_list(not_xxx)
xxx

Dataset({
    features: ['uid', 'input', 'output', 'instruction'],
    num_rows: 3996
})

In [170]:
def read_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return data
data_sv2 = read_jsonl('part1_/translation_0.json')

In [180]:
axasax = 0
for data in data_sv:
    if data['input_vi'] == '':
        axasax += 1
axasax

42

In [181]:
kk = {
    
}
for i in all_results:
    kk[i['uid']] = i
len(kk)

294

In [182]:
ax = []
for data in data_sv:
    if data['input_vi'] == '':
        data['input_vi'] = kk[data['uid']]['input_vi'] 
        ax.append(kk[data['uid']]['uid'])
len(ax)

42

In [183]:
for data in all_results:
    if data['uid'] not in ax:
        data_sv.append(data)

In [184]:
len(data_sv)

11292

In [150]:
for data in data_sv2:
    if data['input_vi'] == '':
        data['input_vi'] = kk[data['uid']]

In [158]:
www = []
idx = []
for data in data_sv:
    if data['input_vi'] == '':
        www.append(data)
    idx.append(data['uid'])


In [159]:
len(idx)

11040

In [163]:
11040+294-42

11292

In [161]:
for data in not_done:
    if data['uid'] not in idx:
        www.append(data)

In [185]:
save_result(data_sv, 0000, 'done_p23')