In [1]:
from datasets import load_dataset

# Fetch the code-instruction dataset; the identifier is kept in one named
# constant so it is easy to spot and swap.
DATASET_ID = "iamtarun/code_instructions_120k_alpaca"
ds = load_dataset(DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inspect the loaded DatasetDict: lists the available splits (its keys) together
# with each split's feature names and row count.
print(ds)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 121959
    })
})


In [3]:
import json

# Perform 70-30 train-test split (fixed seed so the split is reproducible)
split_dataset = ds['train'].train_test_split(test_size=0.3, seed=42)


def write_jsonl(examples, path, fields=("prompt", "output")):
    """Write ``examples`` to ``path`` as JSON Lines, keeping only ``fields``.

    Args:
        examples: Iterable of mapping-like records (e.g. one dataset split).
        path: Destination file; it is created or overwritten.
        fields: Keys copied from every record, in this order.

    Raises:
        KeyError: If a record lacks one of ``fields``.
    """
    with open(path, "w", encoding="utf-8") as f_out:
        for example in examples:
            json.dump({field: example[field] for field in fields}, f_out)
            f_out.write('\n')


# Both splits go through the same writer so their file formats cannot drift apart.
write_jsonl(split_dataset['train'], "train.jsonl")
write_jsonl(split_dataset['test'], "test.jsonl")


In [1]:
import json
import time

from deep_translator import GoogleTranslator
from tqdm import tqdm

# === Step 1: Translate text ===

# File paths
input_file = "Python Train.jsonl"            # English input
translated_file = "Python_Train_ms.jsonl"    # Intermediate file with both English & Malay
final_output_file = "Python_Train_Data.jsonl"  # Final output: only Malay prompt + code

# Field names
source_field = "text"
translated_field = "text_ms"

# Placeholder stored in the intermediate file when a record could not be translated.
FAILED_MARKER = "[TRANSLATION_FAILED]"

# Retry policy: a single dropped connection should not cost a record.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 2.0


def translate_with_retry(translator, text, attempts=MAX_ATTEMPTS, delay=RETRY_DELAY_SECONDS):
    """Translate ``text`` with ``translator``, retrying when an attempt raises.

    Args:
        translator: Object exposing ``translate(text)``.
        text: The string to translate.
        attempts: Maximum number of calls to ``translator.translate``.
        delay: Base pause in seconds; the pause grows linearly with the
            attempt number (``delay``, ``2 * delay``, ...).

    Returns:
        The translated value, or ``None`` if every attempt raised.
    """
    for attempt in range(1, attempts + 1):
        try:
            return translator.translate(text)
        except Exception as e:
            # tqdm.write keeps the message from garbling the active progress bar.
            tqdm.write(f"⚠️ Translation error: {e}")
            if attempt < attempts:
                time.sleep(delay * attempt)
    return None


translated_data = []

# One translator instance is reused for every record instead of being rebuilt per line.
translator = GoogleTranslator(source="en", target="ms")

print("🔄 Translating English prompts to Malay...")
with open(input_file, "r", encoding="utf-8") as f:
    # Blank lines are not valid JSON records; drop them before parsing.
    lines = [line for line in f if line.strip()]

# Save intermediate JSONL with English + Malay.
# Each record is written and flushed as soon as it is translated, so an
# interruption part-way through a multi-hour run keeps the work already done.
with open(translated_file, "w", encoding="utf-8") as f:
    for line in tqdm(lines):
        item = json.loads(line)
        english_text = item.get(source_field, "")

        malay_text = translate_with_retry(translator, english_text)
        if malay_text is None:
            malay_text = FAILED_MARKER

        item[translated_field] = malay_text
        translated_data.append(item)

        f.write(json.dumps(item, ensure_ascii=False) + "\n")
        f.flush()

print(f"✅ Translated data saved to: {translated_file}")

# === Step 2: Extract only Malay + Code ===

# Records whose translation failed are left out of the final dataset so the
# placeholder string never becomes a prompt; they stay in the intermediate
# file, marked with FAILED_MARKER.
cleaned_data = [
    {
        "text": item[translated_field],
        "code": item.get("code", "")
    }
    for item in translated_data
    if item[translated_field] != FAILED_MARKER
]

skipped_count = len(translated_data) - len(cleaned_data)
if skipped_count:
    print(f"⚠️ Skipped {skipped_count} record(s) with failed translation (still present in {translated_file})")

# Save final simplified JSONL
with open(final_output_file, "w", encoding="utf-8") as f:
    for item in cleaned_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Final cleaned dataset saved to: {final_output_file}")

🔄 Translating English prompts to Malay...


 48%|████▊     | 4413/9263 [1:37:19<9:19:44,  6.92s/it]

⚠️ Translation error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


100%|██████████| 9263/9263 [3:21:00<00:00,  1.30s/it]  


✅ Translated data saved to: Python_Train_ms.jsonl
✅ Final cleaned dataset saved to: Python_Train_Data.jsonl
