In [1]:
from datasets import load_dataset

ds = load_dataset("iamtarun/code_instructions_120k_alpaca")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check keys (splits)
print(ds)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 121959
    })
})


In [3]:
import json

# Perform 70-30 train-test split
split_dataset = ds['train'].train_test_split(test_size=0.3, seed=42)

# Save train data to JSONL
with open("train.jsonl", "w", encoding="utf-8") as f_train:
    for example in split_dataset['train']:
        json.dump({
            "prompt": example["prompt"],
            "output": example["output"]
        }, f_train)
        f_train.write('\n')

# Save test data to JSONL
with open("test.jsonl", "w", encoding="utf-8") as f_test:
    for example in split_dataset['test']:
        json.dump({
            "prompt": example["prompt"],
            "output": example["output"]
        }, f_test)
        f_test.write('\n')


In [6]:
import json
import time
from deep_translator import GoogleTranslator
from tqdm import tqdm

# === File paths ===
input_file = "final_train_ms.jsonl"                 # Existing file
output_file = "final_train_ms_fixed.jsonl"          # Updated version
final_dataset = "Python_Train_Data.jsonl"           # Final output
failure_file = "translation_failures.jsonl"         # Log for failed items

# === Load data ===
with open(input_file, "r", encoding="utf-8") as f:
    data = [json.loads(line.strip()) for line in f if line.strip()]

# === Re-translate only failed entries ===
print("🔁 Re-translating failed entries...")

updated_data = []
failed_items = []

for item in tqdm(data):
    if item.get("text_ms", "").strip() == "[TRANSLATION_FAILED]":
        english_text = item.get("instruction", "")
        
        # Skip if too long
        if len(english_text) > 4500:
            item["text_ms"] = "[TRANSLATION_FAILED_LONG_TEXT]"
            failed_items.append(item)
            updated_data.append(item)
            continue

        try:
            malay_text = GoogleTranslator(source="en", target="ms").translate(english_text)
            item["text_ms"] = malay_text
            time.sleep(1)  # Avoid rate-limiting
        except Exception as e:
            print(f"⚠️ Error: {e}")
            item["text_ms"] = "[TRANSLATION_FAILED]"
            failed_items.append(item)

    updated_data.append(item)

# === Save updated file ===
with open(output_file, "w", encoding="utf-8") as f:
    for item in updated_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Updated file saved: {output_file}")

# === Save failed entries ===
with open(failure_file, "w", encoding="utf-8") as f:
    for item in failed_items:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"⚠️ Failed translations saved to: {failure_file}")

# === Step 2: Prepare Final Dataset (Malay + input + output) ===
final_data = []
for item in updated_data:
    final_data.append({
        "instruction": item.get("text_ms", "[NO_TRANSLATION]"),
        "input": item.get("input", ""),
        "output": item.get("output", "")
    })

with open(final_dataset, "w", encoding="utf-8") as f:
    for item in final_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Final training file saved: {final_dataset}")


🔁 Re-translating failed entries...


100%|██████████| 85371/85371 [04:07<00:00, 344.51it/s] 


✅ Updated file saved: final_train_ms_fixed.jsonl
⚠️ Failed translations saved to: translation_failures.jsonl
✅ Final training file saved: Python_Train_Data.jsonl


In [7]:
import json
import time
import re
from deep_translator import GoogleTranslator
from tqdm import tqdm

# === Load failed long translations ===
with open("translation_failures.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

success = []
failed = []

def chunk_text(text, max_len=4500):
    # Split into sentences using regex (handles '.', '?', '!')
    sentences = re.split(r'(?<=[.?!])\s+', text)
    chunks = []
    current = ""
    for s in sentences:
        if len(current) + len(s) < max_len:
            current += s + " "
        else:
            chunks.append(current.strip())
            current = s + " "
    if current:
        chunks.append(current.strip())
    return chunks

for item in tqdm(data):
    text = item["instruction"]
    if len(text) < 4500:
        failed.append(item)
        continue

    try:
        chunks = chunk_text(text)
        translated_chunks = []
        for chunk in chunks:
            translated = GoogleTranslator(source='en', target='ms').translate(chunk)
            translated_chunks.append(translated)
            time.sleep(1)  # Avoid rate-limiting

        full_translation = " ".join(translated_chunks)
        item["text_ms"] = full_translation
        success.append(item)
    except Exception as e:
        print(f"Failed: {e}")
        item["text_ms"] = "[RETRANSLATION_FAILED]"
        failed.append(item)

# === Save retried translations ===
with open("retranslated_longtext_fixed.jsonl", "w", encoding="utf-8") as f:
    for item in success:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open("still_failed_longtext.jsonl", "w", encoding="utf-8") as f:
    for item in failed:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Retranslated: {len(success)}")
print(f"❌ Still failed: {len(failed)}")


100%|██████████| 4/4 [00:20<00:00,  5.05s/it]

Failed: Use this code to remove \n and \t from island_list and commodity_list respectively and print the updated lists:

```
island_list = ['Armstrong Island', 'Atchafalaya Island', 'Immokalee Island', 'Moultrie Island', 'Sho-ke Island', 'Sirius Island',
               'Tumult Island', 'The Beaufort Islands', "Messier's Crown", 'Nunataq Island', 'Paollu Island', 'Qaniit Island',
               'Ancoraggio Island', 'Fluke Island', 'Kakraphoon Island', 'Eagle Archipelago', 'Cambium Island', "Hubble's Eye",
               'Ilha da Aguia', 'Ix Chel', 'Manu Island', 'Admiral Island', 'Basset Island', 'Bryher Island', 'Cromwell Island',
               'Hook Shelf', 'Isle of Kent', 'Lincoln Island', 'Wensleydale', 'Anegada Island', 'Barnard Island', 'The Lowland Hundred',
               'Lyonesse Island', 'Myvatn Island', 'Arakoua Island', 'Aten Island', 'Barbary Island', 'Caravanserai Island',
               'Kasidim Island', 'Kiwara Island', 'Terjit Island', 'Tichka Plateau', 'Aimuari Islan




In [4]:
import json
import re

# Step 1: Load original data with `prompt`
with open("original train.jsonl", "r", encoding="utf-8") as f:
    original_data = [json.loads(line.strip()) for line in f if line.strip()]

# Step 2: Extract input text from prompt
extracted_inputs = []
for item in original_data:
    prompt = item.get("prompt", "")
    input_match = re.search(r"### Input:\s*(.*?)\s*### Response:", prompt, re.DOTALL)
    if input_match:
        extracted_inputs.append(input_match.group(1).strip())
    else:
        extracted_inputs.append("")  # If input not found, use empty string

# Step 3: Load the final cleaned dataset (without input)
with open("final_train.jsonl", "r", encoding="utf-8") as f:
    final_data = [json.loads(line.strip()) for line in f if line.strip()]

# Step 4: Add input after instruction without changing other fields
updated_data = []
for entry, input_text in zip(final_data, extracted_inputs):
    new_entry = {}

    # Ensure 'instruction' is first
    new_entry["instruction"] = entry.get("instruction", "")
    # Add extracted input
    new_entry["input"] = input_text
    # Add the rest of the keys (order: output, text_ms, etc.)
    for key, value in entry.items():
        if key not in new_entry:
            new_entry[key] = value

    updated_data.append(new_entry)

# Step 5: Save to a new file
with open("final_train_with_input.jsonl", "w", encoding="utf-8") as f:
    for entry in updated_data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


In [1]:
import json

# Read the source file (English with outputs)
with open("cleaned_train.jsonl", "r", encoding="utf-8") as f:
    english_data = [json.loads(line.strip()) for line in f if line.strip()]

# Read the target file (Malay with empty outputs)
with open("Python_Train_Data.jsonl", "r", encoding="utf-8") as f:
    malay_data = [json.loads(line.strip()) for line in f if line.strip()]

# Copy outputs from English to Malay data
for i in range(min(len(english_data), len(malay_data))):
    malay_data[i]["output"] = english_data[i]["output"]

# Write the updated data back to Python_Train_Data.jsonl
with open("Python_Train_Data.jsonl", "w", encoding="utf-8") as f:
    for item in malay_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Copied outputs from {len(english_data)} entries")

✅ Copied outputs from 85371 entries
