In [2]:
pip install datasets

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.18-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata 

In [5]:
from datasets import load_dataset

# Load the full dataset
dataset = load_dataset("jpawan33/fkr30k-image-captioning-dataset", split="train")

# Select only the first 5k examples
dataset_small = dataset.select(range(1000))

print(len(dataset_small))  # Should print 5000
dataset_small.save_to_disk("fkr5k-dataset")


1000


Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 10656.31 examples/s]


In [2]:
pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sniffio (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0

In [3]:
from datasets import load_from_disk
import os
import json
from PIL import Image
from tqdm import tqdm
import time

# Use Google Translate
from googletrans import Translator

# Load dataset
dataset = load_from_disk("fkr5k-dataset")

# Prepare output folders
os.makedirs("dataset_images", exist_ok=True)

# Translator setup
translator = Translator()

# Progress files
progress_file = "progress.json"
output_txt_file = "translated_dataset.txt"

# Load progress if exists
translated_lines = []
start_idx = 0

if os.path.exists(progress_file):
    with open(progress_file, "r", encoding="utf-8") as f:
        progress = json.load(f)
        translated_lines = progress.get("translated_lines", [])
        start_idx = len(translated_lines)
    print(f"Resuming translation from index {start_idx}...")
else:
    print("Starting new translation...")

# Start translating
for idx, example in enumerate(tqdm(dataset, desc="Translating dataset")):
    if idx < start_idx:
        continue  # Already translated

    text = example["text"]
    image = example["image"]

    try:
        translated_text = translator.translate(text, src='en', dest='km').text
    except Exception as e:
        print(f"Error translating '{text}': {e}")
        translated_text = None

    if translated_text is not None:
        # Save image
        image_path = f"dataset_images/image_{idx+1}.jpg"
        image.save(image_path)

        # Save translation
        line = f"image_{idx+1}.jpg \"{translated_text}\""
        translated_lines.append(line)

        # Save progress every 50 items
        if idx % 50 == 0:
            with open(progress_file, "w", encoding="utf-8") as f:
                json.dump({"translated_lines": translated_lines}, f, ensure_ascii=False, indent=2)

    else:
        print(f"Skipping index {idx} due to translation error.")

    # Optional: delay
    time.sleep(0.5)

# Save final text file
try:
    with open(output_txt_file, "w", encoding="utf-8") as f:
        for line in translated_lines:
            f.write(line + "\n")
    print(f"✅ All translations saved successfully to {output_txt_file}")
except Exception as e:
    print(f"Error saving final text file: {e}")

# Clean up progress file
if os.path.exists(progress_file):
    os.remove(progress_file)


Resuming translation from index 51...


Translating dataset:  37%|███▋      | 373/1000 [09:55<20:04,  1.92s/it]

Error translating 'Men with reflective safety jackets on are working on a street intersection with many orange reflective cones .': The read operation timed out
Skipping index 373 due to translation error.


Translating dataset:  55%|█████▍    | 549/1000 [15:33<14:51,  1.98s/it]

Error translating 'A man leans against a rock pillar with his back to an ongoing parade decorated with white and pink balloon arch .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 549 due to translation error.


Translating dataset: 100%|██████████| 1000/1000 [30:02<00:00,  1.80s/it]

✅ All translations saved successfully to translated_dataset.txt





In [2]:
import re
import os

# File to clean
input_file = "translated_dataset.txt"

with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Function to clean text and fix filenames
def clean_line(line):
    # First, split between image name and caption
    parts = line.strip().split(" ", 1)
    
    if len(parts) != 2:
        return None  # Skip malformed lines

    image_name, caption = parts

    # Fix the image name: insert underscore and .jpg properly
    # Assuming image name format is like 'image1jpg' -> 'image_1.jpg'
    if image_name.startswith("image") and image_name.endswith("jpg"):
        index = image_name[5:-3]  # Extract number between 'image' and 'jpg'
        fixed_image_name = f"image_{index}.jpg"
    else:
        fixed_image_name = image_name  # fallback if strange format

    # Clean the caption: remove " and special characters
    caption = caption.replace('"', '')
    caption = re.sub(r"[!@#$%^&*()_+=\[\]{}\\|:;'<>,.?/~`]", '', caption)
    caption = caption.strip()

    return f"{fixed_image_name} {caption}"

# Clean all lines
cleaned_lines = []
for line in lines:
    cleaned = clean_line(line)
    if cleaned:
        cleaned_lines.append(cleaned)

# Overwrite the original file
with open(input_file, "w", encoding="utf-8") as f:
    for line in cleaned_lines:
        f.write(line + "\n")

print(f"✅ Successfully cleaned and fixed filenames in {input_file}")


✅ Successfully cleaned and fixed filenames in translated_dataset.txt


In [1]:
from datasets import load_dataset

# Load the full dataset
dataset = load_dataset("jpawan33/fkr30k-image-captioning-dataset", split="train")

# Select only the first 5k examples
dataset_small = dataset.select(range(5000))

print(len(dataset_small))  # Should print 5000
dataset_small.save_to_disk("fkr5k-dataset-5k")

  from .autonotebook import tqdm as notebook_tqdm


5000


Saving the dataset (1/1 shards): 100%|██████████| 5000/5000 [00:00<00:00, 11071.60 examples/s]


In [1]:
from datasets import load_from_disk
import os
import json
from PIL import Image
from tqdm import tqdm
import time

# Use Google Translate
from googletrans import Translator

# Load dataset
dataset = load_from_disk("fkr5k-dataset-5k")

# Prepare output folders
os.makedirs("dataset_images", exist_ok=True)

# Translator setup
translator = Translator()

# Progress files
progress_file = "progress5k.json"
output_txt_file = "translated_dataset5k.txt"

# Load progress if exists
translated_lines = []
start_idx = 0

if os.path.exists(progress_file):
    with open(progress_file, "r", encoding="utf-8") as f:
        progress = json.load(f)
        translated_lines = progress.get("translated_lines", [])
        start_idx = len(translated_lines)
    print(f"Resuming translation from index {start_idx}...")
else:
    print("Starting new translation...")

# Start translating
for idx, example in enumerate(tqdm(dataset, desc="Translating dataset")):
    if idx < start_idx:
        continue  # Already translated

    text = example["text"]
    image = example["image"]

    try:
        translated_text = translator.translate(text, src='en', dest='km').text
    except Exception as e:
        print(f"Error translating '{text}': {e}")
        translated_text = None

    if translated_text is not None:
        # Save image
        image_path = f"dataset_images/image_{idx+1}.jpg"
        image.save(image_path)

        # Save translation
        line = f"image_{idx+1}.jpg \"{translated_text}\""
        translated_lines.append(line)

        # Save progress every 50 items
        if idx % 50 == 0:
            with open(progress_file, "w", encoding="utf-8") as f:
                json.dump({"translated_lines": translated_lines}, f, ensure_ascii=False, indent=2)

    else:
        print(f"Skipping index {idx} due to translation error.")

    # Optional: delay
    time.sleep(0.5)

# Save final text file
try:
    with open(output_txt_file, "w", encoding="utf-8") as f:
        for line in translated_lines:
            f.write(line + "\n")
    print(f"✅ All translations saved successfully to {output_txt_file}")
except Exception as e:
    print(f"Error saving final text file: {e}")

# Clean up progress file
if os.path.exists(progress_file):
    os.remove(progress_file)


  from .autonotebook import tqdm as notebook_tqdm


Resuming translation from index 51...


Translating dataset:  23%|██▎       | 1151/5000 [25:57<2:12:15,  2.06s/it]

Error translating 'A small boy wearing a diaper stands near the door and is covered in marker .': The read operation timed out
Skipping index 1151 due to translation error.


Translating dataset:  50%|████▉     | 2485/5000 [1:08:20<1:03:54,  1.52s/it]

Error translating 'Four dogs play together on a grassy and leafy ground .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 2485 due to translation error.


Translating dataset:  50%|████▉     | 2486/5000 [1:08:21<57:24,  1.37s/it]  

Error translating 'A Father is observing his son , Kurt to see if his teaching prevents Kurt from cutting himself as he shave with a razor for the first time .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 2486 due to translation error.


Translating dataset:  50%|████▉     | 2487/5000 [1:08:22<52:24,  1.25s/it]

Error translating 'The man in the red , hooded sweatshirt looks back and construction is taking place on a shop with Hebrew lettering .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 2487 due to translation error.


Translating dataset:  50%|█████     | 2500/5000 [1:08:49<1:45:11,  2.52s/it]

Error translating 'The female with the dark shirt cutting the hair of the female in the red shirt .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 2500 due to translation error.


Translating dataset:  56%|█████▌    | 2811/5000 [1:18:45<1:08:30,  1.88s/it]

Error translating 'A man in a blue jersey and orange visor threw a frisbee along a grass hill .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 2811 due to translation error.


Translating dataset:  64%|██████▍   | 3225/5000 [1:32:20<1:01:03,  2.06s/it]

Error translating 'Three boys in sports casual clothing are posing in front of a blue building': The read operation timed out
Skipping index 3225 due to translation error.


Translating dataset:  65%|██████▌   | 3256/5000 [1:33:28<55:50,  1.92s/it]  

Error translating 'Two women are giving each other a hug while a man holding a glass is looking at the camera .': The read operation timed out
Skipping index 3256 due to translation error.


Translating dataset:  76%|███████▌  | 3812/5000 [1:51:52<37:50,  1.91s/it]  

Error translating 'Four workers walking in a field with a desert in the background .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 3812 due to translation error.


Translating dataset:  90%|████████▉ | 4477/5000 [2:13:46<16:03,  1.84s/it]

Error translating 'A female police officer , wearing an officer 's hat and sunglasses , stands in uniform in front of a window-lined street block .': the JSON object must be str, bytes or bytearray, not NoneType
Skipping index 4477 due to translation error.


Translating dataset:  92%|█████████▏| 4584/5000 [2:17:09<10:46,  1.55s/it]

Error translating 'An Asian store with people walking throughout and a big red arrow pointing left .': The read operation timed out
Skipping index 4584 due to translation error.


Translating dataset: 100%|██████████| 5000/5000 [2:30:10<00:00,  1.80s/it]

✅ All translations saved successfully to translated_dataset5k.txt





In [5]:
import re
import os

# File to clean
input_file = "translated_dataset5k.txt"

with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Function to clean text and fix filenames
def clean_line(line):
    # First, split between image name and caption
    parts = line.strip().split(" ", 1)
    
    if len(parts) != 2:
        return None  # Skip malformed lines

    image_name, caption = parts

    # Clean image name: handle formats like 'image___1...jpg'
    match = re.search(r'image_+(\d+)\.*jpg', image_name)
    if match:
        index = match.group(1)
        fixed_image_name = f"image_{index}.jpg"
    else:
        # Try to fallback to known pattern
        match = re.search(r'image(\d+)\.*jpg', image_name)
        if match:
            index = match.group(1)
            fixed_image_name = f"image_{index}.jpg"
        else:
            fixed_image_name = image_name  # keep as-is if unrecognized

    # Clean the caption: remove " and special characters (including Khmer period "។")
    caption = caption.replace('"', '')
    caption = re.sub(r"[!@#$%^&*()_+=\[\]{}\\|:;'<>,.?/~`។]", '', caption)
    caption = caption.strip()

    return f"{fixed_image_name} {caption}"

# Clean all lines
cleaned_lines = []
for line in lines:
    cleaned = clean_line(line)
    if cleaned:
        cleaned_lines.append(cleaned)

# Overwrite the original file
with open(input_file, "w", encoding="utf-8") as f:
    for line in cleaned_lines:
        f.write(line + "\n")

print(f"✅ Successfully cleaned and fixed filenames in {input_file}")


✅ Successfully cleaned and fixed filenames in translated_dataset5k.txt
