In [None]:
import os
import pandas as pd
from PIL import Image
import io

# Load the parquet dataset whose 'image1' / 'image2' cells are exported below.
input_parquet = '/home/seif_elkerdany/projects/data/cleaned_data38K.parquet'
df = pd.read_parquet(input_parquet)

# One sub-folder per image column, both living under a common root.
base_output_dir = "/home/seif_elkerdany/projects/data/38K_images"
img1_dir, img2_dir = (
    os.path.join(base_output_dir, _subdir) for _subdir in ("image1", "image2")
)

# exist_ok=True makes re-running the cell safe when the folders already exist.
for _out_dir in (img1_dir, img2_dir):
    os.makedirs(_out_dir, exist_ok=True)

def extract_image_bytes(image_cell, col_name, index):
    """
    Pull the image payload out of a single DataFrame cell.

    Resolution order:
      * not a dict            -> the cell is returned untouched (treated as
                                 the raw bytes already);
      * dict with 'bytes'     -> the value stored under 'bytes';
      * dict with one entry   -> that entry's value, whatever its key is;
      * any other dict        -> a diagnostic naming ``col_name``, ``index``
                                 and the dict keys is printed and None is
                                 returned.

    ``col_name`` and ``index`` are used only to build the diagnostic message.
    """
    # Guard clause: anything that is not a dict is passed straight through.
    if not isinstance(image_cell, dict):
        return image_cell

    if 'bytes' in image_cell:
        return image_cell['bytes']

    # Empty dicts and dicts with several unknown keys are ambiguous.
    if len(image_cell) != 1:
        print(f"{col_name} at index {index} is a dict with keys {list(image_cell.keys())}.")
        return None

    # Exactly one entry left: unpack its value.
    (sole_value,) = image_cell.values()
    return sole_value


def _remove_partial_file(path):
    """Delete ``path`` if present so a failed row leaves no stray image on disk."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"Could not remove partial file {path}: {e}")


def _export_image_pair(index, raw_image1, raw_image2):
    """
    Decode both images of one row and write them as PNG files.

    Parameters:
        index: index label of the row; used in the file names
            (``img1_{index}.png`` / ``img2_{index}.png``) and in messages.
        raw_image1, raw_image2: the row's 'image1' / 'image2' cells, passed
            through ``extract_image_bytes`` before decoding.

    Returns:
        ``(image1_path, image2_path)`` when both files were written, or
        ``None`` when either payload is missing, cannot be opened, or cannot
        be saved. Errors are reported with ``print`` and never raised, so one
        bad row does not abort the whole export.
    """
    image1_bytes = extract_image_bytes(raw_image1, 'image1', index)
    image2_bytes = extract_image_bytes(raw_image2, 'image2', index)

    if image1_bytes is None or image2_bytes is None:
        return None

    try:
        image1 = Image.open(io.BytesIO(image1_bytes))
    except Exception as e:
        print(f"Error processing image1 at index {index}: {e}")
        return None

    # The ``with`` blocks close the images on every exit path, including
    # the early returns below.
    with image1:
        try:
            image2 = Image.open(io.BytesIO(image2_bytes))
        except Exception as e:
            print(f"Error processing image2 at index {index}: {e}")
            return None

        with image2:
            image1_path = os.path.join(img1_dir, f"img1_{index}.png")
            image2_path = os.path.join(img2_dir, f"img2_{index}.png")

            try:
                image1.save(image1_path)
            except Exception as e:
                print(f"Error saving image1 at index {index}: {e}")
                _remove_partial_file(image1_path)
                return None

            try:
                image2.save(image2_path)
            except Exception as e:
                print(f"Error saving image2 at index {index}: {e}")
                # Keep the two folders in sync: no img1 file without its img2.
                _remove_partial_file(image1_path)
                _remove_partial_file(image2_path)
                return None

    return image1_path, image2_path


# New column values are collected positionally and written back after the
# loop; pandas advises against modifying a frame while iterating over it.
image1_paths = []
image2_paths = []
failed_indices = []

# Iterating the two columns directly avoids building a Series per row.
for index, raw_image1, raw_image2 in zip(df.index, df['image1'], df['image2']):
    saved_pair = _export_image_pair(index, raw_image1, raw_image2)
    if saved_pair is None:
        # Failed rows get empty path cells, so the CSV never contains the
        # textual repr of raw image bytes.
        failed_indices.append(index)
        saved_pair = (None, None)
    image1_paths.append(saved_pair[0])
    image2_paths.append(saved_pair[1])

df['image1'] = image1_paths
df['image2'] = image2_paths

if failed_indices:
    print(
        f"{len(failed_indices)} row(s) could not be exported and have empty image paths. "
        f"First failing indices: {failed_indices[:20]}"
    )

output_csv = "/home/seif_elkerdany/projects/data/38K_dataset.csv"
df.to_csv(output_csv, index=False)

print("Dataset updated successfully! The images are stored in the folder:", base_output_dir)

Dataset updated successfully! The images are stored in the folder: /home/seif_elkerdany/projects/data/38K_images


In [5]:
# Show the first five rows of df (five is also the default for head()).
df.head(n=5)

Unnamed: 0,image1,image2,target
0,/home/seif_elkerdany/projects/data/38K_images/...,/home/seif_elkerdany/projects/data/38K_images/...,1
1,/home/seif_elkerdany/projects/data/38K_images/...,/home/seif_elkerdany/projects/data/38K_images/...,1
2,/home/seif_elkerdany/projects/data/38K_images/...,/home/seif_elkerdany/projects/data/38K_images/...,1
3,/home/seif_elkerdany/projects/data/38K_images/...,/home/seif_elkerdany/projects/data/38K_images/...,1
4,/home/seif_elkerdany/projects/data/38K_images/...,/home/seif_elkerdany/projects/data/38K_images/...,1
