In [None]:
import os
import random
import zipfile
from sklearn.model_selection import train_test_split

def _list_files_sorted(folder):
    """Return the paths of the regular files directly inside *folder*, sorted."""
    candidates = (os.path.join(folder, name) for name in os.listdir(folder))
    return sorted(path for path in candidates if os.path.isfile(path))


def create_multiple_train_sets(original_folder, manipulated_folder, output_dir, num_sessions=5, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    """Create per-session zip archives of original/manipulated image splits.

    The original files are split once into train/val/test; that split is
    shared by every session. The manipulated files are divided positionally
    (in sorted order): the first ``n_val`` go to validation, the next
    ``n_test`` to test, and the rest form a pool. For each session the pool is
    reshuffled and its first ``n_train`` entries become that session's
    manipulated training files.

    Output layout: ``<output_dir>/session_<k>/train.zip`` for every session;
    ``val.zip`` and ``test.zip`` are written only into ``session_1`` because
    they are identical for all sessions.

    Args:
        original_folder: Directory containing the original files.
        manipulated_folder: Directory containing the manipulated files.
        output_dir: Directory under which the ``session_<k>`` folders are
            created.
        num_sessions: Number of training sessions to generate.
        train_ratio: Fraction of the original files used for training.
        val_ratio: Fraction of the original files used for validation.
        test_ratio: Currently unused; the test split always receives the
            original files left over after the train and validation splits.
        seed: Seed for the global ``random`` module and for
            ``train_test_split``.

    Raises:
        ValueError: If the file count and ratios would leave the train,
            validation, or test split of the originals empty.

    Note:
        If the manipulated folder holds fewer than
        ``n_val + n_test + n_train`` files, the manipulated splits are simply
        shorter than their original counterparts; no error is raised.
    """
    random.seed(seed)

    # os.listdir() yields entries in arbitrary order, so the listings are
    # sorted: without this neither the seeded splits nor the positional
    # slicing of the manipulated files would be reproducible across runs.
    original_files = _list_files_sorted(original_folder)
    manipulated_files = _list_files_sorted(manipulated_folder)

    # Calculate counts for each split based on original images
    n_original = len(original_files)
    n_train = int(n_original * train_ratio)
    n_val = int(n_original * val_ratio)
    n_test = n_original - n_train - n_val  # remainder; test_ratio is not consulted

    # train_test_split rejects empty partitions with a generic message; fail
    # early (same exception type) with one that names the actual problem.
    if min(n_train, n_val, n_test) < 1:
        raise ValueError(
            f"Cannot split {n_original} original files into non-empty train/val/test sets "
            f"(train={n_train}, val={n_val}, test={n_test}); "
            "provide more files or adjust train_ratio/val_ratio."
        )

    # Split original images into train, val, and test sets (consistent for each session)
    train_original, temp_original = train_test_split(original_files, train_size=n_train, random_state=seed)
    val_original, test_original = train_test_split(temp_original, test_size=n_test, random_state=seed)

    # Split manipulated images for constant validation and test sets
    val_manipulated = manipulated_files[:n_val]
    test_manipulated = manipulated_files[n_val:n_val + n_test]
    train_manipulated_pool = manipulated_files[n_val + n_test:]  # Remaining manipulated for training sessions

    # Generate multiple training sessions with different manipulated samples for train set only
    for session_num in range(1, num_sessions + 1):
        # The pool is shuffled in place, so each session's order builds on the
        # previous session's shuffle.
        random.shuffle(train_manipulated_pool)
        train_manipulated = train_manipulated_pool[:n_train]

        # Save each split to separate zip files for this session
        session_output_dir = os.path.join(output_dir, f"session_{session_num}")
        os.makedirs(session_output_dir, exist_ok=True)

        save_split_to_zip("train", train_original, train_manipulated, os.path.join(session_output_dir, "train.zip"))

        # Save constant validation and test splits for each session
        if session_num == 1:  # Only need to save val and test once, as they remain the same
            save_split_to_zip("val", val_original, val_manipulated, os.path.join(session_output_dir, "val.zip"))
            save_split_to_zip("test", test_original, test_manipulated, os.path.join(session_output_dir, "test.zip"))

        print(f"Session {session_num} zip files created with balanced train, val, and test sets.")

def save_split_to_zip(split_name, originals, manipulated, output_zip_path):
    """Write one dataset split to a zip archive and print a summary.

    Files are stored in the archive as ``<split_name>/original/<basename>``
    and ``<split_name>/manipulated/<basename>``. A path that does not exist is
    skipped, as is any path whose archive name has already been written (for
    example two source files sharing a basename within the same class).

    Args:
        split_name: Name of the split; used as the top-level archive folder
            and, capitalized, in the printed summary.
        originals: Iterable of paths to original files.
        manipulated: Iterable of paths to manipulated files.
        output_zip_path: Destination path of the zip file (overwritten if it
            already exists, since the archive is opened in ``'w'`` mode).

    The printed summary reports the number of files actually written to the
    archive, not the number of paths requested.
    """
    written_arcnames = set()

    def add_group(zipf, label, paths):
        """Add *paths* under ``<split_name>/<label>/``; return how many were written."""
        count = 0
        for img_path in paths:
            if not os.path.exists(img_path):
                continue
            arcname = os.path.join(split_name, label, os.path.basename(img_path))
            if arcname in written_arcnames:
                continue
            zipf.write(img_path, arcname)
            written_arcnames.add(arcname)
            count += 1
        return count

    with zipfile.ZipFile(output_zip_path, 'w') as zipf:
        n_originals = add_group(zipf, "original", originals)
        n_manipulated = add_group(zipf, "manipulated", manipulated)

    print(f"{split_name.capitalize()} set created with {n_originals} originals and {n_manipulated} manipulated images in {output_zip_path}")

# Example usage: the three folder paths below are example values; point them
# at real directories before running.
original_folder = 'input/folder'
manipulated_folder = 'output/folder'
output_dir = 'output/folder/location'

# Generate five training sessions using the default ratios and seed.
create_multiple_train_sets(
    original_folder=original_folder,
    manipulated_folder=manipulated_folder,
    output_dir=output_dir,
    num_sessions=5,
)