In [1]:
## Upload Local Dataset to Hugging Face

In [None]:
import pandas as pd
import re
from datasets import Dataset, Features, Value, Image, DatasetDict
import random

In [None]:
# Load the rows of face_data.csv into a DataFrame. The path is relative, so it
# is resolved against the notebook's current working directory.
df = pd.read_csv('face_data.csv')

# Extract source IDs
def extract_source_id(path):
    """Return the source ID that prefixes ``path``.

    The source ID is the leading run of four underscore-separated digit
    groups (``<digits>_<digits>_<digits>_<digits>``). The pattern is anchored
    at the start of the string, so callers should pass a bare file name
    rather than a full path.

    Args:
        path: String to inspect.

    Returns:
        The matched prefix, or ``None`` when ``path`` does not start with one.
    """
    found = re.match(r'(\d+_\d+_\d+_\d+)', path)
    if found is None:
        return None
    return found.group(1)

# Tag every row with the source ID parsed from the file-name part of its
# input path (everything after the last '/').
df['source_id'] = df['input_path'].apply(
    lambda input_path: extract_source_id(input_path.rsplit('/', 1)[-1])
)

# Shuffle the distinct source IDs with a fixed seed so the split is repeatable.
unique_source_ids = df['source_id'].unique().tolist()
random.seed(42)
random.shuffle(unique_source_ids)

# The last `test_size` fraction of the shuffled IDs becomes the test set.
test_size = 0.05
split_point = int((1 - test_size) * len(unique_source_ids))
train_source_ids, test_source_ids = (
    unique_source_ids[:split_point],
    unique_source_ids[split_point:],
)

# Route rows by source ID, so all rows sharing an ID land in the same split.
source_column = df['source_id']
train_df = df[source_column.isin(train_source_ids)]
test_df = df[source_column.isin(test_source_ids)]

# Populate the Image() columns from the corresponding file-path columns
def load_images(example):
    """Copy each path field of ``example`` into its matching image field.

    ``input_path`` is copied to ``input_image`` and ``output_path`` to
    ``output_image``. The mapping is modified in place and also returned.

    Args:
        example: Mutable mapping holding ``input_path`` and ``output_path``.

    Returns:
        The same ``example`` object with the two image fields set.
    """
    column_pairs = (
        ('input_image', 'input_path'),
        ('output_image', 'output_path'),
    )
    for image_key, path_key in column_pairs:
        example[image_key] = example[path_key]
    return example

# Explicit schema for the uploaded dataset. Column order matches the order of
# the groups below.
features = Features({
    'index': Value('int64'),
    # Numeric parameter columns.
    **{
        column: Value('float64')
        for column in ('contrast', 'gamma', 'sharpness', 'dither')
    },
    'severity': Value('string'),
    # Image-typed columns.
    **{column: Image() for column in ('input_image', 'output_image')},
    # Plain-string columns: the two file paths and the source ID.
    **{
        column: Value('string')
        for column in ('input_path', 'output_path', 'source_id')
    },
})

In [None]:
# Convert each split's DataFrame into a Dataset with the declared schema.
# preserve_index=False is required here: train_df and test_df are
# boolean-filtered slices of df, so their index is no longer a RangeIndex and
# would otherwise be exported as an extra `__index_level_0__` column that
# `features` does not declare.
train_dataset = Dataset.from_pandas(train_df, features=features, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, features=features, preserve_index=False)

# Fill the image columns from the path columns, one row at a time.
train_dataset = train_dataset.map(load_images)
test_dataset = test_dataset.map(load_images)

# Bundle both splits under the names 'train' and 'test'.
split_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [None]:
# Upload the train/test DatasetDict to the Hub repository "Satrat/gameboy-faces".
# NOTE(review): presumably needs Hugging Face credentials with write access to
# that repository to be configured beforehand — confirm.
split_dataset.push_to_hub("Satrat/gameboy-faces")