In [1]:
## Upload Local Dataset to Hugging Face

In [2]:
import pandas as pd
import re
from datasets import Dataset, Features, Value, Image, DatasetDict
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read face_data.csv (resolved relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='face_data.csv')

# Extract source IDs from the input_path column
def extract_source_id(path):
    """Return the source ID at the start of the filename in ``path``.

    The ID is four digit groups joined by underscores (e.g. ``12_3_45_6``)
    and must appear at the very beginning of the filename, where the filename
    is the last '/'-separated component of ``path``.

    Args:
        path: Path string using '/' as the separator.

    Returns:
        The matched ID string, or None when the filename does not start with
        that pattern or when ``path`` is not a string (e.g. the float NaN that
        pandas produces for an empty CSV cell, or None).
    """
    if not isinstance(path, str):
        # Calling .split() on NaN/None would raise AttributeError; report
        # "no ID" the same way as a filename that does not match.
        return None
    filename = path.rsplit('/', 1)[-1]  # Get just the filename
    match = re.match(r'(\d+_\d+_\d+_\d+)', filename)
    return match.group(1) if match else None

df['source_id'] = df['input_path'].apply(extract_source_id)

# Shuffle the distinct source IDs with a fixed seed, then cut the shuffled list
# so that the trailing `test_size` fraction of IDs forms the test set
test_size = 0.05
unique_source_ids = df['source_id'].unique().tolist()
random.seed(30)
random.shuffle(unique_source_ids)
split_point = int(len(unique_source_ids) * (1 - test_size))
train_source_ids, test_source_ids = (
    unique_source_ids[:split_point],
    unique_source_ids[split_point:],
)

# Rows are assigned by source ID, so all rows sharing an ID end up in the
# same split
in_train = df['source_id'].isin(train_source_ids)
in_test = df['source_id'].isin(test_source_ids)
train_df = df.loc[in_train].reset_index(drop=True)
test_df = df.loc[in_test].reset_index(drop=True)

def load_images(example):
    """Copy each path field of ``example`` into its matching image field.

    ``example`` is modified in place: 'input_image' receives the value of
    'input_path' and 'output_image' receives the value of 'output_path'.
    The same mapping object is returned.

    NOTE(review): presumably the Image() feature declared for these two
    columns is what turns the stored path into image data — confirm against
    the datasets documentation.
    """
    column_pairs = (
        ('input_image', 'input_path'),
        ('output_image', 'output_path'),
    )
    for image_key, path_key in column_pairs:
        example[image_key] = example[path_key]
    return example

# Dataset schema, grouped by column type. The groups are laid out so the
# resulting key order is: contrast, brightness, noise, edge_enhance,
# agg_contrast, severity, input_image, output_image, original_input_path,
# input_path, output_path, source_id.
features = Features({
    # float64 columns
    **{column: Value('float64') for column in ('contrast', 'brightness', 'noise')},
    # bool columns
    **{column: Value('bool') for column in ('edge_enhance', 'agg_contrast')},
    'severity': Value('string'),
    # Image-typed columns (filled in by load_images)
    **{column: Image() for column in ('input_image', 'output_image')},
    # string columns holding paths and the source ID
    **{
        column: Value('string')
        for column in ('original_input_path', 'input_path', 'output_path', 'source_id')
    },
})

In [4]:
# Build each split from its DataFrame using the declared schema, then run
# load_images over every row to fill the image columns
train_dataset = Dataset.from_pandas(train_df, features=features).map(load_images)
test_dataset = Dataset.from_pandas(test_df, features=features).map(load_images)

# Bundle the two splits under their split names
splits_by_name = {
    'train': train_dataset,
    'test': test_dataset,
}
split_dataset = DatasetDict(splits_by_name)

Map: 100%|██████████| 86772/86772 [00:05<00:00, 15278.54 examples/s]
Map: 100%|██████████| 4568/4568 [00:00<00:00, 14915.73 examples/s]


In [5]:
# Upload the DatasetDict to the Hub repo; the call is left as the cell's last
# expression so its return value is displayed as the cell output
split_dataset.push_to_hub(repo_id="Satrat/gameboy-faces")

Map: 100%|██████████| 924/924 [00:03<00:00, 242.98 examples/s]?it/s]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  8.66ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Map: 100%|██████████| 924/924 [00:03<00:00, 234.50 examples/s]48, 10.84s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  8.52ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Map: 100%|██████████| 924/924 [00:03<00:00, 257.28 examples/s]27, 10.08s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  8.39ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Map: 100%|██████████| 924/924 [00:03<00:00, 269.91 examples/s]00,  9.90s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  8.87ba/s]
Uploading files as a binary IO buffer is not supported by Xet St

CommitInfo(commit_url='https://huggingface.co/datasets/Satrat/gameboy-faces/commit/3e0c045e18a4cb4c9d5efde49d27f15681de7f82', commit_message='Upload dataset (part 00001-of-00002)', commit_description='', oid='3e0c045e18a4cb4c9d5efde49d27f15681de7f82', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Satrat/gameboy-faces', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Satrat/gameboy-faces'), pr_revision=None, pr_num=None)