# Data 

## Extraction

In [66]:
from datasets import Dataset, load_dataset, load_from_disk
from typing import Tuple, List, Literal, Union
from pathlib import Path
import os

In [67]:
CV_NUM = 20

In [68]:
def create_kfold_splits(
        dataset_name: str, 
        k_folds: int = 10, 
        split_name: str = "train", 
        limit: int = 100,
        **kwargs
    ) -> Tuple[List[Dataset], List[Dataset]]:
    """
    Creates dynamic training and validation splits for K-Fold cross-validation using Hugging Face datasets.

    Args:
        dataset_name (str): Name of the dataset on the Hugging Face Hub.
        k_folds (int): Number of folds for cross-validation (e.g., 5 or 10).
        split_name (str): Name of the dataset split to apply K-Fold to (typically "train").
        limit (int): Percentage (0–100) of the dataset to consider when generating the folds.
            For example, if limit=50 and k_folds=5, only the first 50% of the split will be used,
            divided into 5 equal parts of 10% each.
        **kwargs: Additional keyword arguments passed to `load_dataset`

    Returns:
        tuple: A tuple (train_splits, val_splits) where each is a list of datasets:
            - train_splits: List of training subsets (excluding one fold each time).
            - val_splits: List of validation subsets (one fold each).
    """
    if limit > 100 or limit <= 0:
        raise ValueError("The 'limit' parameter must be between 1 and 100.")        
    
    val_percentages = [
        f"{split_name}[{i*(limit//k_folds)}%:{(i+1)*(limit//k_folds)}%]" 
        for i in range(k_folds)
    ]
    train_percentages = [
        f"{split_name}[:{i*(limit//k_folds)}%]+{split_name}[{(i+1)*(limit//k_folds)}%:]"
        for i in range(k_folds)
    ]
    
    val_ds = load_dataset(dataset_name, split=val_percentages, **kwargs)
    train_ds = load_dataset(dataset_name, split=train_percentages, **kwargs)
    
    return train_ds, val_ds

def save_data(
    ds: Union[Dataset, List[Dataset]],
    name: str,
    split: Literal["train", "test", "val"],
    raw: bool,
) -> None:
    """
    Save a Hugging Face Dataset or list of Datasets to disk.

    Parameters:
        ds (Dataset or List[Dataset]): The dataset(s) to save.
        name (str): Base name for the saved dataset folder(s).
        split (str): One of 'train', 'test', or 'val'.
        raw (bool): Whether to save under 'raw' or 'processed'.
    """
    base_type = "raw" if raw else "processed"
    root_dir = Path("../data") / base_type

    # Validate base directory
    if not root_dir.exists():
        raise FileNotFoundError(f"Expected directory does not exist: {root_dir}")

    split_dir = root_dir / split
    split_dir.mkdir(parents=True, exist_ok=True)

    # Save dataset(s)
    if isinstance(ds, Dataset):
        ds.save_to_disk(split_dir / name)
    elif isinstance(ds, list):
        for idx, subset in enumerate(ds):
            subset.save_to_disk(split_dir / f"{name}_{idx}")
    else:
        raise TypeError("ds must be a Dataset or a list of Datasets.")

def load_data(
    name: str,
    split: Literal["train", "test", "val"],
    raw: bool
) -> Union[Dataset, List[Dataset]]:
    """
    Load a saved Hugging Face Dataset or list of Datasets from disk.

    Parameters:
        name (str): Base name of the saved dataset folder(s).
        split (str): One of 'train', 'test', or 'val'.
        raw (bool): Whether to load from 'raw' or 'processed'.

    Returns:
        Dataset or List[Dataset]: The loaded dataset(s).
    """
    base_type = "raw" if raw else "processed"
    root_dir = Path("../data") / base_type

    # Validate root directory
    if not root_dir.exists():
        raise FileNotFoundError(f"Base directory does not exist: {root_dir}")

    split_dir = root_dir / split
    if not split_dir.exists():
        raise FileNotFoundError(f"Split directory does not exist: {split_dir}")

    # Look for matching folders
    matching_dirs = sorted(split_dir.glob(f"{name}*"))

    if not matching_dirs:
        raise FileNotFoundError(f"No dataset directories found for base name '{name}' in {split_dir}")

    # Decide whether it's a single or multiple datasets
    if len(matching_dirs) == 1 and matching_dirs[0].name == name:
        return load_from_disk(matching_dirs[0])
    else:
        return [load_from_disk(d) for d in matching_dirs if d.name.startswith(name)]

In [69]:
train_ds, val_ds = create_kfold_splits(
    "google-research-datasets/go_emotions", 
    k_folds=CV_NUM, 
    limit=90,
    name='raw',
    trust_remote_code=True
    )
test_ds = load_dataset("google-research-datasets/go_emotions", 
                       'raw',
                       split='train[90%:]',
                       trust_remote_code=True)

In [70]:
print([ds.num_rows for ds in train_ds])
print([ds.num_rows for ds in val_ds])
print(test_ds.num_rows)

[202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776]
[8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449]
21123


In [71]:
save_data(train_ds, 'ge_fold', 'train', True)
save_data(val_ds, 'ge_fold', 'val', True)
save_data(test_ds, 'ge', 'test', True)

Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 132708.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 147627.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 149387.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 143756.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 141147.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 151855.41 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 151905.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:01<00:00, 157888.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:03<00:00, 53022.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 202776/202776 [00:02<00:00, 98439.41 examples/s] 
S

In [72]:
train_ds = load_data('ge_fold', 'train', True)
val_ds = load_data('ge_fold', 'val', True)
test_ds = load_data('ge', 'test', True)

In [73]:
print([ds.num_rows for ds in train_ds])
print([ds.num_rows for ds in val_ds])
print(test_ds.num_rows)

[202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776, 202776]
[8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449, 8449]
21123


## Treatment