# Fetch Datasets
Scikit-learn includes a collection of built-in toy datasets and downloadable real-world datasets. This optional notebook is for downloading and exploring the full collection of built-in scikit-learn datasets.

In [None]:
import os

import pandas as pd
from scipy.sparse import issparse
from sklearn.datasets import load_breast_cancer, load_diabetes, load_digits, load_iris, load_linnerud, load_wine
from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized, fetch_california_housing, fetch_covtype, fetch_kddcup99, fetch_lfw_pairs, fetch_lfw_people, fetch_olivetti_faces, fetch_rcv1, fetch_species_distributions


# Datasets to save
# Comment out individual datasets to save time and space if needed

toy_datasets = [
    'breast_cancer',
    'diabetes',
    'digits',
    'iris',
    'linnerud',
    'wine'
]

real_world_datasets = [
    '20_newsgroups',
    #'20_newsgroups_vectorized', # Sparse dataset that is not conducive to saving to disk
    'california_housing',
    'covtype',
    'kddcup99',
    'lfw_pairs',
    'lfw_people',
    'olivetti_faces',
    #'rcv1', # very large dataset with sparse data that is not conducive to saving
    #'species_distributions' # structured very differently from the others
]

save_toy_datasets = True
save_real_world_datasets = False
toy_data_output = 'datasets/toy_data'
real_world_data_output = 'datasets/real_world_data'
temp_download = 'datasets/real_world_data/temp'

## Toy Datasets
Toy datasets are small datasets that are readily available and ship as part of the scikit-learn package:

In [2]:
def load_toy_dataset(name: str):
    return eval(f"load_{name}()")

def format_bytes(bytecount, decimal_places = 1):
    """
    Given a number of bytes, output a user-facing byte count such as '50.1 MB' or '2.9 GB'
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    i = 0
    while bytecount >= 1024 and i < len(units) - 1:
        bytecount /= 1024.0
        i += 1
    return f"{bytecount:.{decimal_places}f} {units[i]}"

def X_y_from_dataset(bunch, convert_categorical_target = True) -> tuple:
    """
    Splits a dataset loaded with load_data or fetch_data into DataFrames for predictors vs targets.

    This is similar to using the built-in return_X_y option for loading datasets, but with the following differences:
    - Does not discard additional metadata (e.g. user-facing documentation) from the full dataset
    - Optionally uses named values for classification targets (e.g. 'benign' vs 'malignant' instead of 0 vs 1)
    """
    features = getattr(bunch, 'feature_names', None)
    def dataframe(data, columns):
        if issparse(data):
            return pd.DataFrame.sparse.from_spmatrix(data, columns = columns)
        return pd.DataFrame(data, columns = columns)
        
    predictors = dataframe(bunch.data, columns = features)
    target_names = getattr(bunch, 'target_names', None)
    # Multidimensional target: target_names represents column names
    if bunch.target.ndim > 1:
        target = dataframe(bunch.target, columns = target_names)
    # Single target: target_names may contain the column name or the names of classes
    #   Classification datasets tend to use target_names to store the names of classes in the target variable
    #   Regression datasets may or may not use a single-element array to name the target variable
    else:
        target = pd.Series(bunch.target)
        target.name = 'target'
        target_name_count = 0 if target_names is None else len(target_names)
        if target_name_count == 1:
            target.name = target_names[0]
        elif target_name_count > 1 and convert_categorical_target:
            # If target_names specifies class names, convert the target data to use named classes
            target = target.map(lambda x: target_names[x])
    return (predictors, target)

def dataset_metadata(dataset, name: str) -> pd.Series:
    """
    Returns various data about a dataset based on its predictors and target(s).
    """
    predictors, target = X_y_from_dataset(dataset)
    if isinstance(target, pd.Series):
        target = pd.DataFrame(target) # multi-class targets use DataFrame while single-class targets use Series. Let's simplify by normalizing on DataFrame targets.
    first_target = target.iloc[:,0]
    unique_target_values = first_target.unique()
    target_type = first_target.dtype
    is_regression = pd.api.types.is_float_dtype(target_type)
    is_multivariate = len(target.columns) > 1
    is_binary = len(unique_target_values) == 2

    # Derive dataset type (regression vs classification, single-target vs multi-target) from the data
    dataset_type = 'Regression' if is_regression else 'Classification'
    if is_multivariate:
        dataset_type = f'Multiple {dataset_type}'
    elif is_binary:
        dataset_type = f'Binary {dataset_type}'
    
    # Describe the target variable
    target_description = dataset_type
    if is_multivariate:
        # Multivariate: list the target variable names as details
        target_details = ', '.join(target.columns) # Example: 'Multiclass Regression: Width, Height'
    else:
        y = first_target
        if is_regression:
            # Regression: the target name is not available, unfortunately.
            # Fill the target details with basic information about how the target value is distributed.
            target_details = {
                'range': f'{y.min():.1f} - {y.max():.1f}',
                'mean': round(y.mean(), 1),
                'std': round(y.std(), 1)
            }
            target_details = ', '.join(f'{key}: {value}' for key, value in target_details.items())
        else:
            # Classification: the target name is not available, but the class names are.
            # Fill the target details with the name + count of each class.
            if not is_binary:
                target_description = target_description + f' ({len(unique_target_values)} classes)' # Example: 'Classification (10 classes)'
            target_details = y.value_counts().sort_index()
            target_details = ', '.join(f'{key}: {value}' for key, value in target_details.items())

    def describe_columns(d: pd.DataFrame) -> str:
        return ', '.join(map(lambda column: f'{column}: {str(d[column].dtype)}', d.columns))
    
    # Compute byte size
    byte_size = predictors.memory_usage().sum() + target.memory_usage().sum()
    
    metadata = [
        ('name', name),
        ('records', predictors.shape[0]),
        ('predictors', predictors.shape[1]),
        ('size', byte_size),
        ('target_type', target_description),
        ('target_details', target_details),
        ('predictor_details', describe_columns(predictors))
    ]
    return pd.Series({key: value for key, value in metadata}, name = name)



def save_dataset(
    dataset, 
    base_path: str
):
    # Save full dataset as csv
    X, y = X_y_from_dataset(dataset)
    if issparse(X):
        raise ValueError('Saving sparse datasets is not supported')

    data = pd.concat([X, y], axis = 1)
    data_path = base_path + '.csv'
    folder, file = os.path.split(data_path)
    os.makedirs(folder, exist_ok = True)

    
    data.to_csv(data_path)

    # Add data documentation
    docstring = dataset.DESCR
    doc_path = base_path + '_description.txt'
    with open(doc_path, 'w') as docfile:
        docfile.write(docstring)


In [3]:
all_toy_metadata = []
for toy_dataset in toy_datasets:
    print(f"Loading dataset {toy_dataset}...")
    raw_data = load_toy_dataset(toy_dataset)
    all_toy_metadata.append(dataset_metadata(raw_data, toy_dataset))
    if save_toy_datasets:
        base_path = os.path.join(toy_data_output, toy_dataset)
        save_dataset(raw_data, base_path)

print('\nToy Datasets:')
all_toy_metadata = pd.DataFrame.from_records(all_toy_metadata)
all_toy_metadata['size'] = all_toy_metadata['size'].map(format_bytes)
all_toy_metadata

Loading dataset breast_cancer...
Loading dataset diabetes...
Loading dataset digits...
Loading dataset iris...
Loading dataset linnerud...
Loading dataset wine...

Toy Datasets:


Unnamed: 0,name,records,predictors,size,target_type,target_details,predictor_details
0,breast_cancer,569,30,138.1 KB,Binary Classification,"benign: 357, malignant: 212","mean radius: float64, mean texture: float64, m..."
1,diabetes,442,10,38.2 KB,Regression,"range: 25.0 - 346.0, mean: 152.1, std: 77.1","age: float64, sex: float64, bmi: float64, bp: ..."
2,digits,1797,64,912.8 KB,Classification (10 classes),"0: 178, 1: 182, 2: 177, 3: 183, 4: 181, 5: 182...","pixel_0_0: float64, pixel_0_1: float64, pixel_..."
3,iris,150,4,6.1 KB,Classification (3 classes),"setosa: 50, versicolor: 50, virginica: 50","sepal length (cm): float64, sepal width (cm): ..."
4,linnerud,20,3,1.2 KB,Multiple Regression,"Weight, Waist, Pulse","Chins: float64, Situps: float64, Jumps: float64"
5,wine,178,13,19.7 KB,Classification (3 classes),"class_0: 59, class_1: 71, class_2: 48","alcohol: float64, malic_acid: float64, ash: fl..."


## Real-World Datasets
In addition to pre-packaged toy datasets, scikit-learn offers utilities to download various larger real-world datasets common in data science:

In [4]:
def fetch_real_world_dataset(name: str, data_home = temp_download):
    if name == '20_newsgroups':
        return fetch_20newsgroups(subset = 'all', data_home = data_home) # This dataset requires explicit subset='all' to load train and test data
    if name == '20_newsgroups_vectorized':
        return fetch_20newsgroups_vectorized(subset = 'all', data_home = data_home)
    command = f"fetch_{name}(data_home = '{data_home}')"
    return eval(command)

In [5]:
all_real_metadata = []

for real_dataset in real_world_datasets:
    print(f'Fetching {real_dataset}...')
    raw_data = fetch_real_world_dataset(real_dataset)
    meta = dataset_metadata(raw_data, real_dataset)
    all_real_metadata.append(meta)
    try:
        save_dataset(raw_data, os.path.join(real_world_data_output, real_dataset))
    except Exception as e:
        print(f'Failed to save dataset {real_dataset}: {e}')


Fetching 20_newsgroups...
Fetching california_housing...
Fetching covtype...
Fetching kddcup99...
Fetching lfw_pairs...
Fetching lfw_people...
Fetching olivetti_faces...


In [9]:
print("Real-world Datasets (except species distributions)")
all_real_metadata = pd.DataFrame.from_records(all_real_metadata)
all_real_metadata['size'] = all_real_metadata['size'].map(format_bytes)
all_real_metadata

Real-world Datasets (except species distributions)


Unnamed: 0,name,records,predictors,size,target_type,target_details,predictor_details
0,20_newsgroups,18846,1,294.7 KB,Classification (20 classes),"alt.atheism: 799, comp.graphics: 973, comp.os....",0: object
1,california_housing,20640,8,1.4 MB,Regression,"range: 0.1 - 5.0, mean: 2.1, std: 1.2","MedInc: float64, HouseAge: float64, AveRooms: ..."
2,covtype,581012,54,241.6 MB,Classification (7 classes),"1: 211840, 2: 283301, 3: 35754, 4: 2747, 5: 94...","Elevation: float64, Aspect: float64, Slope: fl..."
3,kddcup99,494021,41,158.3 MB,Classification (23 classes),"b'back.': 2203, b'buffer_overflow.': 30, b'ftp...","duration: object, protocol_type: object, servi..."
4,lfw_pairs,2200,5828,48.9 MB,Binary Classification,"Different persons: 1100, Same person: 1100","0: float32, 1: float32, 2: float32, 3: float32..."
5,lfw_people,13233,2914,147.2 MB,Classification (5749 classes),"AJ Cook: 1, AJ Lamas: 1, Aaron Eckhart: 1, Aar...","0: float32, 1: float32, 2: float32, 3: float32..."
6,olivetti_faces,400,4096,6.3 MB,Classification (40 classes),"0: 10, 1: 10, 2: 10, 3: 10, 4: 10, 5: 10, 6: 1...","0: float32, 1: float32, 2: float32, 3: float32..."
