### Function for selecting random CSV files from a directory and copying them to an output folder. This code was used to select the random samples of the continental weather data.

In [32]:
import glob
import os
import random
import shutil

def select_random_sample(input_folder, output_folder, num_files, seed=None, min_size_kb=5):
    """
    Copy a random sample of CSV files from input_folder into output_folder.

    input_folder is searched recursively for regular files whose name ends in
    '.csv' (case-insensitive) and whose size is strictly greater than
    min_size_kb KB. Up to num_files of them are drawn at random and copied into
    output_folder (created if missing), each renamed to
    '<parent folder>_<original name>'.

    Parameters
    ----------
    input_folder : str
        Directory to search recursively.
    output_folder : str
        Destination directory; created when it does not exist.
    num_files : int
        Maximum number of files to copy. Fewer are copied when fewer qualify.
    seed : int, optional
        Seed for a private random generator, so the module-level `random`
        state is left untouched. The same seed and the same set of candidate
        files give the same selection. When None, the module-level `random`
        generator is used.
    min_size_kb : int or float, default 5
        Size threshold in KB (1 KB = 1024 bytes); files must be strictly larger.

    Returns
    -------
    list of str
        Source paths of the selected files, in the order they were drawn.

    Raises
    ------
    ValueError
        If no CSV file larger than min_size_kb KB is found.

    Notes
    -----
    If two selected files would receive the same destination name within one
    call, later ones get a numeric suffix ('_2', '_3', ...) before the
    extension instead of overwriting the earlier copy. Files already present
    in output_folder from earlier calls are still overwritten.
    """
    pattern = os.path.join(input_folder, '**', '*.*')

    # glob yields paths in arbitrary, filesystem-dependent order; sorting makes
    # a seeded draw pick the same files wherever the same tree is present.
    # isfile() skips directories that merely happen to be named '*.csv'.
    csv_files = sorted(
        f for f in glob.glob(pattern, recursive=True)
        if f.lower().endswith('.csv') and os.path.isfile(f)
    )

    size_threshold = min_size_kb * 1024
    large_files = [f for f in csv_files if os.path.getsize(f) > size_threshold]

    if not large_files:
        raise ValueError(f"No CSV files larger than {min_size_kb} KB found in '{input_folder}'.")

    # A dedicated generator gives the same draw as seeding the global one,
    # without resetting random state used elsewhere in the session.
    rng = random.Random(seed) if seed is not None else random
    selected_files = rng.sample(large_files, min(num_files, len(large_files)))

    os.makedirs(output_folder, exist_ok=True)
    input_folder_abs = os.path.abspath(input_folder)

    # Destination names handed out during this call (case-normalised so that
    # case-insensitive filesystems are covered as well).
    used_names = set()

    for file_path in selected_files:
        original_name = os.path.basename(file_path)

        parent_folder = os.path.basename(os.path.dirname(file_path))

        # dirname is empty when the path has no directory component;
        # fall back to the name of the input folder itself.
        if not parent_folder:
            parent_folder = os.path.basename(os.path.normpath(input_folder_abs))

        new_name = f"{parent_folder}_{original_name}"

        # Same-named parent folders in different subtrees would otherwise
        # collide and silently overwrite each other.
        stem, ext = os.path.splitext(new_name)
        counter = 2
        while os.path.normcase(new_name) in used_names:
            new_name = f"{stem}_{counter}{ext}"
            counter += 1
        used_names.add(os.path.normcase(new_name))

        new_path = os.path.join(output_folder, new_name)

        shutil.copy(file_path, new_path)

    print(f"✅ Copied {len(selected_files)} files > {min_size_kb} KB from '{input_folder}' → '{output_folder}' (prefixed with parent folder)")
    return selected_files


This process was repeated for each continent's files. A `for` loop wasn't used because the files took up too much space to be on the system at the same time.

In [41]:
# Sampling settings for the North America folder.
input_folder = "northAmerica/North America"
output_folder = "raw_sampled_files"
num_files = 10  # how many CSV files to draw
seed = 42  # passed to the sampler as its random seed

# Left as the cell's last expression so the returned list of paths is displayed.
select_random_sample(
    input_folder=input_folder,
    output_folder=output_folder,
    num_files=num_files,
    seed=seed,
)


✅ Copied 10 files > 5 KB from 'northAmerica/North America' → 'raw_sampled_files' (prefixed with parent folder)


['northAmerica/North America\\weatherdata-820-331.csv',
 'northAmerica/North America\\weatherdata-423-825.csv',
 'northAmerica/North America\\weatherdata-273-819.csv',
 'northAmerica/North America\\weatherdata-573-1056.csv',
 'northAmerica/North America\\weatherdata-520-563.csv',
 'northAmerica/North America\\weatherdata-517-1213.csv',
 'northAmerica/North America\\weatherdata-439-1241.csv',
 'northAmerica/North America\\weatherdata-423-1075.csv',
 'northAmerica/North America\\weatherdata-695-1228.csv',
 'northAmerica/North America\\weatherdata-382-1025.csv']