In [2]:
import os
import random
import kaggle.api as kaggle
import pandas as pd
import shutil

# The kaggle python API calls below are documented here: https://www.kaggle.com/code/donkeys/kaggle-python-api/notebook

# Make sure ~/.kaggle/kaggle.json is set up. If not, go to https://www.kaggle.com/<username>/account to create a new
# API token, which downloads a kaggle.json file. Move that file to ~/.kaggle/kaggle.json. Then run the following:
kaggle.authenticate()

# Get the dataset at https://www.kaggle.com/datasets/romainbeaumont/laion400m
dataset_owner = "romainbeaumont"
dataset_slug = "laion400m"
dataset_matches = kaggle.dataset_list(user=dataset_owner, search=dataset_slug)
if not dataset_matches:
    raise RuntimeError(
        f"No Kaggle dataset found for user={dataset_owner!r}, search={dataset_slug!r}")
# The search may return several datasets. Prefer the one whose ref matches exactly, and only fall
# back to the first hit (the previous behaviour) when none does.
dataset = next(
    (d for d in dataset_matches if str(d.ref) == f"{dataset_owner}/{dataset_slug}"),
    dataset_matches[0])

file_result = kaggle.dataset_list_files(dataset.ref)
files = file_result.files
# Sort files by file name so the processing order is deterministic
files.sort(key=lambda x: x.name)

# Count the number of files
file_count = len(files)
if file_count == 0:
    # Fail with a readable message instead of a ZeroDivisionError in the division below
    raise RuntimeError(f"Dataset {dataset.ref} lists no files")
# We want to get a random sample of 500,000 rows from the dataset, spread evenly over its files.
random_sample_count = 500000
sample_count_per_file = random_sample_count // file_count
print(f"File count: {file_count}")
print(f"Sample count per file: {sample_count_per_file}")

# Rows whose HEIGHT or WIDTH is below this value are dropped before sampling.
min_side_px = 224
# Fixed seed so the sample is reproducible across runs.
sample_seed = 984324482

# One sampled frame per processed file; combined into `subset` after the loop so that
# extending the loop to more files does not overwrite earlier samples.
subsets = []

# For now, load the first file in the dataset.
# Later, load each file, read sample_count_per_file random rows from it, and delete the file, in this loop
for file in files[:1]:
    print(file.name)
    print("Downloading file...")
    # The file is downloaded into a directory named after the file itself
    kaggle.dataset_download_file(
        dataset.ref, file.name, path=file.name, force=False)
    parquet_path = os.path.join(file.name, file.name)
    # Unzip the file into the same directory as the zip file itself, if it doesn't already exist
    if not os.path.exists(parquet_path):
        print("Unzipping file...")
        shutil.unpack_archive(
            f"{parquet_path}.zip", extract_dir=file.name, format="zip")

    # Load the parquet file into a pandas dataframe
    print("Reading in parquet...")
    df = pd.read_parquet(parquet_path)
    print(df.columns)
    # Filter out rows where the 'HEIGHT' or 'WIDTH' columns are below min_side_px
    df = df[(df["HEIGHT"] >= min_side_px) & (df["WIDTH"] >= min_side_px)]
    # DataFrame.sample raises ValueError when asked for more rows than exist (without replacement),
    # so cap the request at the number of rows that survived the filter.
    rows_to_sample = min(sample_count_per_file, len(df))
    if rows_to_sample < sample_count_per_file:
        print(f"Only {len(df)} rows pass the size filter; sampling all of them.")
    # Get a random subset of rows from the dataframe, with a fixed random seed
    subsets.append(df.sample(rows_to_sample, random_state=sample_seed))

# Combine the per-file samples. Row labels are kept as-is, so with a single file this is the same
# frame df.sample returned; with several files the index labels may repeat across files.
subset = pd.concat(subsets)


File count: 32
Sample count per file: 15625
part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet
Downloading file...
Reading in parquet...
Index(['SAMPLE_ID', 'URL', 'TEXT', 'HEIGHT', 'WIDTH', 'LICENSE', 'NSFW',
       'similarity'],
      dtype='object')


In [3]:
# Sanity check: pull the URL column out of the sampled rows and show the first ten entries.
urls = subset["URL"]

for url in urls.head(10):
    print(url)

http://st6.cannypic.com/thumbs/33/331805_352_canny_pic.jpg
http://4.bp.blogspot.com/-OGkrDlRTKVY/UBJ4znKaGaI/AAAAAAAAHDo/lZe7Vk_whvI/s1600/how+to+replace+an+old+sink+faucet+bathroom+tips+%28Large%29.JPG
https://cdn.shopify.com/s/files/1/0264/2867/0024/products/2480A-cobi-Tiger-2-P-konigstiger-tank-sd-kfz-182-historical-collection-world-war-2-back-legerspeelgoed.jpg?v=1595796455
https://trumpetmediagroup.com/downloads/1948/download/Oscar%20Pistorius%20cries%20as%20the%20%27Not%20Guilty%27%20verdict%20for%20murder%20is%20read%20out%20b.jpg?cb=ce313d7208556efef3ef5eeed3f2e3e3
https://i2.wp.com/lessbeatenpaths.hostguardian.com/wp-content/uploads/2013/04/photo-1024x768.jpg?resize=525%2C394
https://photos.smugmug.com/Journal/Photo-Journal-2014/2014-November/i-TnZPTMH/0/f777ef5b/XL/ALASKA%20EAGLES%209673-XL.jpg
https://cdn.shopify.com/s/files/1/0105/4542/products/deftones-longsleeve_medium.jpg?v=1444668593
https://media.gettyimages.com/videos/cologne-cathedral-cologne-north-rhine-westphalia-g