In [64]:
import os
import random
import kaggle.api as kaggle
import pandas as pd
import shutil

# Download part files of the LAION-400M metadata dataset from Kaggle and draw a
# reproducible random sample of rows from each one. The combined sample is left
# in `subset` for the cells that follow.
#
# The kaggle python API calls below are documented here: https://www.kaggle.com/code/donkeys/kaggle-python-api/notebook

# Make sure ~/.kaggle/kaggle.json is set up. If not, go to https://www.kaggle.com/<username>/account to create a new
# API token, which downloads a kaggle.json file. Move that file to ~/.kaggle/kaggle.json. Then run the following:
kaggle.authenticate()

# Get the dataset at https://www.kaggle.com/datasets/romainbeaumont/laion400m
matching_datasets = kaggle.dataset_list(user="romainbeaumont", search="laion400m")
if not matching_datasets:
    # Fail with an explicit message instead of an opaque IndexError on [0].
    raise RuntimeError(
        "No Kaggle dataset found for user 'romainbeaumont' matching 'laion400m'")
dataset = matching_datasets[0]

file_result = kaggle.dataset_list_files(dataset.ref)
files = file_result.files
# Sort files by file name so the processing order is deterministic
files.sort(key=lambda x: x.name)

# Count the number of files
file_count = len(files)
if file_count == 0:
    # Without this guard the integer division below raises ZeroDivisionError.
    raise RuntimeError(f"Dataset {dataset.ref} lists no files")

# We want to get a random sample of 500,000 rows from the dataset, spread evenly
# across its files (any remainder of the division is dropped).
random_sample_count = 500000
sample_count_per_file = random_sample_count // file_count
# Fixed seed so the sample drawn from each file is reproducible between runs.
sample_seed = 984324482
print(f"File count: {file_count}")
print(f"Sample count per file: {sample_count_per_file}")

# One sampled frame per processed file. They are concatenated after the loop so
# that widening the slice below does not silently discard earlier samples.
sampled_frames = []

# For now, load the first file in the dataset.
for file in files[:1]:
    print(file.name)
    print("Downloading file...")
    # Each file is downloaded into a directory named after the file itself;
    # force=False asks the API not to re-download an existing file.
    kaggle.dataset_download_file(
        dataset.ref, file.name, path=file.name, force=False)
    # Unzip the file into the same directory as the zip file itself, if it doesn't already exist
    if not os.path.exists(f"{file.name}/{file.name}"):
        print("Unzipping file...")
        shutil.unpack_archive(
            f"{file.name}/{file.name}.zip", extract_dir=file.name, format="zip")

    # Load the parquet file into a pandas dataframe
    print("Reading in parquet...")
    df = pd.read_parquet(f"{file.name}/{file.name}")
    print(df.columns)
    # Get a random subset of rows from the dataframe, with a fixed random seed.
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the size of this file.
    rows_to_sample = min(sample_count_per_file, len(df))
    sampled_frames.append(df.sample(rows_to_sample, random_state=sample_seed))

# Combined sample across all processed files. Each file's original row index is
# kept, so index labels may repeat once more than one file is processed.
subset = pd.concat(sampled_frames)


File count: 32
Sample count per file: 15625
part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet
Index(['SAMPLE_ID', 'URL', 'TEXT', 'HEIGHT', 'WIDTH', 'LICENSE', 'NSFW',
       'similarity'],
      dtype='object')


In [66]:
# Pull the image URL column out of the sampled rows.
urls = subset["URL"]

# Print the first ten sampled URLs, one per line, as a quick sanity check.
for sample_url in urls.head(10):
    print(sample_url)

https://cdn7.bigcommerce.com/s-3bz7p8je/images/stencil/100x100/products/864/2656/IMG_9842_2__11423.1530831602.jpg?c=2
http://lh5.ggpht.com/jM-VjylVgjFE8zxh91NO7-Zx-8xE4HKgrMcCpx7eXVf83SuuVpDoVPMNpW_2d3B43GBG=w120
http://ecx.images-amazon.com/images/I/41P%2BMqOoefL._SL205.jpg
http://rlv.zcache.co.uk/ancient_rome_colosseum_round_sticker-rf5cf5b2dd3684e5ca00be62f57380bc1_v9waf_8byvr_152.jpg
http://image.made-in-china.com/43f34j00EvaTlVnCurpt/Slb-0837-Replacement-Digital-Camera-Battery-for-Samsung.jpg
https://cdn.shopify.com/s/files/1/0276/0953/0443/products/73446_large_75acde69-5535-4738-8973-cbad17738b21_110x110@2x.jpg?v=1590846686
http://i3.sigmapic.com/images/xsexcomics.com/136/937-bot-1.jpg
http://matchbin-assets.s3.amazonaws.com/public/sites/165/assets/ILLU_wrestling_1.JPG
http://img2.imagesbn.com/p/636943602027_p0_v1_s260x420.jpg
http://d2q9kw5vp0we94.cloudfront.net/regular/31869.jpg
