In [None]:
import os
import zstandard
import polars as pl
import requests
import json
import tempfile
import re

In [None]:
def download_process_zst(url: str, timeout: float = 60.0) -> pl.DataFrame:
    """Download a zstd-compressed JSONL file and load it into a polars DataFrame.

    The archive is streamed to a temporary directory, decompressed there, and
    parsed with `pl.read_ndjson`. The temporary files are deleted before the
    function returns.

    Args:
        url: Location of the `.zst` archive.
        timeout: Seconds to wait for the server to connect / send data before
            giving up (passed to `requests.get`).

    Returns:
        The parsed records. Malformed values are tolerated because
        `ignore_errors=True` is passed to the reader.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts,
            or a non-success HTTP status.
    """
    # Everything lives in a temporary directory that is removed on exit
    with tempfile.TemporaryDirectory() as temp_dir:
        zst_path = os.path.join(temp_dir, "data.zst")
        json_path = os.path.join(temp_dir, "data.jsonl")

        # Download: stream to disk in chunks so the archive is never held in
        # memory as a whole, and fail loudly on an HTTP error instead of
        # handing an error page to the decompressor.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(zst_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Decompress
        with open(zst_path, 'rb') as compressed_file, \
                open(json_path, 'wb') as decompressed_file:
            # NOTE(review): Reddit archive dumps are commonly written with a
            # long zstd window; raising the limit lets such frames decode and
            # is harmless for ordinary ones -- confirm against the data source.
            dctx = zstandard.ZstdDecompressor(max_window_size=2**31)
            dctx.copy_stream(compressed_file, decompressed_file)

        # File is in JSONL format; read it before the temp directory vanishes
        df = pl.read_ndjson(
            json_path,
            infer_schema_length=10000,
            ignore_errors=True
        )

    return df

# Archive to load; the URL is named so it is easy to spot and swap.
SUBMISSIONS_ARCHIVE_URL = "https://the-eye.eu/redarcs/files/truerateme_submissions.zst"

print("Downloading and processing data...")
df = download_process_zst(SUBMISSIONS_ARCHIVE_URL)
print(df.columns)

In [None]:
# Keep only the columns used later in the notebook, grouped by role.
metadata_columns = ["id", "author", "created_utc", "subreddit"]
content_columns = ["title", "selftext", "media_embed", "media", "url"]

df = df.select(metadata_columns + content_columns)
print(df.head(5))

In [None]:
# Remove empty, removed, and deleted posts.
# A post is kept only if it has media_embed, media and url values ...
has_media = (
    pl.col("media_embed").is_not_null()
    & pl.col("media").is_not_null()
    & pl.col("url").is_not_null()
)
# ... and its body is neither of the two placeholder strings.
# NOTE(review): a null selftext makes these comparisons null, and filter()
# keeps only rows where the predicate is true -- so such rows are dropped too.
selftext = pl.col("selftext")
body_is_present = (selftext != "[removed]") & (selftext != "[deleted]")

df = df.filter(has_media & body_is_present)
df

In [None]:
# Show the `media` value of the first row to see how the payload is structured.
df['media'][0]

In [None]:
# Drill into the first row's `media` value: oembed -> thumbnail_url.
df['media'][0]['oembed']['thumbnail_url']

In [None]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def download_thumbnail(url: str, path: str, max_retries=5):
    """Fetch `url` and save the response body to `path`.

    Args:
        url: Address of the image to download (http or https).
        path: Destination file. Missing parent directories are created; a bare
            filename (no directory part) is written to the current directory.
        max_retries: Total number of retries allowed by the retry strategy.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            exhausted retries, or an error status on the final response.
        OSError: If the destination cannot be created or written.
    """
    # Retry GET requests on throttling / transient server errors, with
    # exponential backoff between attempts scaled by `backoff_factor`.
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        allowed_methods=["GET"]
    )

    # Add headers to make request more browser-like
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': 'https://imgur.com/'
    }

    # The session is a context manager: leaving the block closes its pooled
    # connections, which matters when this function is called once per image.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
        session.mount("http://", HTTPAdapter(max_retries=retry_strategy))

        # Get the image with retries, redirects and headers
        response = session.get(url, headers=headers, allow_redirects=True, timeout=30)
        response.raise_for_status()
        image_bytes = response.content

    # Create directory if needed. dirname() is '' for a bare filename, and
    # os.makedirs('') raises, so only create when there is a directory part.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling ".part" file and rename it into place, so `path` only
    # ever exists with complete content (callers may use its existence as a
    # "download finished" marker).
    part_path = f"{path}.part"
    try:
        with open(part_path, 'wb') as f:
            f.write(image_bytes)
        os.replace(part_path, path)
    finally:
        # Only still present if writing or renaming failed part-way
        if os.path.exists(part_path):
            os.remove(part_path)

def remove_url_args(url:str):
    """Return `url` with its query string (the first `?` and what follows) cut off."""
    query = re.search(r'\?.*$', url)
    if query is None:
        # No query string present: nothing to strip
        return url
    # Splice out the matched span, keeping whatever surrounds it
    return url[:query.start()] + url[query.end():]

def is_img_url(url:str):
    """Tell whether `url` ends with one of the known image file extensions.

    The comparison ignores letter case.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.svg')
    lowered = url.lower()
    return any(lowered.endswith(suffix) for suffix in image_suffixes)

In [None]:
# Try the URL helpers on the first row's thumbnail URL.
sample_thumbnail_url = df[0]['media'][0]['oembed']['thumbnail_url']
is_img_url(remove_url_args(sample_thumbnail_url))

In [None]:
# Show the `media` value of the first row (selected via integer row indexing).
df[0]['media'][0]

In [None]:
# Save the current DataFrame to reddit_posts.parquet in the working directory.
df.write_parquet("reddit_posts.parquet")

In [None]:
# Try the downloader on a single image before running it over the whole frame.
download_thumbnail(url='https://i.imgur.com/t7QQzTW.jpg', path='./temp.jpg')

In [None]:
# Download all thumbnails in df, updating rows with the local path upon download.
# - The parquet file is rewritten after every successful download so an
#   interrupted run keeps its progress (note: this rewrites the whole file).
# - A thumbnail that already exists on disk is not downloaded again, but its
#   path is still recorded, so re-running on a fresh kernel (where the column
#   starts out empty) ends with the column filled in.
if 'local_thumbnail_path' not in df.columns:
    df = df.with_columns(pl.lit("").alias('local_thumbnail_path'))

# Iterate over a snapshot so that updating `df` inside the loop cannot
# interfere with the iteration.
df_view = df.clone()
recovered_existing = False  # a path was recorded without a download -> save at the end
try:
    for idx, row in enumerate(df_view.iter_rows(named=True)):
        # Extract thumbnail url; rows without the nested keys are skipped
        try:
            thumbnail_url = row['media']['oembed']['thumbnail_url']
        except (KeyError, TypeError):
            continue
        if thumbnail_url is None:
            continue

        clean_url = remove_url_args(thumbnail_url)
        if not is_img_url(clean_url):
            continue

        ext = os.path.splitext(clean_url)[1]
        local_path = os.path.join("thumbnails", f"{row['id']}{ext}")

        if os.path.exists(local_path):
            # Already downloaded earlier: just make sure the row points at it
            if row['local_thumbnail_path'] != local_path:
                df[idx, 'local_thumbnail_path'] = local_path
                recovered_existing = True
            continue

        # Only the download itself is best-effort. `Exception` (not a bare
        # except) lets Ctrl-C / kernel interrupt stop the loop, and keeping the
        # bookkeeping outside the try means a failed parquet write is not
        # misreported as a failed download.
        try:
            download_thumbnail(clean_url, local_path)
        except Exception as exc:
            print(f"Failed to download {clean_url}: {exc!r}")
            continue

        df[idx, 'local_thumbnail_path'] = local_path
        df.write_parquet("reddit_posts.parquet")
finally:
    # Persist paths recorded for pre-existing files, including on interrupt
    if recovered_existing:
        df.write_parquet("reddit_posts.parquet")

df