In [None]:
import cv2
import numpy as np
import os
import requests
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import praw

In [None]:
# Reddit API credentials

# Load variables from .env
load_dotenv()

# Read the three required settings in one pass; os.getenv yields None for any
# variable that is not set.
CLIENT_ID, CLIENT_SECRET, USER_AGENT = (
    os.getenv(env_key)
    for env_key in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT")
)

# Fail fast, in declaration order, on the first value that is missing or empty.
if not CLIENT_ID:
    raise ValueError("REDDIT_CLIENT_ID not found in .env file")
if not CLIENT_SECRET:
    raise ValueError("REDDIT_CLIENT_SECRET not found in .env file")
if not USER_AGENT:
    raise ValueError("REDDIT_USER_AGENT not found in .env file")


In [21]:
# Build the PRAW client from the credentials read from the environment.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

In [22]:
# Directory to save images.
# The REDDIT_SAVE_DIR environment variable, when set to a non-empty value,
# overrides the default location so the notebook can run on other machines.
SAVE_DIR = (
    os.getenv('REDDIT_SAVE_DIR')
    or '/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/reddit_disgust'
)
os.makedirs(SAVE_DIR, exist_ok=True)

# File to track downloaded URLs
TRACK_FILE = os.path.join(SAVE_DIR, 'downloaded_urls_reddit.json')

# Load previously downloaded URLs. Start from an empty set and only replace it
# when the tracking file exists and holds a JSON list.
downloaded_urls = set()
if os.path.exists(TRACK_FILE):
    try:
        with open(TRACK_FILE, 'r', encoding='utf-8') as f:
            tracked_urls = json.load(f)
        # A dict or scalar is valid JSON but not a URL list; treat it as corrupt
        # instead of silently building a set from dict keys.
        if not isinstance(tracked_urls, list):
            raise ValueError("tracking file does not contain a JSON list")
        downloaded_urls = set(tracked_urls)
        print(f"Loaded {len(downloaded_urls)} URLs from {TRACK_FILE}")
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError subclass; TypeError covers
        # unhashable list items (e.g. nested lists/dicts).
        print(f"Warning: {TRACK_FILE} is empty or corrupted. Initializing as empty set.")
        downloaded_urls = set()



In [23]:
# Search terms for the target emotion; each one is queried in every subreddit.
keywords = [
    'disgusting',
    'disgust',
    'disgusted',
]

# Communities that are searched for each keyword.
subreddits = [
    'pics',
    'faces',
    'emotions',
]

# Upper bound passed as the search `limit` for each keyword/subreddit pair.
MAX_POSTS = 100

In [24]:
# Haar cascade shared by every contains_face() call; created lazily on first use
# so the XML model is parsed once instead of once per image.
_face_cascade = None


# Check if an image contains a human face
def contains_face(image_path):
    """Return True if the frontal-face Haar cascade finds a face in the image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True when at least one face is detected. False when no face is found,
        when the cascade or the image cannot be loaded, or when detection
        raises (a message is printed in the failure cases).
    """
    global _face_cascade
    try:
        if _face_cascade is None:
            cascade = cv2.CascadeClassifier(
                cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            )
            # An empty classifier means the XML file could not be loaded;
            # do not cache it so a later call can retry.
            if cascade.empty():
                print("Error in face detection: could not load Haar cascade file")
                return False
            _face_cascade = cascade

        image = cv2.imread(image_path)
        # imread signals an unreadable/missing file by returning None
        # rather than raising.
        if image is None:
            print(f"Error in face detection: could not read image {image_path}")
            return False

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = _face_cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5)
        return len(faces) > 0
    except Exception as e:
        # Best-effort filter: any detection failure counts as "no face".
        print(f"Error in face detection: {e}")
        return False

In [25]:
# Function to download image
def download_image(url, filepath):
    """Fetch `url` and write the response body to `filepath`.

    Args:
        url: Address of the image to download.
        filepath: Destination path for the downloaded bytes.

    Returns:
        True if the server answered with status 200 and the file was written.
        False for any other status code or on any error; a message is printed
        in both cases and a partially written file is removed.
    """
    opened_target = False
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            # Report the reason instead of failing silently.
            print(f"Error downloading {url}: HTTP {response.status_code}")
            return False
        with open(filepath, 'wb') as f:
            opened_target = True
            f.write(response.content)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        # Only clean up a file this call created/truncated, never a
        # pre-existing file that was left untouched.
        if opened_target:
            try:
                os.remove(filepath)
            except OSError:
                pass
    return False

# Main scraping loop
#
# For every (keyword, subreddit) pair, search Reddit and download each result
# whose URL ends in .jpg/.jpeg/.png and has not been recorded before. Only
# images in which contains_face() reports a face are kept; the rest are deleted.
#
# The try/finally guarantees the URL tracking file is written even when the
# loop is interrupted (API error, Ctrl-C), so finished downloads are not lost.
try:
    for keyword in keywords:
        for subreddit_name in subreddits:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"Searching for '{keyword}' in r/{subreddit_name}...")
            count = 0
            for submission in subreddit.search(keyword, limit=MAX_POSTS):
                image_url = submission.url
                is_image_link = image_url.lower().endswith(('.jpg', '.jpeg', '.png'))
                if not is_image_link or image_url in downloaded_urls:
                    continue

                # NOTE: the file is named .jpg regardless of the URL's extension.
                filename = f"{keyword}_{submission.id}.jpg"
                filepath = os.path.join(SAVE_DIR, filename)
                if not download_image(image_url, filepath):
                    continue

                if contains_face(filepath):
                    print(f"Saved (face): {filename}")
                    downloaded_urls.add(image_url)
                    count += 1
                else:
                    os.remove(filepath)
                    print(f"Discarded (no face): {filename}")
                # Pause between successful downloads to go easy on the host.
                time.sleep(1)

            print(f"Downloaded {count} images for '{keyword}' from r/{subreddit_name}.")
finally:
    # Save updated list of downloaded URLs
    with open(TRACK_FILE, 'w') as f:
        json.dump(list(downloaded_urls), f)

print("Scraping completed.")

Searching for 'disgusting' in r/pics...
Discarded (no face): disgusting_1j06w2h.jpg
Discarded (no face): disgusting_16rzvww.jpg
Saved (face): disgusting_18ypd9v.jpg
Discarded (no face): disgusting_cmc7vi.jpg
Saved (face): disgusting_1jlyihf.jpg
Saved (face): disgusting_1j0el8r.jpg
Saved (face): disgusting_1i6rm1q.jpg
Discarded (no face): disgusting_1hwujsm.jpg
Saved (face): disgusting_1i69z97.jpg
Saved (face): disgusting_1i965ro.jpg
Saved (face): disgusting_1jfwixn.jpg
Saved (face): disgusting_1hc5wpa.jpg
Saved (face): disgusting_1jh5p8t.jpg
Saved (face): disgusting_1i5vevf.jpg
Discarded (no face): disgusting_1h3f2j1.jpg
Saved (face): disgusting_1ggrgbz.jpg
Saved (face): disgusting_1izhi7r.jpg
Discarded (no face): disgusting_1jeesyb.jpg
Saved (face): disgusting_1i1kl91.jpg
Discarded (no face): disgusting_1fvnzsi.jpg
Saved (face): disgusting_1j0hwkt.jpg
Saved (face): disgusting_1hks7gw.jpg
Saved (face): disgusting_1et535t.jpg
Saved (face): disgusting_1jrzglh.jpg
Saved (face): disgusting