In [1]:
!pip install praw kafka-python python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import time
import json
import praw
from kafka import KafkaProducer
from kafka.errors import NoBrokersAvailable
from dotenv import load_dotenv

In [3]:
# Load the .env file in the current directory into the process environment.
load_dotenv()

# Reddit API credentials (each is None when the variable is not set).
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT')

# Kafka connection settings (also None when not set).
KAFKA_BROKER = os.environ.get('KAFKA_BROKER')
TOPIC_NAME = os.environ.get('TOPIC_NAME')

In [4]:
def create_reddit_instance():
    """Build a PRAW client from the module-level Reddit settings.

    Returns:
        The object produced by ``praw.Reddit`` when called with
        REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT.
    """
    credentials = {
        "client_id": REDDIT_CLIENT_ID,
        "client_secret": REDDIT_CLIENT_SECRET,
        "user_agent": REDDIT_USER_AGENT,
    }
    return praw.Reddit(**credentials)

In [5]:
def create_kafka_producer(retries=5, retry_delay=5):
    """Connect to Kafka, retrying while no broker is reachable.

    Args:
        retries: Maximum number of connection attempts (default 5).
        retry_delay: Seconds to wait between two failed attempts (default 5).

    Returns:
        A KafkaProducer bound to KAFKA_BROKER whose values are serialized
        as UTF-8 encoded JSON.

    Raises:
        RuntimeError: if KAFKA_BROKER is not set, or if every attempt failed
            with NoBrokersAvailable (the last failure is chained as the cause).
    """
    if not KAFKA_BROKER:
        # Fail fast: retrying cannot fix a missing broker address.
        raise RuntimeError("KAFKA_BROKER is not set")

    last_error = None
    for attempt in range(1, retries + 1):
        try:
            producer = KafkaProducer(
                bootstrap_servers=[KAFKA_BROKER],
                value_serializer=lambda v: json.dumps(v).encode('utf-8')
            )
        except NoBrokersAvailable as exc:
            last_error = exc
            # Report the attempts that are actually still available.
            remaining = retries - attempt
            print(f"Kafka broker not found. Retries left: {remaining}")
            if remaining > 0:
                # Only wait when another attempt will follow.
                time.sleep(retry_delay)
        else:
            print("Connected to Kafka!")
            return producer
    raise RuntimeError("Failed to connect to Kafka after retries") from last_error


In [6]:
# Create the Reddit client first: it is bound before the Kafka connection is
# attempted, so `reddit` remains usable by later cells even if the next line raises.
reddit = create_reddit_instance()
# Raises RuntimeError when no Kafka broker can be reached.
producer = create_kafka_producer()

Kafka broker not found. Retries left: 5
Kafka broker not found. Retries left: 4
Kafka broker not found. Retries left: 3
Kafka broker not found. Retries left: 2
Kafka broker not found. Retries left: 1


RuntimeError: Failed to connect to Kafka after retries

In [7]:
# Subreddits to sample; the five newest submissions are requested from each.
subreddits = ["technology", "programming", "datascience", "MachineLearning"]

posts_data = []  # accumulates one dict per fetched submission

for subreddit in subreddits:
    print(f"Fetching r/{subreddit}...")
    newest = reddit.subreddit(subreddit).new(limit=5)
    # Keep only the fields needed downstream, tagged with the subreddit queried.
    posts_data.extend(
        {
            "id": post.id,
            "subreddit": subreddit,
            "title": post.title,
            "selftext": post.selftext,
            "created_utc": post.created_utc,
        }
        for post in newest
    )

print(f"Fetched {len(posts_data)} posts total.")

Fetching r/technology...
Fetching r/programming...
Fetching r/datascience...
Fetching r/MachineLearning...
Fetched 20 posts total.


In [9]:
!pip install pandas --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import pandas as pd

# Tabulate the collected posts: one row per post dict, one column per key.
df = pd.DataFrame(data=posts_data)
# Show up to the first ten rows as this cell's output.
df.head(n=10)


Unnamed: 0,id,subreddit,title,selftext,created_utc
0,1ihhtan,technology,This scrappy search upstart is getting thousan...,,1738676000.0
1,1ihhpje,technology,Trump orders USDA to take down websites refere...,,1738676000.0
2,1ihhm06,technology,Australia bans DeepSeek on government devices ...,,1738676000.0
3,1ihhcup,technology,70% of leaders admit they are comfortable usin...,,1738675000.0
4,1ihh7yg,technology,Treyarch co-founder pleads guilty to drone col...,,1738674000.0
5,1ihhqds,programming,Search logs faster than Sonic - Log search eng...,Learn about the data structures and algorithms...,1738676000.0
6,1ihgwli,programming,10 Lesser Known Options of Popular Linux Commands,,1738673000.0
7,1ihfro7,programming,What should semantic diffs highlight: The chan...,,1738669000.0
8,1ihfnhk,programming,Problem with concept of AI Agent and a convers...,,1738669000.0
9,1ihfjcd,programming,It's OK to hardcode feature flags,,1738668000.0
