# Install Imports

In [10]:
pip install atproto pandas -q


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


# Imports

In [11]:
import multiprocessing
import signal
import time
from collections import defaultdict
from types import FrameType
from typing import Any
import sqlite3
from atproto import CAR, AtUri, FirehoseSubscribeReposClient, firehose_models, models, parse_subscribe_repos_message


In [12]:
# Database setup (storing raw data instead of overloading memory)
conn = sqlite3.connect("firehose.db")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS firehose (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        message TEXT
    )
""")
conn.commit()


## Start scraping data with Firehose and store in Database

In [13]:
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message
import sqlite3
import json

# Limit messages for testing (set to None for unlimited streaming)
MAX_MESSAGES = 5
message_count = 0


# Initialize Firehose client
client = FirehoseSubscribeReposClient()

def on_message_handler(message) -> None:
    global message_count
    if MAX_MESSAGES and message_count >= MAX_MESSAGES:
        print("Reached message limit. Stopping Firehose.")
        client.stop()
        return

    try:
        parsed_msg = parse_subscribe_repos_message(message)
        message_json = json.dumps(parsed_msg, default=str)

        # Store in SQLite instead of printing to prevent memory overload
        cur.execute("INSERT INTO firehose (timestamp, repo, message) VALUES (?, ?, ?)", 
                    (parsed_msg["time"], parsed_msg["repo"], message_json))
        conn.commit()

        print(f"Stored message {message_count + 1}: {parsed_msg['repo']}")
        message_count += 1

    except Exception as e:
        print(f"Error processing message: {e}")

# Clear the database before inserting new messages
cur.execute("DELETE FROM firehose")
conn.commit()

# Start Firehose listener
client.start(on_message_handler)


Stored message 1: did:plc:6mejpxzd7zrrjodfc7k6esjb
Stored message 2: did:plc:u7hnk4ginavxh6ipba7us3os
Stored message 3: did:plc:cocbyqzugxjp3wxugugy55ot
Stored message 4: did:plc:36rmtzudiva7diftxdnzqob5
Stored message 5: did:plc:ur2rls6j7gdydumu3odphdtz
Reached message limit. Stopping Firehose.


## Example Output:


In [14]:
import sqlite3
import json

# Connect to the database
conn = sqlite3.connect("firehose.db")
cur = conn.cursor()

# Set LIMIT to number of returns 
cur.execute("SELECT timestamp, repo, message FROM firehose LIMIT 1")
rows = cur.fetchall()

# Display the results
for row in rows:
    timestamp, repo, message_json = row
    message = json.loads(message_json)  # Convert JSON string to dictionary
    print(f"\n📌 Entry from {repo} at {timestamp}:")
    print(json.dumps(message, indent=2))  # Pretty print JSON

conn.close()



📌 Entry from did:plc:6mejpxzd7zrrjodfc7k6esjb at 2025-02-27T15:49:16.211Z:
"blobs=[] blocks=b':\\xa2eroots\\x81\\xd8*X%\\x00\\x01q\\x12 T\\xabS\\x92\\x1b\\xc4\\xf8\\xba>(:6\\x05vZ\\xea\\xb0o\\x0b\\xf6d\\x0b\\xf0\\x8a{g\\xef\\xf3t\\x90\\xc3Ugversion\\x01\\xd1\\x01\\x01q\\x12 \\xdcFH\\xd0\\x03nW\\xfb;\\xda\\xf2\\xdc\\x85\\xd5\\xe7\\x86J\\xe7\\x9f\\xf4\\xdf}\\xa2j\\xf1\\xddZY\\x8f\\x93k\\x15\\xa2ae\\x81\\xa4akX app.bsky.feed.like/3lchr65z6qc22ap\\x00at\\xd8*X%\\x00\\x01q\\x12 \\xdbR\\xaal`\\x92;\\xe28\\xc4=\\xe7\\xa6Ux\\xe1\\xa7\\x1e\\xb7\\x8d\\xc8\\x04\\xc5\\xce\\x89E\\xb8g\\xe35\\n\\x8aav\\xd8*X%\\x00\\x01q\\x12 \\xe1\\xd1\\xf6\\x8f+\\xf2\\x94\\x85\\xb1\\xd8\\x0b\\x90FH\\xeeWe4|\\x12`\\xd9F^T\\xc1\\x9a?\\xdb\\x895\\xcfal\\xd8*X%\\x00\\x01q\\x12 \\x0f!S\\x80>,\\x07\\x81Ta\\x06v\\xe6_\\x80\\xb5\\xe7\\xfa!b\\xe3\\xe2i\\xf5\\xf4\\x9b\\x0e\\xa5\\x8a\\xd1\\x07:S\\x01q\\x12 \\xdbR\\xaal`\\x92;\\xe28\\xc4=\\xe7\\xa6Ux\\xe1\\xa7\\x1e\\xb7\\x8d\\xc8\\x04\\xc5\\xce\\x89E\\xb8g\\xe35\\n\\x8a\\xa2a

In [3]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect("firehose.db")
cur = conn.cursor()

# Create tables for structured data
cur.execute("""
    CREATE TABLE IF NOT EXISTS posts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        post_text TEXT,
        post_uri TEXT,
        cid TEXT
    )
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS likes (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        liked_post TEXT,
        liked_by TEXT
    )
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS follows (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        followed_user TEXT
    )
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS reposts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        reposted_post TEXT,
        reposted_by TEXT
    )
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS profiles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        display_name TEXT,
        bio TEXT
    )
""")

conn.commit()
conn.close()


In [4]:
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message, CAR, models
import sqlite3
import json

MAX_MESSAGES = 50  # Limit to 50 messages for testing
message_count = 0

client = FirehoseSubscribeReposClient()

# Connect to database
conn = sqlite3.connect("firehose.db")
cur = conn.cursor()

_INTERESTED_RECORDS = {
    models.ids.AppBskyFeedPost: models.AppBskyFeedPost,  # Regular posts
    models.ids.AppBskyFeedLike: models.AppBskyFeedLike,  # Likes
    models.ids.AppBskyFeedRepost: models.AppBskyFeedRepost,  # Reposts
    models.ids.AppBskyGraphFollow: models.AppBskyGraphFollow,  # Follows
    models.ids.AppBskyActorProfile: models.AppBskyActorProfile,  # Profile updates
}

def on_message_handler(message) -> None:
    global message_count
    if MAX_MESSAGES and message_count >= MAX_MESSAGES:
        print("Reached message limit. Stopping Firehose.")
        client.stop()
        conn.close()
        return

    try:
        commit = parse_subscribe_repos_message(message)
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            return
        
        if not commit.blocks:
            return

        car = CAR.from_bytes(commit.blocks)

        for op in commit.ops:
            if op.action != 'create' or not op.cid:
                continue

            uri = f"at://{commit.repo}/{op.path}"
            record_raw_data = car.blocks.get(op.cid)
            if not record_raw_data:
                continue

            record = models.get_or_create(record_raw_data, strict=False)
            record_type = _INTERESTED_RECORDS.get(op.path.split("/")[0])

            if not record_type:
                continue
            
            # Store based on record type
            if record_type == models.AppBskyFeedPost:
                cur.execute("INSERT INTO posts (timestamp, repo, post_text, post_uri, cid) VALUES (?, ?, ?, ?, ?)",
                            (commit.time, commit.repo, getattr(record, 'text', ''), uri, str(op.cid)))

            elif record_type == models.AppBskyFeedLike:
                cur.execute("INSERT INTO likes (timestamp, repo, liked_post, liked_by) VALUES (?, ?, ?, ?)",
                            (commit.time, commit.repo, record.subject.uri, commit.repo))

            elif record_type == models.AppBskyFeedRepost:
                cur.execute("INSERT INTO reposts (timestamp, repo, reposted_post, reposted_by) VALUES (?, ?, ?, ?)",
                            (commit.time, commit.repo, record.subject.uri, commit.repo))

            elif record_type == models.AppBskyGraphFollow:
                cur.execute("INSERT INTO follows (timestamp, repo, followed_user) VALUES (?, ?, ?)",
                            (commit.time, commit.repo, record.subject.did))

            elif record_type == models.AppBskyActorProfile:
                cur.execute("INSERT INTO profiles (timestamp, repo, display_name, bio) VALUES (?, ?, ?, ?)",
                            (commit.time, commit.repo, getattr(record, 'displayName', ''), getattr(record, 'description', '')))

            conn.commit()

            print(f"Stored {record_type.__name__} from {commit.repo}")

            message_count += 1

    except Exception as e:
        print(f"Error processing message: {e}")

# Start Firehose listener
client.start(on_message_handler)


Stored atproto_client.models.app.bsky.feed.like from did:plc:chty73tequjbztywchad765d
Stored atproto_client.models.app.bsky.feed.like from did:plc:oc3ywcnnk2nswukqdm74d3yk
Stored atproto_client.models.app.bsky.feed.like from did:plc:sxhvvd27nuho3aqdfxybwt3d
Stored atproto_client.models.app.bsky.feed.like from did:plc:j7vj5wokcutei343cckwtex5
Stored atproto_client.models.app.bsky.feed.like from did:plc:v637xmeh4reewfywffsfhpie
Error processing message: 'str' object has no attribute 'did'
Stored atproto_client.models.app.bsky.feed.like from did:plc:vc5lvqfhce565dob5y3ssulk
Stored atproto_client.models.app.bsky.feed.like from did:plc:y7ck5r2twuoeeli4cgpl6e23
Error processing message: 'str' object has no attribute 'did'
Stored atproto_client.models.app.bsky.feed.like from did:plc:63e7n66nr6aie4vxtqukk467
Stored atproto_client.models.app.bsky.feed.like from did:plc:zhy5q6ywonyzggakeff7zwr3
Stored atproto_client.models.app.bsky.feed.post from did:plc:5xfyfym6exzynhxw4ajhuury
Stored atproto_c

In [5]:
import sqlite3

conn = sqlite3.connect("firehose.db")
cur = conn.cursor()

cur.execute("SELECT timestamp, repo, post_text FROM posts LIMIT 5")
rows = cur.fetchall()

for row in rows:
    timestamp, repo, post_text = row
    print(f"📌 Post from {repo} at {timestamp}: {post_text}")

conn.close()


📌 Post from did:plc:5xfyfym6exzynhxw4ajhuury at 2025-02-27T16:46:20.524Z: thank you estrogen goddess, i love my tiddies
📌 Post from did:plc:rxhtzw34gsd52znan2t65mmq at 2025-02-27T16:46:20.540Z: 👀
www.threads.net/@aaron.rupar...
📌 Post from did:plc:tydlpbiznphrg5gp3t7tdrao at 2025-02-27T16:46:20.563Z: Weekend outfit ready!
@kink3d.com
📌 Post from did:plc:ub5v44p7lhrqrjswhf5snq7h at 2025-02-27T16:46:20.567Z: کلا غذا رو سر کار ممنوع کنین. اصلا چه معنی داره غذا تو آفیس؟ حتی سالاد. بوی تخم مرغ و سرکه و بروکلی توش…
📌 Post from did:plc:ub5v44p7lhrqrjswhf5snq7h at 2025-02-27T16:46:20.567Z: حالا که بحث غذاست بی‌زحمت خوراکی تو سینما هم ممنوع کنین. دو ساعت فیلمه، غذا بخور بیا، نمیمیری… بوی کره و مرغ سوخاری و صدای چیپس و بادوم… 😑


In [6]:
conn = sqlite3.connect("firehose.db")
cur = conn.cursor()

cur.execute("SELECT timestamp, repo, liked_post FROM likes LIMIT 5")
rows = cur.fetchall()

for row in rows:
    timestamp, repo, liked_post = row
    print(f"👍 {repo} liked {liked_post} at {timestamp}")

conn.close()


👍 did:plc:chty73tequjbztywchad765d liked at://did:plc:2jkhlqvbhakuva6vtm5regvf/app.bsky.feed.post/3lj6cyacues22 at 2025-02-27T16:46:20.512Z
👍 did:plc:oc3ywcnnk2nswukqdm74d3yk liked at://did:plc:znqs6r4ode6z4clxboqy5ook/app.bsky.feed.post/3lj4fdce6kc2o at 2025-02-27T16:46:20.516Z
👍 did:plc:sxhvvd27nuho3aqdfxybwt3d liked at://did:plc:lvqd5umu4z7sowd7y67qn7ro/app.bsky.feed.post/3lj6dfntgke2v at 2025-02-27T16:46:20.517Z
👍 did:plc:j7vj5wokcutei343cckwtex5 liked at://did:plc:rxtvqlxzouolf665vb2f7o5i/app.bsky.feed.post/3lj647smgfs2x at 2025-02-27T16:46:20.518Z
👍 did:plc:v637xmeh4reewfywffsfhpie liked at://did:plc:tm3djotsgwdkml5jn5rpubhd/app.bsky.feed.post/3lj4rxrby3227 at 2025-02-27T16:46:20.519Z


In [7]:
import json
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message

MAX_MESSAGES = 50  # Capture first 50 messages for debugging
message_count = 0

client = FirehoseSubscribeReposClient()

def on_message_handler(message) -> None:
    global message_count
    if MAX_MESSAGES and message_count >= MAX_MESSAGES:
        print("Reached message limit. Stopping Firehose.")
        client.stop()
        return

    try:
        parsed_msg = parse_subscribe_repos_message(message)
        message_json = json.dumps(parsed_msg, default=str, indent=2)

        # 🔹 Print the raw JSON message to inspect its contents
        print("\n🔍 RAW FIREHOSE MESSAGE:")
        print(message_json)

        message_count += 1

    except Exception as e:
        print(f"Error processing message: {e}")

# Start Firehose listener
client.start(on_message_handler)



🔍 RAW FIREHOSE MESSAGE:
"blobs=[] blocks=b':\\xa2eroots\\x81\\xd8*X%\\x00\\x01q\\x12 \\x9d\\xb6~\\x7f\\x9c\\xd4\\x82opK\\xf3\\xb0)\\xb98N\\x07\\x1b\\xf8\\x0e;b\\xe7\\xd8l?9-\\r\\xf9?\\xd7gversion\\x01\\xd1\\x01\\x01q\\x12 \\xf4`t\\xdd\\xbe-\\xfb6\\xf5\\x93\\xce\\x95H\\x04K\\x1fY\\x9e\\x0c\\x98\"2\\x06]\\x8b9\\x9d\\xc5u\\t!\\x8b\\xa2ae\\x81\\xa4akX app.bsky.feed.like/3liwuu5htjn2fap\\x00at\\xd8*X%\\x00\\x01q\\x12 ?\\xb9\\xe9\\x9a7|\\xd1u|\\x10\\x01T\\x85\\xc4\\xaa<\\x13\\xf0\\x80\\xecvJ\\x16\\x89\\xb9r\\x9b\\xa1\\xd3\\xbahfav\\xd8*X%\\x00\\x01q\\x12 \\x05\\x15[\\x9c1\\xad\\x7f{\\x8c\\xa5\\xf9\\x90\\xe9\\x81\\x19\\xccz\\xed \\xcf\\x0c\\xf9*\\xfa(\\xad\\xb1=\\x92\\xe0\\x9f\\xe4al\\xd8*X%\\x00\\x01q\\x12 \\xce\\x1c\\x19K\\r\\xcf\\x82\\x92BD.\\xf0\\xd0\\n\\x96U\\xb7~ag\\xf6\\x02\\x05u\\x93\\x84\\xe5\"\\xce\\xea\\x88\\xa6S\\x01q\\x12 ?\\xb9\\xe9\\x9a7|\\xd1u|\\x10\\x01T\\x85\\xc4\\xaa<\\x13\\xf0\\x80\\xecvJ\\x16\\x89\\xb9r\\x9b\\xa1\\xd3\\xbahf\\xa2ae\\x80al\\xd8*X%\\x00\\x01q\\x12 d\\xb2\\