In [2]:
pip install atproto -q


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from atproto import FirehoseSubscribeReposClient, parse_subscribe_repos_message

# Initialize Firehose client
client = FirehoseSubscribeReposClient()

MAX_MESSAGES = 1  # Limit messages for testing
message_count = 0  # Counter to track received messages

def on_message_handler(message) -> None:
    """Handler that stops after receiving MAX_MESSAGES"""
    global message_count
    if message_count >= MAX_MESSAGES:
        print("✅ Reached message limit. Stopping Firehose.")
        client.stop()
        return

    try:
        parsed_msg = parse_subscribe_repos_message(message)
        print("🔹 Received Message:", parsed_msg)

        message_count += 1

    except Exception as e:
        print(f"❌ Error processing message: {e}")

# Start the Firehose stream
client.start(on_message_handler)


🔹 Received Message: blobs=[] blocks=b':\xa2eroots\x81\xd8*X%\x00\x01q\x12 e\xd4\xee\xaa6q\x0c.\xd2\xc9\x04\xb0\xa0?\xb5\xf8\x14\xbb\xa1\xb4\xbf\xf0]-\x19\xe8\xe4]A\x86\x10Vgversion\x01\x99\x04\x01q\x12 \xa0#\x867\xff\x0f4t\n\xd2\x8a\xa2\x88\xbe\xa4\xe9\x84\xb9OP<\xa5]\xcdl\xce\x0c2b/\xe6a\xa2ae\x84\xa4akX app.bsky.feed.like/3lcgh6wxgdc2oap\x00at\xd8*X%\x00\x01q\x12 E\xeem\xf2Kb\x8d\xd2$\x94z\xc5\x05\xa5C+\xc4\x88\xb9k \xa6\xe5\x80\xe0w$nP\x1c\xd6\x96av\xd8*X%\x00\x01q\x12 ,h\x06) \xf5\x1b\x83\x05\xda\x94r\x0c\xca\x92\x01\x9a2dD\xf8\xe4\xfa\x06I\xdaf\xb8\xd1B\xaa(\xa4akKh6crnxknj2uap\x15at\xd8*X%\x00\x01q\x12 \x10\xe5\xb6\xe5\x7f~\x00\xf6"Xb@\x92\x88\xde\x9b\x97\xd5N\t\xf7\x8a\xf8\x94\x9e}\x9b\xb5]"\x02\x00av\xd8*X%\x00\x01q\x12 \x90I\xbd\x0e\xc22\x02\x0cgx`\xf8\xab/\xce/\xff\x9b\x9aoV@|<8\x1c\xe9:g\xd8~h\xa4akKit6pomhbv2pap\x15at\xd8*X%\x00\x01q\x12 V\x96"3\x1e\x02\x87\xf9\x1e\xb3V\'\x832\x850\xaaL9\xc7\xd4\xf1\x8d\xf3\x14;S\xd1A\x9d\x96\xd0av\xd8*X%\x00\x01q\x12 \xa0\x9d\xd7=\xa9\xaa\

In [3]:
from atproto import CAR, models, FirehoseSubscribeReposClient, parse_subscribe_repos_message
import json
import threading

client = FirehoseSubscribeReposClient()

MAX_MESSAGES = 50  # Limit messages for testing
message_count = 0  # Counter
firehose_data = []  # List to store structured messages

def stop_firehose():
    """Stops the Firehose listener after MAX_MESSAGES messages."""
    print("✅ Reached message limit. Stopping Firehose.")
    client.stop()

def on_message_handler(message) -> None:
    """Handler that parses Firehose messages and extracts key data"""
    global message_count
    if message_count >= MAX_MESSAGES:
        stop_firehose()
        return

    try:
        commit = parse_subscribe_repos_message(message)
        
        if not isinstance(commit, models.ComAtprotoSyncSubscribeRepos.Commit):
            print("⚠️ Skipping non-commit message.")
            return

        if not commit.blocks:
            print("⚠️ No blocks found in message.")
            return

        car = CAR.from_bytes(commit.blocks)

        structured_data = {
            "repo": commit.repo,
            "timestamp": commit.time,
            "rev": commit.rev,
            "seq": commit.seq,
            "ops": []
        }

        for op in commit.ops:
            if op.action != "create" or not op.cid:
                continue

            uri = f"at://{commit.repo}/{op.path}"
            record_raw_data = car.blocks.get(op.cid)
            if not record_raw_data:
                continue

            record = models.get_or_create(record_raw_data, strict=False)
            
            structured_data["ops"].append({
                "action": op.action,
                "path": op.path,
                "cid": str(op.cid),
                "record": record.__dict__,  # Convert record to dictionary
            })

        firehose_data.append(structured_data)  # Store structured data
        
        message_count += 1

        # If MAX_MESSAGES reached, stop Firehose in a separate thread
        if message_count >= MAX_MESSAGES:
            threading.Thread(target=stop_firehose).start()

        print(f"✅ Processed Message {message_count}/{MAX_MESSAGES} from {commit.repo}")

    except Exception as e:
        print(f"❌ Error processing message: {e}")

# Start the Firehose stream
client.start(on_message_handler)

# Display structured messages after collection
firehose_data[:10]  # Show first 3 parsed messages


✅ Processed Message 1/50 from did:plc:yjgoeuojdr6l65adgosgv2pq
✅ Processed Message 2/50 from did:plc:suobviajzflbzubtr3ygxxzv
✅ Processed Message 3/50 from did:plc:4ma6yfxzix5miy2fe3xuakvs
✅ Processed Message 4/50 from did:plc:pvd4vqmuppfifak37n3unpuh
✅ Processed Message 5/50 from did:plc:c7e6mer6xji3n3cq66humlyo
✅ Processed Message 6/50 from did:plc:pq6t6n54xllz3q46uy2sywgg
✅ Processed Message 7/50 from did:plc:cgg4li7ufbtawkn2ddldpnm5
✅ Processed Message 8/50 from did:plc:meyxcju45gqhq6mstj7asj2u
✅ Processed Message 9/50 from did:plc:5wy3qrh77shtlqea4ywkipka
✅ Processed Message 10/50 from did:plc:4fdwoqc2wm6utxd5thonlegz
✅ Processed Message 11/50 from did:plc:rovx5p2ij2qehhg6dxswzghk
✅ Processed Message 12/50 from did:plc:ctk7tn6rumlwooqvbjwxaotq
✅ Processed Message 13/50 from did:plc:dvu22kixdfguntdpokrgfymz
✅ Processed Message 14/50 from did:plc:vslxofb5f6utxcvplb6myxj6
✅ Processed Message 15/50 from did:plc:7xstlbt2gvhydxo2jypq7xzw
✅ Processed Message 16/50 from did:plc:lemuczx2mp

[{'repo': 'did:plc:yjgoeuojdr6l65adgosgv2pq',
  'timestamp': '2025-03-03T16:03:55.212Z',
  'rev': '3ljidattoib2t',
  'seq': 5941422866,
  'ops': [{'action': 'create',
    'path': 'app.bsky.graph.follow/3ljidattcrb2t',
    'cid': 'bafyreia3wfi226c7rhfcvrilwhl2oojl4y2ymp5xq6idz4t3x7h4vqtq2e',
    'record': {'created_at': '2025-03-03T16:03:55.033Z',
     'subject': 'did:plc:ikaiby2l66hk2l54pce5c5ag',
     'py_type': 'app.bsky.graph.follow'}}]},
 {'repo': 'did:plc:suobviajzflbzubtr3ygxxzv',
  'timestamp': '2025-03-03T16:03:55.213Z',
  'rev': '3ljidattuxe2y',
  'seq': 5941422867,
  'ops': [{'action': 'create',
    'path': 'app.bsky.feed.like/3ljidattn5e2y',
    'cid': 'bafyreidhivy6kbuskk5qxwbnocwuiuo5cpo5foqjjs3vuzm7jarewsstwq',
    'record': {'created_at': '2025-03-03T16:06:09.521Z',
     'subject': Main(cid='bafyreib233qc62srtqrjrwwjsgaucdhukpqrfbedgllinetxl3xxbwpsqe', uri='at://did:plc:wamrwgjy7ikqasv7vc7uuq35/app.bsky.feed.post/3ljicoa26ok23', py_type='com.atproto.repo.strongRef'),
   

In [None]:
# Filter the CREATE interactions based on type: Follow, Post, Repost, etc. and save it into Databases
# Modularize it --> 
# 

In [6]:
import sqlite3
import json

# Connect to SQLite
conn = sqlite3.connect("firehose.db")
cur = conn.cursor()

# Create table
cur.execute("""
    CREATE TABLE IF NOT EXISTS firehose_events (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        repo TEXT,
        event_type TEXT,
        text TEXT,
        subject_uri TEXT,
        record_json TEXT
    )
""")

# Insert records from Firehose data
for message in firehose_data:
    for op in message["ops"]:
        event_type = op["path"].split("/")[0]  # Extract event type
        
        # 🛠️ Convert record JSON string to a dictionary
        record_data = json.loads(op["record"]) if isinstance(op["record"], str) else op["record"]
        
        text = record_data.get("text", None)  # Get post text, if available
        subject_uri = record_data.get("subject", {}).get("uri", None)  # Get subject URI if exists
        record_json = json.dumps(record_data)  # Store full event as JSON

        cur.execute("""
            INSERT INTO firehose_events (timestamp, repo, event_type, text, subject_uri, record_json)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (message["timestamp"], message["repo"], event_type, text, subject_uri, record_json))

conn.commit()
conn.close()

print("✅ Firehose data successfully stored in SQLite database.")


AttributeError: 'str' object has no attribute 'get'

In [5]:
for message in firehose_data:
    for op in message["ops"]:
        print("🔍 DEBUG - op['record'] Type:", type(op["record"]))
        print("🔍 DEBUG - op['record'] Value:", op["record"])


🔍 DEBUG - op['record'] Type: <class 'dict'>
🔍 DEBUG - op['record'] Value: {'created_at': '2025-03-03T16:03:55.033Z', 'subject': 'did:plc:ikaiby2l66hk2l54pce5c5ag', 'py_type': 'app.bsky.graph.follow'}
🔍 DEBUG - op['record'] Type: <class 'dict'>
🔍 DEBUG - op['record'] Value: {'created_at': '2025-03-03T16:06:09.521Z', 'subject': Main(cid='bafyreib233qc62srtqrjrwwjsgaucdhukpqrfbedgllinetxl3xxbwpsqe', uri='at://did:plc:wamrwgjy7ikqasv7vc7uuq35/app.bsky.feed.post/3ljicoa26ok23', py_type='com.atproto.repo.strongRef'), 'py_type': 'app.bsky.feed.like'}
🔍 DEBUG - op['record'] Type: <class 'dict'>
🔍 DEBUG - op['record'] Value: {'created_at': '2025-03-03T16:03:55.009Z', 'subject': Main(cid='bafyreiekcxve4qwxbe4xkooputgfqesvp32nmmtowdlksp2asn737lmd7e', uri='at://did:plc:gc72jnrd4restjing3m4arzc/app.bsky.feed.post/3ljibi7s3fo2a', py_type='com.atproto.repo.strongRef'), 'py_type': 'app.bsky.feed.like'}
🔍 DEBUG - op['record'] Type: <class 'dict'>
🔍 DEBUG - op['record'] Value: {'created_at': '2025-03-03