In [1]:
import logging
import os
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Configure the root logger at INFO, then create this notebook's own logger.
logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)

In [2]:
# Kafka connection settings. Each value can be overridden through an
# environment variable; the defaults match the original notebook.
KAFKA_SERVERS = os.environ.get("KAFKA_SERVERS", "wilsoniumite.com:9092")
KAFKA_TOPIC = os.environ.get("KAFKA_TOPIC", "bluesky-firehose")
SASL_USERNAME = os.environ.get("KAFKA_SASL_USERNAME", "notebook1")

# The password is deliberately NOT stored in the notebook: export
# KAFKA_SASL_PASSWORD before starting the kernel.
SASL_PASSWORD = os.environ.get("KAFKA_SASL_PASSWORD", "")
if not SASL_PASSWORD:
    # Warn here so a missing credential is visible in this cell rather than
    # surfacing later as a Kafka authentication error.
    logging.getLogger(__name__).warning(
        "KAFKA_SASL_PASSWORD is not set; Kafka authentication will fail."
    )

In [3]:
# Build (or reuse) the notebook's SparkSession. All settings are passed as
# strings, exactly as Spark receives them.
spark = (
    SparkSession.builder
    .appName("BlueskyKafkaToEmbedding")
    .master("local[48]")
    # Kafka connector package, resolved when the session starts.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.1")
    # Memory sizing.
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "8")
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.3")
    # Parallelism and partition sizing.
    .config("spark.default.parallelism", "400")
    .config("spark.sql.shuffle.partitions", "400")
    .config("spark.sql.files.maxPartitionBytes", "256MB")
    # Adaptive query execution. Spark logs that it disables
    # spark.sql.adaptive.enabled for streaming DataFrames.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    # Kafka rate limit plus network/heartbeat timeouts.
    .config("spark.streaming.kafka.maxRatePerPartition", "50000")
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "60s")
    .getOrCreate()
)

# Reduce Spark's own log output to warnings and errors.
spark.sparkContext.setLogLevel("WARN")
print("✓ Spark initialized")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/18 15:38:00 WARN Utils: Your hostname, Tomass-Desktop, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/10/18 15:38:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/wilso/github/bluesky-streaming/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-561d13a6-75d6-4f32-9a07-4d0b3dce0a9a;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;4.0.1 in central
	found org.apache.kafka#kafka-clients;3.9.1 in central
	f

✓ Spark initialized


In [4]:
# Where the streaming job writes its parquet files and its checkpoint.
_OUTPUT_ROOT = "output"
OUTPUT_PARQUET = f"{_OUTPUT_ROOT}/raw_posts_kafka"
CHECKPOINT_DIR = f"{_OUTPUT_ROOT}/kafka_checkpoint"

# How long to keep the stream running, in seconds.
COLLECTION_DURATION = 20
# Passed to the Kafka source as its maxOffsetsPerTrigger option.
MAX_OFFSETS_PER_TRIGGER = 200_000


def create_kafka_stream_and_save():
    """Stream Bluesky posts from Kafka into parquet for a fixed time window.

    Subscribes to ``KAFKA_TOPIC`` on ``KAFKA_SERVERS`` using SASL/PLAIN
    credentials, parses each record's value as JSON, keeps only ``create``
    commits in the ``app.bsky.feed.post`` collection, and appends the selected
    columns (``did``, ``rkey``, ``text``, ``langs``, ``created_at``) to
    ``OUTPUT_PARQUET``. ``CHECKPOINT_DIR`` is used as the streaming checkpoint.

    The query is polled for up to ``COLLECTION_DURATION`` seconds and is always
    stopped before the function returns, even if monitoring is interrupted.

    Returns:
        None. Results are written to ``OUTPUT_PARQUET``.

    Raises:
        Exception: whatever ``query.exception()`` reports if the streaming
            query terminated by itself before the collection window elapsed.
    """
    # Expected shape of the JSON payload carried in each Kafka record's value.
    record_type = StructType([
        StructField("text", StringType(), True),
        StructField("langs", ArrayType(StringType()), True),
        StructField("createdAt", StringType(), True),
    ])
    commit_type = StructType([
        StructField("operation", StringType(), True),
        StructField("collection", StringType(), True),
        StructField("rkey", StringType(), True),
        StructField("record", record_type, True),
    ])
    schema = StructType([
        StructField("did", StringType(), True),
        StructField("time_us", LongType(), True),
        StructField("kind", StringType(), True),
        StructField("commit", commit_type, True),
    ])

    jaas_config = (
        "org.apache.kafka.common.security.plain.PlainLoginModule required "
        f'username="{SASL_USERNAME}" password="{SASL_PASSWORD}";'
    )
    kafka_options = {
        "kafka.bootstrap.servers": KAFKA_SERVERS,
        "subscribe": KAFKA_TOPIC,
        "kafka.security.protocol": "SASL_PLAINTEXT",
        "kafka.sasl.mechanism": "PLAIN",
        "kafka.sasl.jaas.config": jaas_config,
        "startingOffsets": "earliest",
        # Caps how many records a single micro-batch may pull.
        "maxOffsetsPerTrigger": str(MAX_OFFSETS_PER_TRIGGER),
    }

    print(f"Using STREAMING mode with {COLLECTION_DURATION}s timeout...")
    print(f"Max {MAX_OFFSETS_PER_TRIGGER} records per micro-batch")

    kafka_df = spark.readStream.format("kafka").options(**kafka_options).load()

    # Kafka delivers the value as bytes: cast to string, parse, then flatten.
    parsed_df = kafka_df.select(
        from_json(col("value").cast("string"), schema).alias("data")
    ).select("data.*")

    # Keep only newly created posts and project the columns to store.
    is_new_post = (
        (col("kind") == "commit")
        & (col("commit.operation") == "create")
        & (col("commit.collection") == "app.bsky.feed.post")
    )
    posts_df = parsed_df.filter(is_new_post).select(
        col("did"),
        col("commit.rkey").alias("rkey"),
        col("commit.record.text").alias("text"),
        col("commit.record.langs").alias("langs"),
        col("commit.record.createdAt").alias("created_at"),
    )

    print("Starting data collection from earliest offset...")
    print(f"Saving to: {OUTPUT_PARQUET}")

    query = (
        posts_df.writeStream
        .outputMode("append")
        .format("parquet")
        .option("path", OUTPUT_PARQUET)
        .option("checkpointLocation", CHECKPOINT_DIR)
        .trigger(processingTime="5 seconds")
        .start()
    )

    poll_interval_s = 5
    # Monotonic clock: the window length is unaffected by wall-clock changes.
    deadline = time.monotonic() + COLLECTION_DURATION
    last_reported_batch = object()  # sentinel that never equals a real batch id
    failure = None
    try:
        while query.isActive:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the end of the collection window.
            time.sleep(min(poll_interval_s, remaining))

            progress = query.lastProgress
            if not progress:
                continue
            # lastProgress keeps returning the same entry until a new batch
            # finishes; report each batch only once.
            batch_id = progress.get("batchId")
            if batch_id == last_reported_batch:
                continue
            last_reported_batch = batch_id

            num_rows = progress.get("numInputRows", 0)
            if num_rows > 0:
                print(f"Processed {num_rows} rows in last batch...")

        if not query.isActive:
            # The query ended before we asked it to; remember why.
            failure = query.exception()
    finally:
        # Always stop the query so it cannot keep running in the background.
        # stop() interrupts any micro-batch still in flight, which Spark
        # reports in its logs as an aborted write job.
        query.stop()

    if failure is not None:
        raise failure

    print("✓ Data collection complete")

In [5]:
# Run the collection: blocks until the stream has been polled for about
# COLLECTION_DURATION seconds and then stopped.
create_kafka_stream_and_save()

Using STREAMING mode with 20s timeout...
Max 200000 records per micro-batch
Starting data collection from earliest offset...
Saving to: output/raw_posts_kafka


25/10/18 15:38:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/10/18 15:38:15 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000} milliseconds, but spent 10170 milliseconds
[Stage 1:>                                                          (0 + 1) / 1]

Processed 200000 rows in last batch...


25/10/18 15:38:22 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000} milliseconds, but spent 7181 milliseconds
[Stage 2:>                                                          (0 + 1) / 1]

Processed 200000 rows in last batch...
✓ Data collection complete


25/10/18 15:38:25 ERROR FileFormatWriter: Aborting job 69959fbb-1760-4aba-9c40-325285a42865.
java.lang.InterruptedException
	at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1048)
	at scala.concurrent.impl.Promise$DefaultPromise.tryAwait0(Promise.scala:243)
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:255)
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:104)
	at org.apache.spark.util.ThreadUtils$.awaitReady(ThreadUtils.scala:374)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:998)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2484)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$1(FileFormatWriter.scala:240)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:270)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(

In [6]:
# Report how many posts are stored in the parquet output.
_rule = "=" * 60
print("\n" + _rule)
print("COLLECTION SUMMARY")
print(_rule)


# The count covers everything under OUTPUT_PARQUET, so it includes rows from
# earlier runs if the directory was not empty. The throughput figure below is
# therefore only meaningful for a fresh output directory.
df_check = spark.read.parquet(OUTPUT_PARQUET)
post_count = df_check.count()
print("✓ Streaming mode complete")
print(f"Total posts collected: {post_count:,}")
print(f"Collection time: {COLLECTION_DURATION} seconds")
print(f"Throughput: {post_count / COLLECTION_DURATION:.1f} posts/second")

# Rough size estimate: assume ~1 KB per post, so posts / 1024 gives MB.
estimated_size_mb = post_count / 1024
print(f"Estimated data size: ~{estimated_size_mb:.1f} MB")
print(_rule + "\n")


COLLECTION SUMMARY


25/10/18 15:38:25 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.TaskContextImpl.killTaskIfInterrupted(TaskContextImpl.scala:276)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:36)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)
	at scala.collection.Iterator$$anon$6.hasNext(Iterator.scala:477)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:110)
	at org.apache.spark.sql.execution.datasources.FileFo

✓ Streaming mode complete
Total posts collected: 376,192
Collection time: 20 seconds
Throughput: 18809.6 posts/second
Estimated data size: ~367.4 MB

