# Imports

In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Helper Function

In [2]:
def generate_kmers(line: str, k: int = 3) -> list:
    """
    Return every overlapping k-mer (substring of length k) of a line, in order.

    The line is normalised first: leading/trailing whitespace is stripped,
    the text is lower-cased, and space characters are removed (other interior
    whitespace such as tabs is kept).
    Example: "abcdef", k=3 → ['abc', 'bcd', 'cde', 'def']

    Args:
        line: Raw input text.
        k: Length of each k-mer; must be at least 1.

    Returns:
        The k-mers from left to right, or an empty list when the normalised
        line is shorter than k.

    Raises:
        ValueError: If k is less than 1.
    """
    # A non-positive k would otherwise yield empty or wrong-length slices
    # instead of failing, silently polluting the downstream counts.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    text = line.strip().lower().replace(" ", "")

    # When the text is shorter than k the range is empty, so no k-mers result.
    return [text[start:start + k] for start in range(len(text) - k + 1)]

# K-Mer Count

## Initialize Spark and Streaming Contexts

In [4]:
# Local Spark master with two threads, registered under the app name "KMerStreaming".
sc = SparkContext(master="local[2]", appName="KMerStreaming")
# Streaming context layered on that SparkContext, using a batch duration of 10.
ssc = StreamingContext(sparkContext=sc, batchDuration=10)



## Create DStream from TCP source

In [5]:
# DStream of text lines read from a TCP socket at localhost:9999.
lines = ssc.socketTextStream(hostname="localhost", port=9999)

## Generate k-mers for each line

In [6]:
def _kmers_of_line(text):
    """Return the 3-mers of a single input line."""
    return generate_kmers(text, k=3)


# Flatten the per-line k-mer lists into one stream of individual k-mers.
kmers = lines.flatMap(_kmers_of_line)

## Count each k-mer

In [7]:
# Pair each k-mer with a count of one, then sum the ones per distinct k-mer.
kmer_pairs = kmers.map(lambda token: (token, 1))
kmer_counts = kmer_pairs.reduceByKey(lambda left, right: left + right)

## Print the results

In [8]:
# Print the first elements of each batch's (k-mer, count) pairs; 10 is pprint's default limit.
kmer_counts.pprint(num=10)

## Start and wait for termination

In [None]:
# Start the streaming computation and block until it terminates.
try:
    ssc.start()
    ssc.awaitTermination()
finally:
    # Always shut down the streaming context and its SparkContext, even when
    # the wait ends with an exception (e.g. a notebook interrupt). Otherwise
    # the contexts stay alive and the setup cell cannot be run again.
    ssc.stop(stopSparkContext=True, stopGraceFully=False)