<a href="https://colab.research.google.com/github/PedroTechy/DataProcessingEdit/blob/main/spark_streaming/examples/2-checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Checkpoint

# Setting up PySpark

In [1]:
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()

In [3]:
!rm -rf content/input/*
!rm -rf content/output/*
!rm -rf content/checkpoint/*

In [4]:
from datetime import datetime
import csv

def generate_file():
  timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
  filename = f"content/input/file_{timestamp}.csv"
  with open(filename, 'w', newline='') as csvfile:
      fieldnames = ['col', 'value', 'file']
      writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=";")
      writer.writeheader()
      writer.writerow({'col': 'c1', 'value': 'v1', 'file': filename})
      writer.writerow({'col': 'c2', 'value': 'v2', 'file': filename})
      writer.writerow({'col': 'c3', 'value': 'v3', 'file': filename})

!mkdir -p content/input

In [6]:
generate_file()

In [7]:
spark.read.format("csv").option("sep", ";").option("header", True).load("content/input/").show(100, False)

+---+-----+-------------------------------------+
|col|value|file                                 |
+---+-----+-------------------------------------+
|c1 |v1   |content/input/file_20241123143435.csv|
|c2 |v2   |content/input/file_20241123143435.csv|
|c3 |v3   |content/input/file_20241123143435.csv|
+---+-----+-------------------------------------+



In [10]:
generate_file() #each time i run this it adds a new csv, simulating the arrival of data in our server

In [11]:
from pyspark.sql.types import *

schema = StructType([
StructField('Col',StringType(),True),
StructField('Value',StringType(),True),
StructField('File',StringType(),True)
])

stream = spark.readStream.format('csv').schema(schema).option("sep", ";").option('header', True).load('content/input/')

In [12]:
query = (stream.writeStream
.format('csv')
.option("header", True)
.queryName("stream")
.option('checkpointLocation', 'content/checkpoint')
.option('path', 'content/output')
.trigger(processingTime='5 seconds')
.outputMode('append')
.start()
)

In [15]:
stream.isStreaming

True

In [17]:
print(spark.read.csv('content/output', header=True, sep=";").count())
spark.read.csv('content/output', header=True, sep=",").show(100, False)

12
+---+-----+-------------------------------------+
|Col|Value|File                                 |
+---+-----+-------------------------------------+
|c1 |v1   |content/input/file_20241123143435.csv|
|c2 |v2   |content/input/file_20241123143435.csv|
|c3 |v3   |content/input/file_20241123143435.csv|
|c1 |v1   |content/input/file_20241123143527.csv|
|c2 |v2   |content/input/file_20241123143527.csv|
|c3 |v3   |content/input/file_20241123143527.csv|
|c1 |v1   |content/input/file_20241123143529.csv|
|c2 |v2   |content/input/file_20241123143529.csv|
|c3 |v3   |content/input/file_20241123143529.csv|
|c1 |v1   |content/input/file_20241123143530.csv|
|c2 |v2   |content/input/file_20241123143530.csv|
|c3 |v3   |content/input/file_20241123143530.csv|
+---+-----+-------------------------------------+



In [18]:
query.stop()

In [19]:
query.isActive

False