In [74]:
import findspark
from pyspark.sql import SparkSession, functions as F, types as T
findspark.init()

spark = SparkSession.builder.appName('test some functions').getOrCreate()

In [6]:
data = [("Omar Youssef",1),
        ("Nourhan Abdallah",2)]

df = spark.createDataFrame(data, schema=['name','id'])
df.show()

                                                                                

+----------------+---+
|            name| id|
+----------------+---+
|    Omar Youssef|  1|
|Nourhan Abdallah|  2|
+----------------+---+



In [None]:
words = df.select((F.split(df.name," ")))

+-------------------+
| split(name,  , -1)|
+-------------------+
|    [Omar, Youssef]|
|[Nourhan, Abdallah]|
+-------------------+



In [17]:
df.withColumns({'fname':F.split(df.name," ")[0],
                'lname':F.split(df.name,' ')[1]}).show()

+----------------+---+-------+--------+
|            name| id|  fname|   lname|
+----------------+---+-------+--------+
|    Omar Youssef|  1|   Omar| Youssef|
|Nourhan Abdallah|  2|Nourhan|Abdallah|
+----------------+---+-------+--------+



In [None]:
#Create read stream reads all csv files will appear or created in files directory with automatically schema  detection
stream_df  = spark.readStream.csv('files/',header=True,inferSchema=True)

#Transformation
#Concatenate fname and lname column and separate between both of them by a space
#Add timestamp column to the output
f_stream_df = stream_df.withColumns({'full_name':F.concat(F.col('fname'),F.col('lname'), F.lit(" ")),
                                     'timestamp': F.current_timestamp()})

#Append output mode is the only mode available for csv files format and save the output in output directory
#It saves the output as files any amendments will create new file named part-*.csv
query = f_stream_df.writeStream.outputMode('append').format('csv').option('path',"output").option('checkpointLocation','checkpointLocation').start()

#Prevent termination after 60 seconds
query.awaitTermination(60)

25/12/11 20:40:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


False

In [47]:
output_df = spark.read.csv('files/*.csv',header=True,inferSchema=True)
output_df.show()

25/12/11 20:47:53 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: files/*.csv.
java.io.FileNotFoundException: File files/*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource$$anonfun$ap

+-------+--------+
|  fname|   lname|
+-------+--------+
|  aziza|    omar|
|  pedri|gonzalez|
|  sayed|  moawad|
|   omar| youssef|
|nourhan|abdallah|
|mohamed|     ali|
|    leo|   messi|
+-------+--------+



In [None]:
#This format generates rows per second contains two columns timestamp and value (increasing numbers)
df = spark.readStream.format('rate').option('rowPerSecond',2).load()

query= df.writeStream.outputMode('append').format('console').start()
query.awaitTermination(10)

if query.isActive:
    query.stop()

25/12/11 21:22:44 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3ff5990a-fce6-4530-9857-b749901d7fda. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/12/11 21:22:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+

-------------------------------------------
Batch: 56
-------------------------------------------
+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2025-12-11 21:22:...|   55|
+--------------------+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2025-12-11 21:22:...|    0|
+--------------------+-----+

-------------------------------------------
Batch: 57
-------------------------------------------
+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2025-12-11 21:22:...|   56|
+--------------------+-----+

-------------------------------------------
Batch: 2
-------------------------------------------
+-

25/12/11 21:22:54 WARN DAGScheduler: Failed to cancel job group 6e5f8077-dd31-4321-b80d-31dbcf158f33. Cannot find active jobs for it.
25/12/11 21:22:54 WARN DAGScheduler: Failed to cancel job group 6e5f8077-dd31-4321-b80d-31dbcf158f33. Cannot find active jobs for it.


In [None]:
spark.stop()