In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Writing to csv").getOrCreate()

In [1]:
from pyspark.sql.types import (StructType, StructField,
                               StringType, IntegerType)

recordSchema = StructType([StructField('date', StringType(), True),
                           StructField('delay', IntegerType(), True),
                           StructField('distance', IntegerType(), True),
                           StructField('origin', StringType(), True),
                           StructField('destination', StringType(), True)])

In [2]:
df = spark.readStream.format("csv") \
    .schema(recordSchema) \
    .load("MyInputStream/")

In [3]:
import pyspark.sql.functions as F

In [4]:
df2 = df.groupBy('destination').agg(F.avg('delay').alias('AverageDelay'))

In [5]:
writerAvg = df2.writeStream.outputMode("complete") \
    .format("csv")  \
    .option("path", "OutStream/") \
    .option("checkpointLocation", "chkpnt") 

In [6]:
query = writerAvg.start()

AnalysisException: Data source csv does not support Complete output mode.

In [None]:
### Data source csv does not support Complete output mode.

In [13]:
df = spark.readStream.format("parquet") \
    .schema(recordSchema) \
    .load("MyInputStream/")

In [7]:
writer = df.writeStream.outputMode("append") \
    .format("csv")  \
    .option("path", "OutStream/") \
    .option("checkpointLocation", "chkpnt") 

In [8]:
writer = df.writeStream.outputMode("append") \
    .format("parquet")  \
    .option("path", "OutStream/") \
    .option("checkpointLocation", "chkpnt")

In [8]:
query = writer.start()
#query.awaitTermination()

24/04/07 14:56:54 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

In [9]:
query.stop()

In [12]:
df = spark.read.csv('OutStream/',header=True,inferSchema=True)
df.show(40)

+-------+---+---+------+-----------+
|   date|_c1|_c2|origin|destination|
+-------+---+---+------+-----------+
|1121215| -5|602|   ABE|        ATL|
|1121725| -1|602|   ABE|        ATL|
|1131215| 14|602|   ABE|        ATL|
|1130600| -7|369|   ABE|        DTW|
|1131725| -6|602|   ABE|        ATL|
|1131230|-13|369|   ABE|        DTW|
|1130625| 29|602|   ABE|        ATL|
|1131219| -8|569|   ABE|        ORD|
|1140600| -9|369|   ABE|        DTW|
|1141725| -9|602|   ABE|        ATL|
|1141230| -8|369|   ABE|        DTW|
|1140625| -5|602|   ABE|        ATL|
|1141219|-10|569|   ABE|        ORD|
|1150600|  0|369|   ABE|        DTW|
|1151725| -6|602|   ABE|        ATL|
|1151230|  0|369|   ABE|        DTW|
|1150625|  0|602|   ABE|        ATL|
|1150607|  0|569|   ABE|        ORD|
|1151219|  0|569|   ABE|        ORD|
|     44| 44| 44|    44|         44|
|     12| 23| 22|    22|         22|
|1011245|  6|602|   ABE|        ATL|
|1020600| -8|369|   ABE|        DTW|
|1021245| -2|602|   ABE|        ATL|
|

24/04/07 14:58:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: date, , , origin, destination
 Schema: date, _c1, _c2, origin, destination
Expected: _c1 but found: 
CSV file: file:///home/hatem/PySpark/Ubuntu_Final_Spark_Intake_43/L5_StructuredStreaming/OutStream/part-00000-803af41e-fb4b-4134-84aa-8d9a7d5546a0-c000.csv
24/04/07 14:58:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: date, , , origin, destination
 Schema: date, _c1, _c2, origin, destination
Expected: _c1 but found: 
CSV file: file:///home/hatem/PySpark/Ubuntu_Final_Spark_Intake_43/L5_StructuredStreaming/OutStream/part-00001-fd8322c4-1213-4f95-b829-a7d38a922196-c000.csv


In [10]:
df = spark.read.parquet('OutStream/')

In [11]:
df.show(40)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|   date| null|    null|origin|destination|
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
|1030605|    0|     602|   ABE|        ATL|
|1041243|   10|     602|   ABE|        ATL|
|1040605|   28|     602|   ABE|        ATL|
|1051245|   88|     602|   ABE|        ATL|
|1050605|    9|     602|   ABE|        ATL|
|1061215|   -6|     602|   ABE|        ATL|
|1061725|   69|     602|   ABE|        ATL|
|1061230|    0|     369|   ABE|        DTW|
|1060625|   -3|     602|   ABE|        ATL|
|1070600|    0|     369|   ABE|        DTW|
|1071725|    0|     602|   ABE|        ATL|
|1071230|    0|     369|   ABE|        DTW|
|1070625|    0|     602|   ABE|        ATL|
|1071219|    0|     569|   ABE| 