# WRITE

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.utils import AnalysisException

# Obtain the SparkSession used by every cell below: getOrCreate() hands back an
# already-running session if there is one, otherwise it starts a new session
# registered under the application name "example-write".
session_builder = SparkSession.builder.appName("example-write")
spark = session_builder.getOrCreate()

In [2]:
# Sample rows, one tuple per person, in the order given by `columns` below.
# Empty strings stand in for missing name parts; dob is plain text, not a date.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Only column names are passed as the schema, so Spark infers each column's
# type from the row values.
df = spark.createDataFrame(data, columns)

In [3]:
# Print the schema of `df` as a tree (column name, type, nullability) to check
# what createDataFrame inferred from the sample rows.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [None]:
 # saves Dataframe as a CSV file
# mode("ignore"): when the target path already exists the write is skipped
# silently and the existing output is left untouched; otherwise the CSV output
# is written to the path.
csv_writer = df.write.format("csv").mode("ignore")
csv_writer.save("../files/spark_output/datacsv")

In [None]:
# Demonstrates the default save mode ("error" / "errorifexists"): writing to a
# path that already exists fails with
#   AnalysisException: path ...datacsv.csv already exists.
# Each attempt is wrapped in its own try/except so the failure is displayed and
# the notebook can still be run top to bottom; previously the first exception
# stopped the cell and the second statement was never reached.

# Shorthand CSV writer with the default mode. This succeeds (and creates the
# output) only when the path does not exist yet.
try:
    df.write.csv("../files/datacsv.csv")
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

# Same write with the mode spelled out explicitly. By this point the path
# exists (either from an earlier run or from the statement above), so this
# attempt is expected to fail.
try:
    df.write.format("csv").mode("error").save("../files/datacsv.csv")
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

The code block displayed below contains an error. The code block is intended to write DataFrame transactionsDf to disk as a parquet file in location /FileStore/transactions_split, using column storeId as key for partitioning. Find the error.
>

Code block:
- `transactionsDf.write.format("parquet").partitionOn("storeId").save("/FileStore/transactions_split")`

In [None]:
# Answer to the question above: the writer method is partitionBy, not partitionOn.
# The question partitions transactionsDf by storeId; this runnable demo applies
# the same call to `df`, using "firstname" as the partition column.
# mode("ignore") skips the write silently if the target path already exists.
partitioned_writer = df.write.format("parquet").mode("ignore").partitionBy("firstname")
partitioned_writer.save("../files/spark_output/data_split.parquet")

In [None]:
# The erroneous code from the question above, reproduced on `df`:
# DataFrameWriter has no `partitionOn` attribute (the method is `partitionBy`),
# so the attribute lookup raises before anything is written:
#   AttributeError: 'DataFrameWriter' object has no attribute 'partitionOn'
# The error is caught and printed so it is shown without halting the notebook.
try:
    df.write.format("parquet").partitionOn("firstname").save("../files/spark_output/data_split.parquet")
except AttributeError as exc:
    print(f"AttributeError: {exc}")

The code block shown below should write DataFrame transactionsDf as a parquet file to path storeDir, using brotli compression and replacing any previously existing file. Choose the answer that correctly fills the blanks in the code block to accomplish this.
>
- `transactionsDf.__1__.format("parquet").__2__(__3__).option(__4__, "brotli").__5__(storeDir)`
>
- `1. save 2. mode 3. "ignore" 4. "compression" 5. path`
- `1. store 2. with 3. "replacement" 4. "compression" 5. path`
- `1. write 2. mode 3. "overwrite" 4. "compression" 5. save`
- `1. save 2. mode 3. "replace" 4. "compression" 5. path`
- `1. write 2. mode 3. "overwrite" 4. compression 5. parquet`

In [5]:
from pyspark.sql.types import *

# Six sample transactions; None marks a missing value. Tuple order matches the
# schema below:
# (transactionId, predError, value, storeId, productId, f, transactionDate).
data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

# Explicit schema instead of inference: every field is nullable; all are
# IntegerType except transactionDate, which is declared as LongType.
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ('transactionId', IntegerType()),
            ('predError', IntegerType()),
            ('value', IntegerType()),
            ('storeId', IntegerType()),
            ('productId', IntegerType()),
            ('f', IntegerType()),
            ('transactionDate', LongType()),
        ]
    ]
)

transactionsDf = spark.createDataFrame(data, schema)

In [None]:
# Correct answer to the question above:
#   write -> mode("overwrite") -> option("compression", "brotli") -> save
# "overwrite" replaces any output already present at the path.
# NOTE(review): this presumably requires a brotli codec to be available to the
# Spark runtime - confirm in the target environment.
brotli_writer = (
    transactionsDf.write
    .format("parquet")
    .mode("overwrite")
    .option("compression", "brotli")
)
brotli_writer.save("../files/spark_output/data_compression.parquet")

In [None]:
# The four WRONG answer options from the question above. Every one of them is
# expected to fail. Each runs in its own try/except that catches exactly the
# expected error type and prints it, so all four failures are shown and the
# cell finishes without raising.
#
# Previously the cell could not run at all: `.with(...)` in option 2 is a
# SyntaxError, which makes Python reject the whole cell before executing any
# statement in it.

# Option 1: a DataFrame has no `save` attribute; the writer is reached via `.write`.
try:
    transactionsDf.save.format("parquet").mode("ignore").option("compression", "brotli").path("../files/spark_output/data_compression.parquet")
except AttributeError as exc:
    print(f"Option 1 -> AttributeError: {exc}")

# Option 2: `with` is a reserved Python keyword, so `.with(...)` cannot be parsed.
# The statement is kept as text and only passed to compile() (never executed),
# which lets the SyntaxError be caught here instead of breaking the cell.
try:
    compile(
        'transactionsDf.store.format("parquet").with("replacement").option("compression", "brotli").path("../files/spark_output/data_compression.parquet")',
        "<option 2>",
        "exec",
    )
except SyntaxError as exc:
    print(f"Option 2 -> SyntaxError: {exc}")

# Option 4: same mistake as option 1 - `save` is looked up on the DataFrame.
try:
    transactionsDf.save.format("parquet").mode("replace").option("compression", "brotli").path("../files/spark_output/data_compression.parquet")
except AttributeError as exc:
    print(f"Option 4 -> AttributeError: {exc}")

# Option 5: `compression` is written without quotes, so Python treats it as an
# undefined variable and raises NameError while evaluating the arguments of
# option(), before any write is attempted.
try:
    transactionsDf.write.format("parquet").mode("overwrite").option(compression, "brotli").parquet("../files/spark_output/data_compression.parquet")
except NameError as exc:
    print(f"Option 5 -> NameError: {exc}")

Which of the following code blocks silently writes DataFrame itemsDf in avro format to location fileLocation if a file does not yet exist at that location?
>
- `itemsDf.write.avro(fileLocation)`
- `itemsDf.write.format("avro").mode("ignore").save(fileLocation)`
- `itemsDf.write.format("avro").mode("errorifexists").save(fileLocation)`
- `itemsDf.save.format("avro").mode("ignore").write(fileLocation)`
- `spark.DataFrameWriter(itemsDf).format("avro").write(fileLocation)`

In [4]:
# Three sample catalogue items, one tuple per item, in the order given by
# `columns` below.
data = [
    (1, 'Thick Coat for Walking in the Snow', 'Sports Company Inc.'),
    (2, 'Elegant Outdoors Summer Dress', 'YetiX'),
    (3, 'Outdoors Backpack', 'Sports Company Inc.'),
]

columns = ["itemId", "itemName", "supplier"]

# Only column names are passed as the schema, so Spark infers the column types.
itemsDf = spark.createDataFrame(data, columns)

In [None]:
# Correct answer to the question above: format("avro") + mode("ignore") writes
# only when nothing exists at the target path and otherwise does nothing, without
# raising.
# NOTE(review): the "avro" source comes from the separate spark-avro module, and
# the SparkSession built in this notebook does not configure it - presumably the
# environment supplies it; confirm.
avro_writer = itemsDf.write.format("avro").mode("ignore")
avro_writer.save("../files/spark_output/data_avro.avro")

In [None]:
# The four WRONG answer options from the question above. Each runs in its own
# try/except that catches the expected error type and prints it, so every
# failure is shown and the cell finishes without raising. Previously the first
# AttributeError stopped the cell and the remaining three statements never ran.

# DataFrameWriter has no `avro` shortcut method; the format must be selected
# with format("avro").
try:
    itemsDf.write.avro("../files/spark_output/data_avro.avro")
except AttributeError as exc:
    print(f"AttributeError: {exc}")

# A DataFrame has no `save` attribute; the writer is reached via `.write`.
try:
    itemsDf.save.format("avro").mode("ignore").write("../files/spark_output/data_avro.avro")
except AttributeError as exc:
    print(f"AttributeError: {exc}")

# SparkSession does not expose `DataFrameWriter` as an attribute.
try:
    spark.DataFrameWriter(itemsDf).format("avro").write("../files/spark_output/data_avro.avro")
except AttributeError as exc:
    print(f"AttributeError: {exc}")

# Syntactically valid, but not "silent": mode("errorifexists") raises an
# AnalysisException when output already exists at the target path.
try:
    itemsDf.write.format("avro").mode("errorifexists").save("../files/spark_output/data_avro.avro")
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")


In [10]:
import pandas as pd

# Export both Spark DataFrames into a single Excel workbook, one worksheet each.
# toPandas() collects a Spark DataFrame into a pandas DataFrame, which is then
# written through the shared ExcelWriter; the workbook is finalised when the
# `with` block exits.
with pd.ExcelWriter('../files/excel_output.xlsx', engine='xlsxwriter') as excel_writer:
    transactions_pdf = transactionsDf.toPandas()
    transactions_pdf.to_excel(excel_writer, sheet_name='Sheet_Transaction')

    items_pdf = itemsDf.toPandas()
    items_pdf.to_excel(excel_writer, sheet_name='Sheet_Items')