# Spark sample showing read/write methods
In this sample notebook, we will read CSV file(s) from HDFS, write them out as Parquet and ORC file(s), and save the corresponding Hive table definitions.

In [None]:
# Column names to assign, in order, to the clickstream CSV fields
clickstream_columns = [
    "wcs_click_date_sk",
    "wcs_click_time_sk",
    "wcs_sales_sk",
    "wcs_item_sk",
    "wcs_web_page_sk",
    "wcs_user_sk",
]

# Load the clickstream CSV file(s) into a Spark data frame, letting Spark
# infer the column types, then rename the columns
results = (
    spark.read
    .option("inferSchema", "true")
    .csv('/clickstream_data')
    .toDF(*clickstream_columns)
)

# Inspect the inferred schema and the top rows
results.printSchema()
results.show()

In [None]:
# Turn off Hadoop's job-success marker file (_SUCCESS) for the writes below
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

# Show the current warehouse directory, where the saved table files will be stored
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir")
print(warehouse_dir)

# Persist `results` once per storage format; each saveAsTable call also
# creates the Hive table, replacing any existing table of the same name
for fmt, table in (
    ("parquet", "web_clickstreams"),
    ("orc", "web_clickstreams_orc"),
):
    results.write.format(fmt).mode("overwrite").saveAsTable(table)

In [None]:
# Column names to assign, in order, to the product review CSV fields
review_columns = ["pr_review_sk", "pr_review_content"]

# Load the product review CSV files into a Spark data frame, letting Spark
# infer the column types, then rename the columns.
# NOTE: this rebinds `results`, so cells run after this one see the review data.
results = (
    spark.read
    .option("inferSchema", "true")
    .csv('/product_review_data')
    .toDF(*review_columns)
)

# Inspect the inferred schema and the top rows
results.printSchema()
results.show()

In [None]:
# Persist the current `results` data frame once per storage format; each
# saveAsTable call also creates the Hive table, replacing any existing
# table of the same name
for fmt, table in (
    ("parquet", "product_reviews"),
    ("orc", "product_reviews_orc"),
):
    results.write.format(fmt).mode("overwrite").saveAsTable(table)