# In [None]:
# Point findspark at the local Spark installation. This call is made before
# the pyspark imports further down; according to findspark's documentation,
# init() is what makes the pyspark package importable, so keep this order.
import findspark, os
findspark.init('/home/hadoop/spark')

# Enable horizontal scroll: inject a CSS rule so that <pre> output is not
# line-wrapped in the rendered notebook.
# NOTE(review): `display` is not imported anywhere in this file; it is
# presumably supplied by the IPython/Jupyter environment — confirm before
# running this as a plain Python script.
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import col, expr, explode, concat_ws

# Initialize Spark.
# getOrCreate() reuses a session/context that is already running in this
# process instead of raising a ValueError about multiple SparkContexts when
# the code is executed again, so no manual sc.stop() is needed beforehand.
spark = SparkSession.builder.getOrCreate()
# Keep the `sc` name available for code that uses the SparkContext directly.
sc = spark.sparkContext

# Date partition (YYYY/MM/DD) shared by the input and the output location, so
# the two paths cannot drift apart when a different day is processed.
PARTITION_DATE = '2021/04/02'
RAW_CARDS_PATH = f'/user/hadoop/mtg/raw/{PARTITION_DATE}'
FINAL_CARDS_PATH = f'/user/hadoop/mtg/final/{PARTITION_DATE}'

# Read raw cards from HDFS
mtg_cards_df = spark.read.json(RAW_CARDS_PATH)

# Explode the 'cards' array into one row per element, then promote the
# fields of each element to top-level columns.
mtg_cards_exploded_df = (
    mtg_cards_df
    .select(explode('cards').alias('exploded'))
    .select('exploded.*')
)

# Replace null values with empty strings.
# NOTE: per the PySpark docs, fill('') with a string value only affects
# string-typed columns; columns of other types are left untouched.
mtg_cards_renamed_null_df = mtg_cards_exploded_df.na.fill('')

# Keep only the columns needed downstream
columns = ['name', 'subtypes', 'text', 'flavor', 'artist', 'multiverseid', 'imageUrl']
reduced_cards_df = mtg_cards_renamed_null_df.select(*columns)

# Flatten the subtypes from an array to a comma separated string
flattened_subtypes_df = reduced_cards_df.withColumn(
    'subtypes', concat_ws(', ', 'subtypes')
)

# Write data to HDFS as JSON, replacing any previous output for this partition
(
    flattened_subtypes_df.write
    .format('json')
    .mode('overwrite')
    .save(FINAL_CARDS_PATH)
)