# Spark 2: Listen to a storage bucket for uploads with Spark

In this lesson, we modify the previous PySpark implementation so that it listens for uploaded files and automatically loads them into the database.

## Step 1: Execute the code

In [None]:
# Imports
import collections
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext

# Load generic helper functions
%run ../../common/jupyter.ipynb
import src_common_database as db
%run ../../common/spark.ipynb
import src_common_util as util

# Use storage bucket defined with environment variables
# NOTE(review): `os` is not imported in this cell; presumably it is brought into
# scope by the %run of ../../common/jupyter.ipynb above. Confirm.
# The lookup raises KeyError if STORAGE_BUCKET_URL is not set.
bucket = os.environ['STORAGE_BUCKET_URL']
# NOTE(review): `st` is presumably defined by the %run of ../../common/spark.ipynb,
# and `sc` by the notebook kernel. Confirm.
# The returned value is prepended to the bucket URL when the stream path is built below.
protocol = st.init_spark(sc)

In [None]:
# Function for saving data to database
def save_data(rdd):
    """Save one streamed micro-batch of parsed CSV rows to the database.

    The first row of the batch is treated as the header. If its column names
    match the sales file layout (in any order), the remaining rows are mapped
    to the ``load_sales`` schema and appended to that table over JDBC.
    Otherwise an error is printed to stderr and nothing is written.

    Args:
        rdd: RDD whose elements are lists of string values, one list per CSV
            row, with the header row included.

    Returns:
        None. Empty batches are ignored silently.
    """
    # Imported locally so the error path below does not depend on another
    # notebook having put `sys` into the global namespace.
    import sys

    if rdd.isEmpty():
        return

    header = rdd.first()

    # Compare as multisets so that the column order in the file does not matter.
    sales_columns = ["Date", "Product", "Order", "Quantity", "Price"]
    if collections.Counter(header) != collections.Counter(sales_columns):
        print("ERROR: Could not parse file with header: " + ','.join(header), file=sys.stderr)
        return

    # Build the DataFrame only once the header is known to be supported.
    # Every row equal to the header is dropped, not just the first one.
    df = rdd.filter(lambda row: row != header).toDF(header)

    # Change dataframe schema to match the load_sales database view and
    # generate unique key by concatenating order number and product SKU
    table_name = "load_sales"
    db_df = df.select(
        concat(col("Order"), lit("."), col("Product")).alias("key"),
        col("Date").alias("date_key"),
        col("Product").alias("product_key"),
        col("Order").alias("order_number"),
        col("Quantity").alias("quantity").cast(IntegerType()),
        col("Price").alias("price").cast(FloatType())
    )

    # Write the data to database
    db_df.write.mode("append").jdbc(db.get_jdbc_url(), table_name, properties=db.get_jdbc_options())
    # DEBUG: Show saved data
    print("INFO: Saved data to database table " + table_name)
    db_df.show()

In [None]:
# Set streaming context with 10 second interval
ssc = StreamingContext(sc, 10)

# Read new files from bucket as a stream and parse each non-blank CSV row into a list of whitespace-trimmed values.
# Blank and whitespace-only lines are dropped with an explicit predicate, so they never become [''] rows.
# NOTE: textFileStream does not support pathGlobFilter or recursiveFileLookup
stream_data = ssc.textFileStream(
    protocol + bucket + "/sales"
).filter(
    lambda line: line.strip() != ""
).map(
    lambda line: [value.strip() for value in line.split(',')]
)

# Save data to database
stream_data.foreachRDD(save_data)

# Start streaming until terminated by user
ssc.start()
try:
    ssc.awaitTermination()
finally:
    # Stop only the streaming context (the interrupt still propagates) so that
    # `sc` stays usable and this cell can be re-run without restarting the kernel.
    ssc.stop(stopSparkContext=False)

## Step 2: Add a file to the storage bucket

- Execute `taito open bucket` on the command line to open the locally running bucket in a web browser.
  - TIP: You can alternatively use `taito open bucket:ENV` to connect to a non-local bucket (ENV is `dev`, `test`, `stag`, or `prod`).
- Sign in with access key `minio` and secret key `secret1234`.
- Create a folder named `sales` and upload the `Sales.csv` file to the folder. If the file already exists, just overwrite it.
- In a few seconds you should see a notification that new data was saved to the database.
- Stop the execution by pressing the stop button in the JupyterLab web user interface. You can ignore the `KeyboardInterrupt` error message.

## Next lesson: [Spark 3 - Analyze data with Spark](03.ipynb)