## Project test notebook 
Used to build the ETL with a small, local data set.

### imports

In [1]:
import configparser
from datetime import datetime
import os
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import to_timestamp
from pyspark.sql import functions as f
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.functions import monotonically_increasing_id, md5
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql import types as t

### read in aws credentials

In [2]:
# Load AWS credentials from the local config file and export them as
# environment variables for this process.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of later with a
# confusing NoSectionError.
if not config.read('../dl.cfg'):
    raise FileNotFoundError("could not read AWS credentials file '../dl.cfg'")

os.environ['AWS_ACCESS_KEY_ID'] = config.get('AWS', 'AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

### create a spark session

In [3]:
def create_spark_session():
    """Return the active SparkSession, creating one with default settings if none exists."""
    # No extra packages are configured for this local run. The hadoop-aws
    # option below was left disabled:
    #   .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    session = SparkSession.builder.getOrCreate()
    return session


### get a spark session

In [4]:
# Obtain the Spark session that the following cells use to read the sample data.
spark = create_spark_session()

### read in song data

In [5]:
# Song files sit three directory levels below the song_data root; the glob
# pattern picks up every JSON file at that depth and loads them into a single
# DataFrame.
song_data_dir = '../sample_data/song_data'
song_files = song_data_dir + '/*/*/*/*.json'
song_df = spark.read.json(song_files)

In [6]:
# Show the schema Spark inferred from the song JSON files.
song_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



### extract columns to create songs table
### write songs table to parquet files partitioned by year and artist

In [7]:
# Songs table: keep only the song-level columns and write them as parquet,
# partitioned by year and artist_id. Any existing output is replaced.
(
    song_df
    .select('song_id', 'title', 'artist_id', 'year', 'duration')
    .write
    .partitionBy('year', 'artist_id')
    .mode("overwrite")
    .parquet('songs.parquet')
)

### extract columns to create artists table
### write artists table to parquet files

In [8]:
# Artists table: keep only the artist-level columns and write them as parquet.
# Any existing output is replaced.
(
    song_df
    .select('artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude')
    # song_df rows are song-level (each carries a song_id), so an artist with
    # several songs would otherwise be written several times. Keep one row per
    # artist_id; which of the duplicate rows survives is not defined.
    .dropDuplicates(['artist_id'])
    .write
    .mode("overwrite")
    .parquet('artists.parquet')
)

### get filepath to log data file

In [9]:
# Relative path of the directory holding the sample log data.
log_data_dir =  '../sample_data/log_data'

### read log data file

### filter by actions for song plays

In [32]:
# Load the JSON event logs and keep only song-play events, i.e. rows whose
# page column equals 'NextSong'.
log_df = (
    spark.read.json(log_data_dir)
    .where(col('page') == 'NextSong')
)

In [33]:
# Number of rows in log_df (the 'NextSong' events kept by the filter).
log_df.count()

6820

In [34]:
# Show the schema Spark inferred from the log JSON files.
log_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



### extract columns for users table    
### write users table to parquet files

In [13]:
# Users table: one row per distinct user record seen in song-play events,
# written as parquet. Any existing output is replaced.
(
    log_df
    # Idempotent if log_df is already filtered; kept so this cell gives the
    # same result even when run against an unfiltered log_df.
    .filter("page = 'NextSong'")
    .select('userId', 'firstName', 'lastName', 'gender', 'level')
    # Every event repeats the user's attributes, so drop exact duplicate rows.
    # A user whose level changed keeps one row per level.
    .dropDuplicates()
    .write
    .mode("overwrite")
    .parquet('users.parquet')
)

### create timestamp column from original timestamp column
### create datetime column from original timestamp column
### extract columns to create time table
### write time table to parquet files partitioned by year and month
### read in song data to use for songplays table
### extract columns from joined song and log datasets to create songplays table 
### write songplays table to parquet files partitioned by year and month

In [14]:
@udf
def ts_from_epoch(epoch):
    """Format an epoch timestamp in milliseconds as a 'YYYY-MM-DD HH:MM:SS' UTC string.

    Args:
        epoch: Milliseconds since the Unix epoch, or None.

    Returns:
        The formatted UTC time string, or None when ``epoch`` is None so that a
        null input produces a null output instead of failing the Spark task.
    """
    if epoch is None:
        return None
    # time.gmtime() expects seconds, so scale the millisecond value down.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(epoch / 1000))

In [35]:
log_df.select(
    to_timestamp(ts_from_epoch("ts"), "yyyy-MM-dd HH:mm:ss") \
    .alias("ts")) \
    .select("ts", hour("ts").alias("hour"), \
            dayofmonth("ts").alias("day"),  \
            weekofyear("ts").alias("week"), \
            month("ts").alias("month"),\
            year("ts").alias("year"), \
            date_format('ts', 'w').alias("weekday")) \
    .withColumnRenamed("ts", "start_time") \
    .write \
    .mode("overwrite") \
    .parquet('users.parquet')

In [None]:
time = spark.read.parquet('')