## Project test notebook 
Used to build the ETL with a small, local data set.

### imports

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

### read in AWS credentials

In [2]:
# Load the AWS credentials from the local config file and export them as
# environment variables for the rest of the notebook.
config = configparser.ConfigParser()
# open() + read_file() fails immediately if the file is missing.
# ConfigParser.read() would silently skip a missing file, and the lookups
# below would then fail with a less helpful NoSectionError.
with open('../dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ['AWS_ACCESS_KEY_ID'] = config.get('AWS', 'AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

### path to data files

In [38]:
# Relative locations of the local sample copies of the two input datasets.
song_data_dir = '../sample_data/song_data'
log_data_dir = '../sample_data/log_data'

In [55]:
# Glob pattern for the song files: every .json file exactly three directory
# levels below song_data_dir.
song_files = f'{song_data_dir}/*/*/*/*.json'

### create a spark session

In [4]:
def create_spark_session(jars_packages=None):
    """Return a SparkSession obtained from ``SparkSession.builder.getOrCreate()``.

    Args:
        jars_packages: Optional value for the ``spark.jars.packages`` setting,
            e.g. ``"org.apache.hadoop:hadoop-aws:2.7.0"``. When left as
            ``None`` (the default) no extra packages are configured.

    Returns:
        The session object returned by ``getOrCreate()``.
    """
    builder = SparkSession.builder
    if jars_packages is not None:
        # Only touch the builder configuration when a package list was requested.
        builder = builder.config("spark.jars.packages", jars_packages)
    return builder.getOrCreate()


### get a spark session

In [5]:
# Session handle used by every read and SQL query in the cells below.
spark = create_spark_session()

### read in log data

In [16]:
# Read the JSON log data into a DataFrame; log_data_dir is passed as a plain
# directory path (no glob pattern).
log_df = spark.read.json(log_data_dir)

In [60]:
# Print the column names and types of the log DataFrame.
log_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [23]:
# Register the log DataFrame as the temp view `events` so it can be queried via spark.sql.
log_df.createOrReplaceTempView('events')

In [59]:
# Sanity check: show the number of rows in the `events` view.
spark.sql('select count(*) from events').show()

+--------+
|count(1)|
+--------+
|    8056|
+--------+



### read in song data

In [56]:
# Read every JSON file matched by the song_files glob pattern into a DataFrame.
song_df = spark.read.json(song_files)

In [61]:
# Print the column names and types of the song DataFrame.
song_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [57]:
# Register the song DataFrame as the temp view `songs` so it can be queried via spark.sql.
song_df.createOrReplaceTempView('songs')

In [58]:
# Sanity check: show the number of rows in the `songs` view.
spark.sql('select count(*) from songs').show()

+--------+
|count(1)|
+--------+
|      71|
+--------+

