In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
# Load AWS credentials from the local dl.cfg file and export them as
# environment variables of this process.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check the result to fail with a clear message
# instead of an opaque KeyError: 'default' on the lookups below.
if not config.read('dl.cfg'):
    raise FileNotFoundError(
        "Could not read 'dl.cfg'; it must define AWS_ACCESS_KEY_ID and "
        "AWS_SECRET_ACCESS_KEY under a [default] section"
    )

os.environ['AWS_ACCESS_KEY_ID']=config['default']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['default']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Create the Spark session (or reuse the active one), requesting the
# hadoop-aws package; the notebook reads from s3a:// paths below.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [4]:
# Root of the source bucket, and the glob for the song files nested four
# levels below song_data/.
input_data = "s3a://udacity-dend/"
song_data = "{}{}".format(input_data, 'song_data/*/*/*/*')
# Echo the resolved glob as the cell output.
song_data

's3a://udacity-dend/song_data/*/*/*/*'

In [5]:
from pyspark.sql.types import (
    StructType as R,
    StructField as Fld,
    DoubleType as Dbl,
    StringType as Str,
    IntegerType as Int,
    DateType as Date,
)

# Explicit schema for the song JSON files: one (column name, Spark type) pair
# per field, in the order the columns should appear. Every field keeps
# StructField's default nullability.
song_schema = R([
    Fld(field_name, field_type())
    for field_name, field_type in (
        ("artist", Str),
        ("artist_id", Str),
        ("artist_latitude", Dbl),
        ("artist_location", Str),
        ("artist_longitude", Dbl),
        ("artist_name", Str),
        ("duration", Dbl),
        ("num_songs", Int),
        ("song_id", Str),
        ("title", Str),
        ("year", Int),
    )
])

In [6]:
# Load every JSON file matched by the song_data glob, applying the explicit
# song_schema instead of letting Spark infer one.
df = spark.read.json(song_data, schema=song_schema)

In [7]:
# Print the column names, types and nullability of the song DataFrame.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [8]:
# extract columns to create songs table
cols = ['song_id','title', 'artist_id', 'year','duration']

# Keep only the song columns, then collapse rows that are identical across
# all of them.
songs_table = df.select(*cols).dropDuplicates()

# Preview the first five rows as a pandas DataFrame.
songs_table.limit(5).toPandas()

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOUJLXS12A3F1ECF59,Catholic Knees (Album Version),ARG9CID1187B9AB010,2009,168.25424
1,SOJWWEV12A8C136B97,Last Rose Of Summer/Walking In The Air,AR4FBUO1187B9B6072,2005,260.04853
2,SOAYZDX12A8C13B9C1,Did You Ever Go Steady,ARNUFGE1187B9B7881,1963,138.762
3,SOBGZZO12A6701CD25,Keep The Circle Around,AR9AM2N1187B9AD2F1,1999,230.05995
4,SOMCMZX12A67AE0221,The Old Sod,ARBW7IC1187B9908CE,1990,202.63138


In [None]:
# Print the column names, types and nullability of songs_table.
songs_table.printSchema()

In [None]:
# write songs table to parquet files partitioned by year and artist
# Shuffle rows so that each (year, artist_id) combination lives in a single
# Spark partition; each output directory is then written by one task rather
# than as many small files.
songs_table = songs_table.repartition('year','artist_id')
# partitionBy() is what lays the files out in year=.../artist_id=...
# directories; repartition() alone only changes the in-memory distribution.
# The URI uses the standard "s3a://" scheme separator.
songs_table.write.partitionBy('year', 'artist_id').parquet("s3a://sparkify-datawharehouse23/songs_table/songs_table.parquet")

In [None]:
# Glob matching every entry directly under log_data/ in the udacity-dend bucket.
log_data = "s3a://udacity-dend/log_data/*"

In [None]:
# read log data file(s) matched by the log_data glob (schema is inferred by Spark).
# NOTE: this rebinds `df`, which until now held the song data.
df = spark.read.json(log_data)