## Imports

In [6]:
import configparser
import os
import itertools 
import time

import boto3
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# from pyspark.sql.streaming import DataStreamReader

In [7]:
start_time = time.time()

## Get AWS credentials and region

In [10]:
# Read AWS creds
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

# Set credentials
KEY = config.get('AWS','KEY')
SECRET = config.get('AWS','SECRET')
REGION_NAME = config.get('AWS','REGION_NAME')

# # Set creds as env variables: https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html
# # env variables have the highest preference in the credential chain
# os.environ["AWS_ACCESS_KEY_ID"] = KEY
# os.environ["AWS_SECRET_ACCESS_KEY"] = SECRET
# os.environ["REGION_NAME"] = REGION_NAME

# Alternatively, set hadoop configuration
sc = SparkContext()
sc.setLogLevel("DEBUG")
hadoopConf = sc._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.access.key", KEY)
hadoopConf.set("fs.s3a.secret.key", SECRET)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-9-3de38d07304d>:17 

## Initialize a Spark Session 

In [12]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [13]:
INPUT_BUCKET='s3a://udacity-dend/'
def process_log_data(spark):    
    # get filepath to log data files    
    input_log_data = os.path.join(INPUT_BUCKET, 'log_data')   
    # read log data file    
    log_schema = build_log_schema()   
    log_df = spark.read.json(input_log_data, schema=log_schema)
def build_log_schema():    
    schema = StructType(        
     [            
      StructField('artist', StringType(), True),            
      StructField('auth', StringType(), True),            
      StructField('firstName', StringType(), True),            
      StructField('gender', StringType(), True),            
      StructField('itemInSession', IntegerType(), True),            
      StructField('lastName', StringType(), True),            
      StructField('length', DecimalType(), True),            
      StructField('level', StringType(), True),            
      StructField('location', StringType(), True),            
      StructField('method', StringType(), True),            
      StructField('page', StringType(), True),            
      StructField('registration', LongType(), True),            
      StructField('sessionId', IntegerType(), True),            
      StructField('song', StringType(), True),            
      StructField('status', IntegerType(), True),            
      StructField('ts', LongType(), True),            
      StructField('userAgent', StringType(), True),            
      StructField('userId', StringType(), True)        
     ]    
    )   
    return schema

In [15]:
log_df = process_log_data(spark)

In [16]:
log_df.head()

AttributeError: 'NoneType' object has no attribute 'head'

## Create boto s3 client

In [5]:
s3 = boto3.resource('s3',
                       region_name=REGION_NAME,
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

## Set AWS bucket and prefixes

In [6]:
BUCKET = "udacity-dend"
EVENT_DIR = 'log-data'
SONG_DIR = 'song-data'

In [7]:
dataBucket =  s3.Bucket(BUCKET)
log_objects = dataBucket.objects.filter(Prefix=EVENT_DIR)
song_objects = dataBucket.objects.filter(Prefix=SONG_DIR)

## List some bucket objects

In [8]:
for obj in itertools.islice(log_objects, 6):
    if obj.key != 'log-data/':
        print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='log-data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log-data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log-data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log-data/2018/11/2018-11-04-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log-data/2018/11/2018-11-05-events.json')


In [9]:
for obj in itertools.islice(song_objects, 5):
    if obj.key != 'song-data/':
        print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/A/A/TRAAAAK128F9318786.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/A/A/TRAAAAV128F421A322.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/A/A/TRAAABD128F429CF47.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/A/A/TRAAACN128F9355673.json')


In [10]:
songs_path = f'udacity-dend/song-data/A/A/A/*.json'
logs_path = f'udacity-dend/log-data/2018/11/2018-11-0*-events.json'

## Read from S3

In [11]:
songs_df = spark.read \
    .format("json") \
    .option("mode", "FAILFAST") \
    .option("inferSchema", "true") \
    .load(f's3a://{songs_path}')

In [12]:
events_df = spark.read \
    .format("json") \
    .option("mode", "FAILFAST") \
    .option("inferSchema", "true") \
    .load(f's3a://{logs_path}')

In [13]:
songs_df.createOrReplaceTempView("songs")
events_df.createOrReplaceTempView("events")

In [14]:
songs_df.head()

Row(artist_id='ARTC1LV1187B9A4858', artist_latitude=51.4536, artist_location="Goldsmith's College, Lewisham, Lo", artist_longitude=-0.01802, artist_name='The Bonzo Dog Band', duration=301.40036, num_songs=1, song_id='SOAFBCP12A8C13CC7D', title='King Of Scurf (2007 Digital Remaster)', year=1972)

In [15]:
events_df.head()

Row(artist='A Fine Frenzy', auth='Logged In', firstName='Anabelle', gender='F', itemInSession=0, lastName='Simpson', length=267.91138, level='free', location='Philadelphia-Camden-Wilmington, PA-NJ-DE-MD', method='PUT', page='NextSong', registration=1541044398796.0, sessionId=256, song='Almost Lover (Album Version)', status=200, ts=1541377992796, userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='69')