In [None]:
"""
                        Access Data on AWS S3
1. Create user from IAM service
2. Give policy to user "S3 Access"
3. create S3 bucket and folder inside it
4. Upload data into S3 folder     
5. Mount from AWS S3 buckets into local databricks files
means we can access data in S3 from local databricks folders (DBFS)

6. Load stream data As Auto Loader format to incrementaly read data
7. make processing on streaming data
8. upload new file to S3 & monitor results

9. Create, register & call UDFs (user defined functions)
"""

In [None]:
dbutils.fs.mkdirs("dbfs:/FileStore/shared_uploads/S3_dataset/")

Out[1]: True

In [None]:
dbutils.fs.ls("dbfs:/FileStore/shared_uploads/S3_dataset/")

Out[2]: [FileInfo(path='dbfs:/FileStore/shared_uploads/S3_dataset/tasneem_databricks_accessKeys.csv', name='tasneem_databricks_accessKeys.csv', size=99, modificationTime=1693476059000)]

In [None]:
# read S3 credantials file & uploaded it
aws_crediantials = spark.read.format("csv")\
    .option("header", "True")\
        .option("inferSchema", "True")\
            .option("sep", ",")\
                .load("dbfs:/FileStore/shared_uploads/S3_dataset/tasneem_databricks_accessKeys.csv")

display(aws_crediantials)

Access key ID,Secret access key
AKIAV7AP62A2RAFYUOEZ,9A+5Nz6PhUBLUIGB2Bb/j+wJ0uiMZe9b6vkMIDil


In [None]:
#read access key
access_key = aws_crediantials.select('Access key ID').collect()[0]['Access key ID']
access_key

In [None]:
#read secret key
secret_key = aws_crediantials.select('Secret access key').collect()[0]['Secret access key']
secret_key

In [None]:
import urllib
#encode/parse/ escape special chars in secret key
encode_sec_key = urllib.parse.quote(secret_key, "")
encode_sec_key

In [None]:

# when you store data in local databricks files from extrernal source, should dest path start with /mnt
aws_s3_bucket = "databricks1stbucket" #name of S3 bucket
mnt_dest_path = "/mnt/databricks_bucket" #dest in local databricks where we want to load data 
sourceURI = "s3n://{0}:{1}@{2}".format(access_key, encode_sec_key, aws_s3_bucket)

# mount S3 bucket to local databricks files
dbutils.fs.mount(sourceURI, mnt_dest_path)

Out[19]: True

In [None]:
%fs

ls "/mnt/databricks_bucket/databricks_files/"

path,name,size,modificationTime
dbfs:/mnt/databricks_bucket/databricks_files/car_ad_01.csv,car_ad_01.csv,1112,1693472792000
dbfs:/mnt/databricks_bucket/databricks_files/car_ad_02.csv,car_ad_02.csv,1101,1693474803000


In [None]:
car_stream_data = spark.readStream.format("cloudFiles")\
    .option("cloudFiles.format", "csv")\
        .option("cloudFiles.schemaLocation", "dbfs:/mnt/databricks_bucket/databricks_files/")\
            .option("cloudFiles.schemaHints", "price float, mileage int, engV float, year int")\
                .load("dbfs:/mnt/databricks_bucket/databricks_files/")

display(car_stream_data)

car,price,body,mileage,engV,engType,registration,year,model,drive,_rescued_data
Ford,15500.0,crossover,68,2.5,Gas,yes,2010,Kuga,full,
Mercedes-Benz,20500.0,sedan,173,1.8,Gas,yes,2011,E-Class,rear,
Mercedes-Benz,35000.0,other,135,5.5,Petrol,yes,2008,CL 550,rear,
Mercedes-Benz,17800.0,van,162,1.8,Diesel,yes,2012,B 180,front,
Nissan,16600.0,crossover,83,2.0,Petrol,yes,2013,X-Trail,full,
Honda,6500.0,sedan,199,2.0,Petrol,yes,2003,Accord,front,
Renault,10500.0,vagon,185,1.5,Diesel,yes,2011,Megane,front,
Mercedes-Benz,21500.0,sedan,146,1.8,Gas,yes,2012,E-Class,rear,
Mercedes-Benz,22700.0,sedan,125,2.2,Diesel,yes,2010,E-Class,rear,
Nissan,20447.154,crossover,0,1.2,Petrol,yes,2016,Qashqai,front,


In [None]:
car_stream_data.filter("car = 'Nissan' ").display()

car,price,body,mileage,engV,engType,registration,year,model,drive,_rescued_data
Nissan,16600.0,crossover,83,2.0,Petrol,yes,2013,X-Trail,full,
Nissan,20447.154,crossover,0,1.2,Petrol,yes,2016,Qashqai,front,
Nissan,16600.0,crossover,83,2.0,Petrol,yes,2013,X-Trail,full,
Nissan,26033.553,crossover,0,1.6,Diesel,yes,2016,X-Trail,full,
Nissan,13980.0,hatch,31,,Other,yes,2013,Leaf,front,"{""engV"":""NA"",""_file_path"":""dbfs:/mnt/databricks_bucket/databricks_files/car_ad_02.csv""}"
Nissan,17300.0,hatch,24,,Other,yes,2013,Leaf,front,"{""engV"":""NA"",""_file_path"":""dbfs:/mnt/databricks_bucket/databricks_files/car_ad_02.csv""}"
Nissan,13275.0,hatch,12,,Other,yes,2013,Leaf,front,"{""engV"":""NA"",""_file_path"":""dbfs:/mnt/databricks_bucket/databricks_files/car_ad_03.csv""}"
Nissan,20241.896,crossover,0,1.6,Petrol,yes,2015,Juke,front,
Nissan,17000.0,hatch,38,,Other,yes,2013,Leaf,front,"{""engV"":""NA"",""_file_path"":""dbfs:/mnt/databricks_bucket/databricks_files/car_ad_03.csv""}"


In [None]:
car_stream_data.groupBy("car", "year").avg("price")\
    .display()

car,year,avg(price)
Jaguar,2008,18777.0
Land Rover,2016,0.0
Mercedes-Benz,2011,23200.0
BMW,2015,63000.0
Mitsubishi,2006,9200.0
Mercedes-Benz,2016,74499.16666666667
BMW,2006,19999.0
Audi,2012,37000.0
Chrysler,2008,13700.0
Ford,2010,15500.0


In [None]:
"""
    premium_price
if price > 20K , then return True
otherwise, return false
"""
@udf
def premium_price(price):
    return price > 20000

In [None]:
#return new dataframe with new column 'premium'
# with calling "premium_price" function and pass price to it 
car_stream_data.withColumn("premium", premium_price('price'))\
    .select("car", "price", "year", "premium")\
        .display()

car,price,year,premium
Audi,37000.0,2012,True
Mercedes-Benz,200000.0,2013,True
Audi,3850.0,2002,False
Nissan,13275.0,2013,False
Mercedes-Benz,20400.0,2011,True
BMW,1900.0,1997,False
BMW,39333.0,2016,True
Mercedes-Benz,99999.0,2016,True
Mercedes-Benz,70999.0,2016,True
BMW,63500.0,2016,True


In [None]:
"""
    like_new:
if year > 2010, return true
otherwise, return false
"""
@udf
def like_new(year):
    return year > 2010

In [None]:
# register the function in spark
spark.udf.register("likeNewUDF", like_new)

Out[36]: <function __main__.like_new(year)>

In [None]:
#create temp table avilable only in this notebook
car_stream_data.createOrReplaceTempView("car_stream_table")

In [None]:
%sql

select car, body, price, year, likeNewUDF(year) as new_car from car_stream_table;

car,body,price,year,new_car
Audi,sedan,37000.0,2012,True
Mercedes-Benz,van,200000.0,2013,True
Audi,vagon,3850.0,2002,False
Nissan,hatch,13275.0,2013,True
Mercedes-Benz,sedan,20400.0,2011,True
BMW,sedan,1900.0,1997,False
BMW,sedan,39333.0,2016,True
Mercedes-Benz,crossover,99999.0,2016,True
Mercedes-Benz,crossover,70999.0,2016,True
BMW,crossover,63500.0,2016,True
