In [1]:
# Pull every credential we need out of the "Analysts" secret scope
def _analyst_secret(secret_name):
    """Return the value stored under `secret_name` in the "Analysts" secret scope."""
    return dbutils.secrets.get(scope = "Analysts", key = secret_name)

ServicePrincipalID = _analyst_secret("SPID")
ServicePrincipalKey = _analyst_secret("SPKey")
DirectoryID = _analyst_secret("DirectoryID")
# The database credentials are not used in this cell - presumably later cells need them
DBUser = _analyst_secret("DBUser")
DBPassword = _analyst_secret("DBPword")


# Slot the DirectoryID into the OAuth2 token endpoint URL
Directory = "https://login.microsoftonline.com/{}/oauth2/token".format(DirectoryID)

# ADLS Gen 2 connection settings, authenticating as our service principal
_adls_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": ServicePrincipalID,
    "fs.azure.account.oauth2.client.secret": ServicePrincipalKey,
    "fs.azure.account.oauth2.client.endpoint": Directory,
}

# Apply each setting to the Spark session, in the order listed above
for _setting_name, _setting_value in _adls_oauth_settings.items():
    spark.conf.set(_setting_name, _setting_value)

### Data Transformations
The kinds of transformations we might do here are exactly those that we would do in any other ETL system. We can add calculated columns, strip out columns we don't need, rename columns to be more user-friendly, and look up reference data.

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Schema for the Taxi data we're about to load: (column name, data type) pairs,
# every column is declared nullable
_fact_columns = [
  ("Dispatching_base_num", StringType()),
  ("Pickup_Datetime", TimestampType()),
  ("DropOff_datetime", TimestampType()),
  ("PULocationID", IntegerType()),
  ("DOlocationID", IntegerType()),
]
factSchema = StructType(
  [StructField(column_name, column_type, True) for column_name, column_type in _fact_columns]
)

# Build the Taxi DataFrame: the CSV has a header row, and we apply our explicit
# schema (factSchema) rather than letting Spark infer one
_taxi_reader = spark.read.option("header","true")
_taxi_reader = _taxi_reader.schema(factSchema)
factdf = _taxi_reader.csv("/mnt/taxi/taxiFull/SmallSlice.csv")

# Build the lookup DataFrame from our cleaned Taxi Zone data (stored as parquet)
_taxi_zones_path = "abfss://root@dblake.dfs.core.windows.net/BASE/Public/TaxiZones/v1/parquet/"
lookupdf = spark.read.parquet(_taxi_zones_path)

We now have a very large dataset with some IDs, and a lookup file that provides more information about those IDs. In this case, we're preparing a file for further analysis, so we want to denormalise those useful lookup attributes onto the fact table so we can query it as one efficient dataset.

First, let's look up the Pickup Location. We'll use an inner join, which will trim down our data somewhat, but we're only interested in those rides where we have this information.

Once we've joined the DataFrames, we'll have a new structure that contains the superset of columns. We'll do some column renaming to make it clear where those columns came from.

In [5]:
# Match each ride's pickup location ID against the lookup's LocationID
_pickup_match = factdf["PULocationID"] == lookupdf["LocationID"]

# Join the fact DataFrame to the lookup (no join type given, so the default inner join
# applies), then drop the lookup key and prefix the lookup columns with "Pickup" so
# it's clear which location they describe
joindf = (factdf
           .join(lookupdf, _pickup_match)
           .drop("LocationID")
           .withColumnRenamed("Borough","PickupBorough")
           .withColumnRenamed("Zone","PickupZone")
           .withColumnRenamed("service_zone","PickupServiceZone")
         )

OK, we've added some details around the pickup location. Now let's look at the DropOff location. This is a much more sparsely populated field, so we should use a left outer join, much as we would in SQL.

In [7]:
# Match each ride's drop-off location ID against the lookup's LocationID.
# The fact schema declares this column as "DOlocationID" (lower-case "l"), so we use
# that exact casing here - the join then no longer depends on Spark resolving column
# names case-insensitively (it would fail if spark.sql.caseSensitive were enabled)
_dropoff_match = joindf["DOlocationID"] == lookupdf["LocationID"]

# Left outer join so rides with no matching drop-off location are kept (their
# DropOff* columns will simply be null), then drop the lookup key and prefix the
# lookup columns with "DropOff" to distinguish them from the Pickup columns
fulldf = (joindf
           .join(lookupdf, _dropoff_match, "leftouter")
           .drop("LocationID")
           .withColumnRenamed("Borough","DropOffBorough")
           .withColumnRenamed("Zone","DropOffZone")
           .withColumnRenamed("service_zone","DropOffServiceZone")
         )

In [8]:
# Take a look at the outcome - cap the preview at 100 rows of the denormalised DataFrame
_preview_rows = fulldf.limit(100)
display(_preview_rows)

Dispatching_base_num,Pickup_Datetime,DropOff_datetime,PULocationID,DOlocationID,PickupBorough,PickupZone,PickupServiceZone,DropOffBorough,DropOffZone,DropOffServiceZone
B00029,2017-01-01T00:22:00.000+0000,,3,,Bronx,Allerton/Pelham Gardens,Boro Zone,,,
B00029,2017-01-01T00:01:00.000+0000,,3,,Bronx,Allerton/Pelham Gardens,Boro Zone,,,
B00029,2017-01-01T00:16:00.000+0000,,51,,Bronx,Co-Op City,Boro Zone,,,
B00029,2017-01-01T00:46:00.000+0000,,185,,Bronx,Pelham Parkway,Boro Zone,,,
B00029,2017-01-01T00:56:00.000+0000,,174,,Bronx,Norwood,Boro Zone,,,
