In [1]:
%%configure -f

{"conf":
     {"spark.pyspark.python":"/home/hadoop/epidemics-venv/bin/python3",
      "spark.pyspark.virtualenv.bin.path":"/home/hadoop/epidemics-venv/bin",
      "spark.pyspark.python":"/home/hadoop/epidemics-venv/bin/python3",
      "spark.dynamicAllocation.enabled":"true",
      "spark.executor.memory":"16g",
      "spark.driver.memory":"16g",
      "spark.executor.cores":"6",
      "spark.driver.cores":"6",
      "spark.driver.maxResultSize":"20g",
      "livy.server.session.timeout-check" :"false"}
}

In [2]:
import daphme as dm
import pandas as pd
from pyproj import Transformer

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

from sedona.spark import *

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1702659324777_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
spark = SparkSession.\
    builder.\
    master("local[*]").\
    appName("Example1").\
    getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Intro to DAPHME

In this script, we will demonstrate some of DAPHME's functionalities for analyzing human mobility data. Our analysis will focus on the month of February for the city of Philadelphia. Using a dataset that originally contains [60000] users. 

In our analysis, we will perform three different kinds of operations, which are common in the analyses in these types of data. 
* **Importing**, which optionally converts the datetime to a specified timezone and project the coordinates into ESPG:3857 (which has units in meters and might be appropriate for a local analysis not too far from the equator). 
* **Coarse Filtering**, where we subset to users to make computations for tractable. In particular, we filter out users with very few days/hours with activity, and keep TODO [users with "sufficient" pings in the area]. We also filter to users with activity in a given time period.
* **Persisting**

## Load Pings

Daphme can work with a single file in a number of formats (csv, parquet, txt) or with a directory containing partitioned data. In this example, we specify the path of the files to be loaded.

In [4]:
parqDF = spark.read.parquet("s3://phl-pings/gravy/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
parqDF = parqDF.withColumn(
    "timestamp",
    F.to_timestamp(F.col("timestamp") / 1000) # divide by 1000 because given in milliseconds
)

# parqDF = parqDF.withColumn(
#    "timestamp",
#    F.from_utc_timestamp(
#        F.to_utc_timestamp(parqDF.timestamp1, F.col("timezone")), # converts ts from tz to GMT
#    F.col("timezone")) # converts ts from GMT to tz

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
@F.pandas_udf(DoubleType())
def transform_lat(lat, lon):
    trans = Transformer.from_crs('epsg:4326','epsg:3857',always_xy=True)
    _, transformed_lat = trans.transform(lon, lat)
    return pd.Series(transformed_lat)

@F.pandas_udf(DoubleType())
def transform_long(lat, lon):
    trans = Transformer.from_crs('epsg:4326','epsg:3857',always_xy=True)
    transformed_long, _ = trans.transform(lon, lat)
    return pd.Series(transformed_long)

parqDF = parqDF.withColumn('latitude_3857', transform_lat(F.col('geohashlatitude'), F.col('geohashlongitude'))) \
    .withColumn('longitude_3857', transform_long(F.col('geohashlatitude'), F.col('geohashlongitude')))

parqDF.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------+----------------+-----------+-------------------+----------------+--------------+------------+----------+-----------+-----------------+------------------+
|                grid|geohashlatitude|geohashlongitude|geohashnine|          timestamp|        timezone|     ipaddress|forensicflag|devicetype|recordcount|    latitude_3857|    longitude_3857|
+--------------------+---------------+----------------+-----------+-------------------+----------------+--------------+------------+----------+-----------+-----------------+------------------+
|00000006-23aa-3e4...|      39.947813|      -74.911673|  dr4er26bz|2020-03-01 21:50:37|America/New_York| 73.76.110.202|       33536|      AAID|          1|4858361.502393236| -8339129.29283222|
|00000013-ec62-3ef...|      40.013387|      -75.144017|  dr4edh1yc|2019-10-16 09:36:06|America/New_York|  172.56.28.49|      525056|      IDFA|          1|4867887.832599926|-8364993.708601093|
|00000013-ec62-3ef...|      40.0133

In [9]:
parqDF.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- grid: string (nullable = true)
 |-- geohashlatitude: string (nullable = true)
 |-- geohashlongitude: string (nullable = true)
 |-- geohashnine: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timezone: string (nullable = true)
 |-- ipaddress: string (nullable = true)
 |-- forensicflag: long (nullable = true)
 |-- devicetype: string (nullable = true)
 |-- recordcount: integer (nullable = true)
 |-- latitude_3857: double (nullable = true)
 |-- longitude_3857: double (nullable = true)

In [7]:
testDF = parqDF.limit(100)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Coarse Filtering

Takes in a geometry and filters to pings within the given geometry

In [11]:
testDF.createOrReplaceTempView("df")

counties_geom = spark.sql(
      "SELECT *, ST_MakePoint(latitude_3857, longitude_3857) as geometry from df"
)

counties_geom.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------+----------------+-----------+-------------------+----------------+--------------+------------+----------+-----------+-----------------+------------------+--------------------+
|                grid|geohashlatitude|geohashlongitude|geohashnine|          timestamp|        timezone|     ipaddress|forensicflag|devicetype|recordcount|    latitude_3857|    longitude_3857|            geometry|
+--------------------+---------------+----------------+-----------+-------------------+----------------+--------------+------------+----------+-----------+-----------------+------------------+--------------------+
|00000006-23aa-3e4...|      39.947813|      -74.911673|  dr4er26bz|2020-03-01 21:50:37|America/New_York| 73.76.110.202|       33536|      AAID|          1|4858361.502393236| -8339129.29283222|POINT (4858361.50...|
|00000013-ec62-3ef...|      40.013387|      -75.144017|  dr4edh1yc|2019-10-16 09:36:06|America/New_York|  172.56.28.49|      525056|      IDFA| 

In [None]:
print(f'The PySpark {spark.version} version is running...')