In [1]:
%%configure -f

{"conf":
     {"spark.pyspark.python":"/home/hadoop/epidemics-venv/bin/python3",
      "spark.pyspark.virtualenv.bin.path":"/home/hadoop/epidemics-venv/bin",
      "spark.pyspark.python":"/home/hadoop/epidemics-venv/bin/python3",
      "spark.dynamicAllocation.enabled":"true",
      "spark.executor.memory":"16g",
      "spark.driver.memory":"16g",
      "spark.executor.cores":"6",
      "spark.driver.cores":"6",
      "spark.driver.maxResultSize":"20g",
      "livy.server.session.timeout-check" :"false"}
}

In [2]:
import pandas as pd
import geopandas as gpd
import json
import shapely
from shapely.geometry import Point, MultiPoint, Polygon, shape, box
from shapely.ops import unary_union, transform

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, ArrayType

import daphme as dm
from daphme import cleaning

from sedona.spark import *

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1703539116779_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
spark = SparkSession.\
    builder.\
    master("local[*]").\
    appName("Example1").\
    getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from sedona.spark import *

def to_local_time(df: DataFrame, 
                  timezone_to: str,
                  timestamp_col: str = "timestamp",
                  epoch_unit: str = "seconds") -> DataFrame:
    """Transforms a column of epoch times in a Spark DataFrame to a local time zone specified timezone_to. Additional columns for date, hour, and day of the week are also added to the DataFrame.

    Parameters
    ----------
    df : DataFrame
        A Spark DataFrame containing a column with epoch times to be converted.

    timezone_to : str
        A valid timezone identifier for the local timezone of the data (e.g., "America/New_York", "UTC").

    timestamp_col : str (default "timestamp")
        The name of the column in 'df' containing epoch times.

    epoch_unit : str (default "seconds")
        The unit of epoch time in the 'timestamp_col'. Acceptable values include "seconds", "milliseconds", 
        "microseconds", and "nanoseconds". Defaults to "seconds" if not specified.

    Returns
    ----------
    DataFrame
        A new Spark DataFrame with all original columns from 'df' and the following additional columns:
            - 'local_timestamp': The local timestamp derived from the original epoch time.
            - 'date': The date extracted from 'local_timestamp'.
            - 'date_hour': The date and hour extracted from 'local_timestamp'.
            - 'day_of_week': The day of the week (1 for Sunday, 2 for Monday, ..., 7 for Saturday) derived from 'local_timestamp'.

    Example
    ----------
    >>> # Assuming a SparkSession `spark` and a DataFrame `df` with a 'timestamp' column are predefined.
    >>> timezone_str = "America/New_York"
    >>> converted_df = to_local_time(df, timezone_str)
    >>> converted_df.show()
    """

    divisor = {
        "seconds": 1,
        "milliseconds": 1000,
        "microseconds": 1000000,
        "nanoseconds": 1000000000
    }.get(epoch_unit, 1)

    df = df.withColumn(
        "local_timestamp",
        F.from_utc_timestamp(
            F.to_timestamp(F.col(timestamp_col) / divisor),
            timezone_to
        ))

    df = df.withColumn(
        "date",
        F.to_date(F.col("local_timestamp"))
    ).withColumn(
        "date_hour",
        F.date_format(F.col("local_timestamp"), "yyyy-MM-dd HH")
    ).withColumn(
        "day_of_week",
        F.dayofweek(F.col("local_timestamp"))
    )

    return df

def to_mercator(df: DataFrame, 
                spark: SparkSession,
                longitude_col: str = "longitude", 
                latitude_col: str = "latitude") -> DataFrame:
    """Converts geographic coordinates from EPSG:4326 to EPSG:3857 (Web Mercator projection) and appends the result as a new column 'mercator_coord' to the input DataFrame.
    
    Parameters
    ----------
    df : DataFrame
        A Spark DataFrame containing columns corresponding to longitude and latitude values in EPSG:4326.

    spark : SparkSession
        The active SparkSession instance used to execute Spark SQL operations.

    Returns
    ----------
    DataFrame
        A new Spark DataFrame with all original columns from 'df' and an additional column 'mercator_coord' containing the geometry (point) of the original latitude and longitude values transformed to the EPSG:3857 coordinate system.

    Example
    ----------
    >>> # Assuming a SparkSession `spark` and a DataFrame `df` are predefined
    >>> mercator_df = to_mercator(df, spark)
    >>> mercator_df.show()
    """
    
    df.createOrReplaceTempView("df")
    
    query = f"""
        SELECT *,
               ST_FlipCoordinates(
                   ST_Transform(
                       ST_MakePoint({longitude_col}, {latitude_col}), 
                       'EPSG:4326', 'EPSG:3857'
                   )
               ) AS mercator_coord
        FROM df
        """
    
    return spark.sql(query)

def coarse_filter(df: DataFrame, 
                  bounding_wkt: str, 
                  spark: SparkSession,
                  longitude_col: str = "longitude", 
                  latitude_col: str = "latitude", 
                  id_col: str = "id") -> DataFrame:
    """Filters a DataFrame based on whether geographical points (defined by longitude and latitude) fall within a specified geometry.

    Parameters
    ----------
    df : DataFrame
        The Spark DataFrame to be filtered. It should contain columns corresponding to longitude and latitude values, as well as an id column.
    
    bounding_wkt : str
        The Well-Known Text (WKT) string representing the bounding geometry within which points are tested for inclusion. The WKT should define a polygon in the EPSG:4326 coordinate reference system.
    
    spark : SparkSession
        The active SparkSession instance used to execute Spark operations.
    
    longitude_col : str, default "longitude"
        The name of the column in 'df' containing longitude values. Longitude values should be in the EPSG:4326 coordinate reference system.
    
    latitude_col : str, default "latitude"
        The name of the column in 'df' containing latitude values. Latitude values should be in the EPSG:4326 coordinate reference system.
    
    id_col : str, default "id"
        The name of the column in 'df' containing user IDs.
    
    Returns
    ----------
    DataFrame
        A new Spark DataFrame filtered to include only rows where the point (longitude, latitude) falls within the specified geometric boundary defined by 'bounding_wkt'. This DataFrame includes all original columns from 'df'.

    Example
    ----------
    >>> # Assuming a SparkSession `spark` and a DataFrame `df` are predefined
    >>> bounding_wkt = "POLYGON((...))"  # Replace with actual WKT
    >>> filtered_df = coarse_filter(df, bounding_wkt, spark)
    >>> filtered_df.show()
    """
    
    df = df.withColumn("coordinate", F.expr(f"ST_MakePoint({longitude_col}, {latitude_col})"))
    df.createOrReplaceTempView("temp_df")
    
    query = f"""
        WITH UniqueIDs AS (
            SELECT DISTINCT {id_col} AS id
            FROM temp_df
            WHERE ST_Contains(ST_GeomFromWKT('{bounding_wkt}'), coordinate)
        )

        SELECT t.*
        FROM temp_df t
        INNER JOIN UniqueIDs u ON t.{id_col} = u.id
        """
    
    return spark.sql(query)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Intro to DAPHME

In this script, we will demonstrate some of DAPHME's functionalities for analyzing human mobility data. Our analysis will focus on the month of February for the city of Philadelphia. Using a dataset that originally contains [60000] users. 

In our analysis, we will perform three different kinds of operations, which are common in the analyses in these types of data. 
* **Importing**, which optionally converts the datetime to a specified timezone and project the coordinates into ESPG:3857 (which has units in meters and might be appropriate for a local analysis not too far from the equator). 
* **Coarse Filtering**, where we subset to users to make computations for tractable. In particular, we filter out users with very few days/hours with activity, and keep TODO [users with "sufficient" pings in the area]. We also filter to users with activity in a given time period.
* **Persisting**

### Load Pings

Daphme can work with a single file in a number of formats (csv, parquet, txt) but its true utility comes when working with partitions of the data, which are often organized in a directory. In this example, we specify the path of the files to be loaded.

In [15]:
#gravy_raw = spark.read.parquet("s3://phl-pings/gravy/")
gravy_raw = spark.read.parquet("s3://phl-pings/gravy/part_0.snappy.parquet")

gravy_raw = gravy_raw. \
    withColumn("latitude", F.col("geohashlatitude").cast(DoubleType())). \
    withColumn("longitude", F.col("geohashlongitude").cast(DoubleType()))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
gravy_raw.count()
gravy_raw.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- grid: string (nullable = true)
 |-- geohashlatitude: string (nullable = true)
 |-- geohashlongitude: string (nullable = true)
 |-- geohashnine: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- timezone: string (nullable = true)
 |-- ipaddress: string (nullable = true)
 |-- forensicflag: long (nullable = true)
 |-- devicetype: string (nullable = true)
 |-- recordcount: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

In [None]:
#cuebiq_df = spark.read.options(header='True', inferSchema='True', delimiter=',').parquet("s3://phl-pings/cuebiq-jan-mar/")
#cuebiq_df.show()

### Convert UTC to local datetime

In [22]:
gravy_df = cleaning.to_local_time(gravy_raw, 'America/New_York', epoch_unit='milliseconds')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Coarse filter pings to Philadelphia

Takes in a geometry and filters to pings within the given geometry

In [8]:
# Load in a bounding box for Philadelphia

import boto3
from io import BytesIO

s3 = boto3.client('s3')
bucket_name = 'upenn-seas-wattscovid19lab'
object_key = 'paco/geometry/Census_Tracts_2010.geojson'

obj = s3.get_object(Bucket=bucket_name, Key=object_key)
geojson_data = json.load(BytesIO(obj['Body'].read()))

features = geojson_data['features']

polygons = [shape(feature["geometry"]).buffer(0) for feature in features if feature["properties"]["GEOID10"][:5] == "42101"]
phila_poly = unary_union(polygons).buffer(0.0015).simplify(0.0015)
phila_box = box(*phila_poly.bounds)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [24]:
phila_box_wkt = phila_box.wkt

gravy_df_filtered = cleaning.coarse_filter(gravy_df, phila_box_wkt, spark, id_col="grid")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Convert coordinates to mercator in meters

In [25]:
gravy_df_filtered = cleaning.to_mercator(gravy_df_filtered, spark)
gravy_df_filtered = gravy_df_filtered.select("grid", "local_timestamp", "mercator_coord")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
gravy_df_filtered.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------------+--------------------+
|                grid|    local_timestamp|      mercator_coord|
+--------------------+-------------------+--------------------+
|00002b4c-410c-3a1...|2020-03-31 19:25:58|POINT (4850108.06...|
|0004887b-01de-392...|2019-11-27 17:24:09|POINT (4859015.82...|
|0004d938-bcf3-380...|2019-10-10 16:23:09|POINT (4956193.59...|
|0004d938-bcf3-380...|2019-10-10 16:23:51|POINT (4956193.59...|
|0004d938-bcf3-380...|2019-10-10 16:23:51|POINT (4956193.59...|
|0004d938-bcf3-380...|2019-10-10 16:24:05|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:05|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:09|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:21|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:33|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:33|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:55|POINT (4956199.90...|
|0004d938-bcf3-380...|2019-10-10 16:24:5

In [20]:
gravy_df_filtered.count()
#gravy_df_filtered.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

30005324

## Persist

In [None]:
output = gravy_df_filtered.select(
    'grid', 
    'local_timestamp', 
    'mercator_coord', 
    'date', 
    'date_hour', 
    'day_of_week')
output.write.partitionBy("date").option("header", "true").mode("overwrite").parquet("s3://phl-pings/gravy_clean/")