In [1]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "/home/hadoop/epidemics-venv/bin/python3",
        "spark.pyspark.virtualenv.bin.path": "/home/hadoop/epidemics-venv/bin",
        "spark.dynamicAllocation.enabled": "true",
        "spark.executor.memory": "16g",
        "spark.driver.memory": "16g",
        "spark.executor.cores": "6",
        "spark.driver.cores": "6",
        "spark.driver.maxResultSize": "20g",
        "livy.server.session.timeout-check": "false"
    }
}

In [None]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "/home/hadoop/epidemics-venv/bin/python3",
        "spark.pyspark.virtualenv.bin.path": "/home/hadoop/epidemics-venv/bin",
        "spark.dynamicAllocation.enabled": "true",
        "spark.dynamicAllocation.minExecutors": "44",
        "spark.dynamicAllocation.maxExecutors": "88",
        "spark.executor.instances": "44",
        "spark.executor.memory": "21g",
        "spark.driver.memory": "28g",
        "spark.executor.cores": "5",
        "spark.driver.cores": "5",
        "spark.driver.maxResultSize": "20g",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.kryoserializer.buffer.max": "1024m",
        "spark.default.parallelism": "528",
        "spark.sql.shuffle.partitions": "528",
        "spark.memory.fraction": "0.8",
        "spark.memory.storageFraction": "0.5",
        "spark.shuffle.service.enabled": "true",
        "spark.shuffle.compress": "true",
        "spark.io.compression.codec": "snappy",
        "spark.shuffle.file.buffer": "1m",
        "spark.rdd.compress": "true",
        "spark.executor.extraJavaOptions": "-XX:+UseG1GC",
        "spark.driver.extraJavaOptions": "-XX:+UseG1GC",
        "livy.server.session.timeout-check": "false"
    }
}


VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1705592640666_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
import pandas as pd
import geopandas as gpd
import json
import shapely
from shapely.geometry import Point, MultiPoint, Polygon, shape, box
from shapely.ops import unary_union, transform

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType

import daphme as dm
from daphme import cleaning

from sedona.spark import *

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Canonical column names shared by the cells of this notebook.
UID = 'uid'
TIMESTAMP = 'timestamp'
MERCATOR_COORD = 'mercator_coord'
MERCATOR_X = 'x'
MERCATOR_Y = 'y'
LATITUDE = 'latitude'
LONGITUDE = 'longitude'
DATE = 'date'
DATE_HOUR = 'date_hour'
DAY_OF_WEEK = 'day_of_week'

# Four-column ping schema (id, timestamp, latitude, longitude); every field
# is declared nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        (UID, StringType()),
        (TIMESTAMP, LongType()),
        (LATITUDE, DoubleType()),
        (LONGITUDE, DoubleType()),
    )
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Obtain the Spark session for this notebook.
#
# No master URL is set here on purpose: the session is sized through the
# `%%configure` cell at the top of the notebook, and hard-coding "local[*]"
# would contradict that cluster configuration if a session ever had to be
# created from scratch.
# NOTE(review): `%%configure` implies a Livy/sparkmagic-managed session, in
# which case getOrCreate() simply returns the existing one -- confirm.
spark = (
    SparkSession.builder
    .appName("Example1")
    .getOrCreate()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Intro to DAPHME

In this script, we will demonstrate some of DAPHME's functionalities for analyzing human mobility data. Our analysis will focus on the month of February for the city of Philadelphia, using a dataset that originally contains [60000] users.

In our analysis, we will perform three different kinds of operations, which are common in analyses of this type of data.
* **Importing**, which optionally converts the datetime to a specified timezone and projects the coordinates into EPSG:3857 (which has units in meters and might be appropriate for a local analysis not too far from the equator).
* **Coarse Filtering**, where we subset the users to make computations more tractable. In particular, we filter out users with very few days/hours with activity, and keep TODO [users with "sufficient" pings in the area]. We also filter to users with activity in a given time period.
* **Persisting**

### Load Pings

Daphme can work with a single file in a number of formats (csv, parquet, txt) but its true utility comes when working with partitions of the data, which are often organized in a directory. In this example, we specify the path of the files to be loaded.

In [30]:
# Explicit schema for the raw Gravy parquet files. The geohash latitude and
# longitude are declared as strings here and cast to doubles after the read.
gravy_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ('grid', StringType()),
        ('geohashlatitude', StringType()),
        ('geohashlongitude', StringType()),
        ('geohashnine', StringType()),
        ('timestamp', LongType()),
        ('timezone', StringType()),
        ('ipaddress', StringType()),
        ('forensicflag', LongType()),
        ('devicetype', StringType()),
        ('recordcount', IntegerType()),
    )
])

# Read -> cast coordinates to double -> keep only the four relevant columns
# under the names the rest of the notebook expects (uid, timestamp,
# latitude, longitude).
gravy_raw = (
    spark.read
    .schema(gravy_schema)
    .parquet("s3://phl-pings/gravy/")
    .withColumn("geohashlatitude", F.col("geohashlatitude").cast(DoubleType()))
    .withColumn("geohashlongitude", F.col("geohashlongitude").cast(DoubleType()))
    .selectExpr(
        "grid as uid",
        "timestamp",
        "geohashlatitude as latitude",
        "geohashlongitude as longitude",
    )
)

gravy_raw.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------+---------+----------+
|                 uid|    timestamp| latitude| longitude|
+--------------------+-------------+---------+----------+
|00000006-23aa-3e4...|1583099437000|39.947813|-74.911673|
|00000013-ec62-3ef...|1571218566000|40.013387|-75.144017|
|00000013-ec62-3ef...|1571225799000|40.013387|-75.144017|
|00000013-ec62-3ef...|1571236576000|40.031884|-75.174015|
|00000013-ec62-3ef...|1572230042000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572233767000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572244569000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572248169000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572255366000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572258968000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572262567000|39.980814|-75.172212|
|00000013-ec62-3ef...|1572266173000|39.980814|-75.172212|
|00000013-ec62-3ef...|1572284181000|40.013087|-75.094707|
|00000013-ec62-3ef...|1572287781000|39.980814|-75.172212|
|00000013-ec62

In [5]:
# Total number of raw pings. count() is a Spark action, so this runs a job
# over the whole dataset (the recorded output is 18,440,128,718 rows).
gravy_raw.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18440128718

In [6]:
# Disabled: alternative read of the Cuebiq Jan-Mar pings, kept for reference.
# NOTE(review): header/inferSchema/delimiter look like CSV reader options;
# presumably they have no effect on a parquet read -- confirm before re-enabling.
#cuebiq_df = spark.read.options(header='True', inferSchema='True', delimiter=',').parquet("s3://phl-pings/cuebiq-jan-mar/")
#cuebiq_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Convert UTC to local datetime

In [5]:
# The raw `timestamp` column holds epoch values in milliseconds; convert them
# to America/New_York local time.
gravy_df = cleaning.to_local_time(
    gravy_raw,
    'America/New_York',
    epoch_unit='milliseconds',
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Coarse filter pings to Philadelphia

Takes in a geometry and filters to pings within the given geometry

In [6]:
# Load in a bounding box for Philadelphia

import boto3
from io import BytesIO

# GEOID10 prefix (state 42 + county 101) selecting Philadelphia County tracts.
PHILA_COUNTY_FIPS = "42101"
# Distance used both to buffer and to simplify the merged county polygon,
# expressed in the GeoJSON's coordinate units (presumably degrees -- TODO confirm).
PHILA_TOLERANCE = 0.0015

s3 = boto3.client('s3')
bucket_name = 'upenn-seas-wattscovid19lab'
object_key = 'paco/geometry/Census_Tracts_2010.geojson'

# Download the census-tract GeoJSON from S3 and parse it.
obj = s3.get_object(Bucket=bucket_name, Key=object_key)
geojson_data = json.load(BytesIO(obj['Body'].read()))

features = geojson_data['features']

# Keep only Philadelphia tracts; buffer(0) is applied to each tract geometry
# before the union.
polygons = [
    shape(feature["geometry"]).buffer(0)
    for feature in features
    if feature["properties"]["GEOID10"].startswith(PHILA_COUNTY_FIPS)
]

# Fail loudly if nothing matched: an empty union would otherwise produce an
# unusable bounding box and silently filter out every ping downstream.
if not polygons:
    raise ValueError(
        f"No census tracts with GEOID10 prefix {PHILA_COUNTY_FIPS!r} found in "
        f"s3://{bucket_name}/{object_key}"
    )

# Merge the tracts, pad and simplify the outline, then take its bounding box.
phila_poly = unary_union(polygons).buffer(PHILA_TOLERANCE).simplify(PHILA_TOLERANCE)
phila_box = box(*phila_poly.bounds)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [7]:
# WKT text of the Philadelphia bounding box, handed to the coarse filter.
phila_box_wkt = phila_box.wkt

# The load cell renames the device id column from `grid` to `uid`, so the id
# column passed to the filter must be "uid"; "grid" no longer exists on
# gravy_df.
gravy_df_filtered = cleaning.coarse_filter(gravy_df, phila_box_wkt, spark, id_col="uid")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Convert coordinates to mercator in meters

In [8]:
# Project the filtered pings with daphme's to_mercator; the persist step later
# selects `mercator_coord`, `x` and `y`, which are presumably added here.
# NOTE(review): this rebinds gravy_df_filtered to its own transform, so
# re-running the cell applies the projection to an already-projected frame --
# confirm to_mercator tolerates that.
gravy_df_filtered = cleaning.to_mercator(gravy_df_filtered, spark)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Persist

In [11]:
# Final column set to persist. The device id column is called `uid` after the
# load cell's rename (there is no `grid` column any more); it is written out
# under the name `identifier`.
gravy_df_clean = gravy_df_filtered.select(
    'uid',
    'local_timestamp',
    'mercator_coord',
    'x',
    'y',
    'in_geo',
    'date',
    'date_hour',
    'day_of_week'
).withColumnRenamed("uid", "identifier")

# Write one parquet directory per `date`, replacing any previous output.
# The former `.option("header", "true")` was dropped: it is a CSV option and
# has no meaning for a Parquet write.
gravy_df_clean.write.partitionBy("date").mode("overwrite").parquet("s3://phl-pings/gravy_clean/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o158.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Authorized committer (attemptNumber=0, stage=8, partition=282) failed; but task commit success, data duplication may happen. reason=ExecutorLostFailure(13,false,Some(Container marked as failed: container_1703550315317_0002_01_000014 on host: ip-10-0-1-136.us-east-2.compute.internal. Exit status: -100. Diagnostics: Container released on a *lost* node.))
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2974)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2910)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2909)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.s

In [14]:
# Preview the cleaned frame. No cache()/persist() call is visible above, so
# this presumably recomputes the pipeline rather than reading the written files.
gravy_df_clean.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------------+--------------------+-------------------+------------------+------+----------+-------------+-----------+
|          identifier|    local_timestamp|      mercator_coord|                  x|                 y|in_geo|      date|    date_hour|day_of_week|
+--------------------+-------------------+--------------------+-------------------+------------------+------+----------+-------------+-----------+
|57f80ddf-e272-37c...|2020-01-05 06:28:39|POINT (-8339129.2...| -8339129.292832218| 4858361.502393234| false|2020-01-05|2020-01-05 06|          1|
|57f80ddf-e272-37c...|2020-01-07 18:43:31|POINT (-8367196.0...| -8367196.053406948| 4859015.823786038|  true|2020-01-07|2020-01-07 18|          3|
|57f904d1-7a14-436...|2020-03-23 03:30:28|POINT (-8369513.0...| -8369513.057288318| 4854598.321212323|  true|2020-03-23|2020-03-23 03|          2|
|57fb1ca6-13fe-39d...|2020-04-25 12:26:00|POINT (-8362108.1...|  -8362108.19608024|  4865112.55255486|  true|2020-04-2