In [1]:
# Location of the sample transactions CSV (latitude, longitude, amount, user)
# hosted in the geoscan-fraud repository.
url = (
    'https://raw.githubusercontent.com/'
    'databricks-industry-solutions/geoscan-fraud/main/data/transactions.csv'
)

In [2]:
# `display` and `HTML` come from the public IPython.display module; importing
# `display` explicitly keeps the notebook independent of kernel-injected builtins.
from IPython.display import HTML, display
from pyspark import SparkFiles
from pyspark.sql.session import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName('Geo_Fraud').getOrCreate()

# Stop <pre> blocks from wrapping so wide tabular output stays on one line per row.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Render DataFrames eagerly when they are displayed, without an explicit .show().
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [3]:
# Explicit column types for the CSV, declared once and passed to the reader.
TRANSACTIONS_SCHEMA = 'latitude float,longitude float,amount float,user string'

# Register the remote CSV with Spark, then read the copy that SparkFiles resolves locally.
spark.sparkContext.addFile(url)
transactions = (
    spark.read
    .option("header", "true")
    # `sep` is the CSV reader's separator option; the schema is supplied through
    # the `schema=` argument below rather than as a reader option.
    .option("sep", ",")
    .csv("file://" + SparkFiles.get("transactions.csv"), schema=TRANSACTIONS_SCHEMA)
)

# The clustering cells further down refer to this data as `points_df`;
# expose the same DataFrame under that name so they run on a fresh kernel.
points_df = transactions

In [4]:
# Render the loaded transactions DataFrame as a quick look at the parsed columns.
display(transactions)

latitude,longitude,amount,user
40.720684,-74.01645,67.23,804c7fa2-8063-4ba...
40.72161,-73.993805,52.31,9cafdb6d-9134-4ee...
40.71957,-73.98714,89.58,72fc865a-0c34-409...
40.756462,-74.00364,186.77,72fc865a-0c34-409...
40.7317,-73.979996,198.84,65bd17be-b030-44a...
40.75478,-73.98101,150.34,65bd17be-b030-44a...
40.720787,-73.98129,197.0,804c7fa2-8063-4ba...
40.73443,-73.97867,139.47,65bd17be-b030-44a...
40.758423,-74.00124,42.8,65bd17be-b030-44a...
40.74708,-74.00015,99.79,9cafdb6d-9134-4ee...


In [5]:
import h3
from pyspark.sql.functions import udf
from pyspark.sql import functions as F


@udf("string")
def to_h3(lat, lng, precision):
    """Return the upper-cased H3 cell index containing a point.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.
        precision: H3 resolution passed straight to the h3 library.

    Returns:
        The H3 index as an upper-case string, or None when any input is null
        (so rows with missing coordinates do not fail the whole Spark job).
    """
    if lat is None or lng is None or precision is None:
        return None
    # h3-py 4.x renamed `geo_to_h3` to `latlng_to_cell`; use whichever the
    # installed version provides.
    to_cell = getattr(h3, "latlng_to_cell", None) or h3.geo_to_h3
    return to_cell(lat, lng, precision).upper()

In [6]:
# Count transactions per H3 cell at resolution 9 and list the busiest cells first.
h3_cell = to_h3(F.col('latitude'), F.col('longitude'), F.lit(9)).alias('h3')
display(
    transactions.groupBy(h3_cell).count().orderBy(F.col('count').desc())
)

h3,count
892A107252FFFFF,210
892A1072197FFFF,210
892A1072CABFFFF,206
892A1072CA3FFFF,195
892A1072DD7FFFF,193
892A100D3CBFFFF,191
892A100D24BFFFF,191
892A100D24FFFFF,189
892A1072507FFFF,188
892A1072187FFFF,187


In [7]:
import folium
from folium import plugins

# Collect a 10% sample of coordinates for the heat map. Only the two coordinate
# columns are pulled to the driver, rows with a null coordinate are dropped so
# no NaN reaches the heat map, and the seed keeps the sample reproducible.
points = (
    transactions
    .select('latitude', 'longitude')
    .na.drop()
    .sample(fraction=0.1, seed=42)
    .toPandas()
)

# Base map centred on the fixed coordinates used throughout this notebook.
nyc = folium.Map([40.75466940037548,-73.98365020751953], zoom_start=12, width='80%', height='100%')
# NOTE(review): newer folium releases may no longer ship the 'Stamen Toner'
# built-in tile set — confirm against the installed folium version.
folium.TileLayer('Stamen Toner').add_to(nyc)
nyc.add_child(plugins.HeatMap(points.to_numpy(), radius=12))
nyc

In [8]:
from geoscan import Geoscan
import mlflow

# Clustering hyper-parameters, named once so the values given to the estimator
# and the values logged to MLflow cannot drift apart.
GEOSCAN_EPSILON = 200
GEOSCAN_MIN_PTS = 20

# NOTE(review): the recorded output of this cell is
# "TypeError: 'JavaPackage' object is not callable". With PySpark wrappers this
# usually means the backing JVM classes are not on the Spark classpath —
# confirm the geoscan JAR is attached to the session before running.
with mlflow.start_run(run_name='GEOSCAN') as run:

    geoscan = (
        Geoscan()
        .setLatitudeCol('latitude')
        .setLongitudeCol('longitude')
        .setPredictionCol('cluster')
        .setEpsilon(GEOSCAN_EPSILON)
        .setMinPts(GEOSCAN_MIN_PTS)
    )

    # Log parameters before fitting so they are recorded even if the fit fails.
    mlflow.log_param('epsilon', GEOSCAN_EPSILON)
    mlflow.log_param('minPts', GEOSCAN_MIN_PTS)

    model = geoscan.fit(points_df)
    mlflow.spark.log_model(model, 'model')

    # Kept for later cells, which use the run id to name and attach artifacts.
    run_id = run.info.run_id

TypeError: 'JavaPackage' object is not callable

In [None]:
import tempfile

import mlflow

# `temp_directory` is not assigned in any earlier cell of this notebook. Honour
# a value provided by the environment, otherwise fall back to a fresh scratch
# directory so the cell runs on a clean kernel.
if 'temp_directory' not in globals():
    temp_directory = tempfile.mkdtemp(prefix='geoscan_')

# Export the fitted clusters as GeoJSON and write them to a file named after the run.
geoJson = model.toGeoJson()
geojson_path = "{}/{}_geoscan.geojson".format(temp_directory, run_id)
with open(geojson_path, 'w', encoding='utf-8') as f:
    f.write(geoJson)

# Attach the file to the MLflow run that produced the model.
client = mlflow.tracking.MlflowClient()
client.log_artifact(run_id, geojson_path)

In [None]:
# Overlay the exported cluster shapes on the existing heat map and show it.
nyc.add_child(folium.GeoJson(geoJson))
nyc

In [None]:
import random
from pyspark.sql.types import *

def sample(latitudes, longitudes):
    """Pick at most 10 random (latitude, longitude) pairs from two parallel lists.

    Called once per group of points sharing the same H3 polygon of size 11 (30m),
    so dense polygons contribute no more than 10 points.

    Args:
        latitudes: Latitudes of the points in the group.
        longitudes: Longitudes of the same points, in the same order.

    Returns:
        A list of (latitude, longitude) tuples: every pair when there are 10 or
        fewer, otherwise 10 pairs chosen at random without replacement.
    """
    pairs = [(lat, lng) for lat, lng in zip(latitudes, longitudes)]
    size = len(pairs) if len(pairs) < 10 else 10
    return random.sample(pairs, size)

# Return type of the sampling UDF: an array of (latitude, longitude) structs.
sample_schema = ArrayType(
    StructType()
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
)
sample_udf = udf(sample, sample_schema)

# Group points by H3 cell at resolution 11, gather each cell's coordinates into
# parallel lists, sample them with the UDF, and flatten back to one row per point.
sample_df = (
    points_df
    .groupBy(to_h3(F.col("latitude"), F.col("longitude"), F.lit(11)))
    .agg(
        F.collect_list("latitude").alias("latitudes"),
        F.collect_list("longitude").alias("longitudes"),
    )
    .withColumn(
        'sample',
        F.explode(sample_udf(F.col('latitudes'), F.col('longitudes'))),
    )
    .select('sample.latitude', 'sample.longitude')
)

# Repeat the resolution-9 cell count on the down-sampled points, busiest cells first.
sampled_h3_cell = to_h3(F.col("latitude"), F.col("longitude"), F.lit(9)).alias("h3")
display(
    sample_df.groupBy(sampled_h3_cell).count().orderBy(F.col("count").desc())
)


In [None]:
from pyspark.sql import functions as F

# Number of points the fitted model assigns to each cluster, in ascending cluster order.
display(
    model.transform(points_df)
    .groupBy('cluster')
    .count()
    .orderBy(F.col('cluster').asc())
)

In [None]:
from folium.plugins import MarkerCluster

# Points the model leaves without a cluster are treated as anomalies. Collect a
# 1% sample of them; the fixed seed makes the plotted sample reproducible.
nyc_anomalies_points = (
    model.transform(points_df)
    .filter(F.expr('cluster IS NULL'))
    .sample(fraction=0.01, seed=42)
    .toPandas()
)

nyc_anomalies = folium.Map([40.75466940037548,-73.98365020751953], zoom_start=12, width='80%', height='100%')
# NOTE(review): newer folium releases may no longer ship the 'Stamen Toner'
# built-in tile set — confirm against the installed folium version.
folium.TileLayer('Stamen Toner').add_to(nyc_anomalies)
folium.GeoJson(geoJson, name="geojson").add_to(nyc_anomalies)

# itertuples avoids building a pandas Series per row, which iterrows does.
for point in nyc_anomalies_points.itertuples(index=False):
    folium.CircleMarker([point.latitude, point.longitude], radius=2, color='red').add_to(nyc_anomalies)

nyc_anomalies

In [None]:
# Persist the fitted model under a path named after the MLflow run. The Spark ML
# writer's overwrite() replaces any previous save at that path, so the cell can
# be re-run and does not depend on the Databricks-only `dbutils` helper.
# NOTE(review): assumes the model exposes the standard Spark ML `write()` API
# that `save(path)` is normally shorthand for — confirm for GeoscanModel.
geoscan_model_path = "{}/{}_geoscan".format(temp_directory, run_id)
model.write().overwrite().save(geoscan_model_path)


In [None]:
from geoscan import GeoscanModel
# Reload the persisted model from the run-specific path it was saved to.
saved_model_path = "{}/{}_geoscan".format(temp_directory, run_id)
model = GeoscanModel.load(saved_model_path)

In [None]:
from geoscan import GeoscanPersonalized
import mlflow

# Per-user clustering hyper-parameters, named once so the values given to the
# estimator and the values logged to MLflow cannot drift apart.
PERSONALIZED_EPSILON = 100
PERSONALIZED_MIN_PTS = 3

with mlflow.start_run(run_name='GEOSCAN_PERSONALIZED') as run:

    # Same estimator setup as the global model, plus a grouping column so that
    # clusters are fitted separately for each value of `user`.
    geoscan = (
        GeoscanPersonalized()
        .setLatitudeCol('latitude')
        .setLongitudeCol('longitude')
        .setPredictionCol('cluster')
        .setGroupedCol('user')
        .setEpsilon(PERSONALIZED_EPSILON)
        .setMinPts(PERSONALIZED_MIN_PTS)
    )

    # Log parameters before fitting so they are recorded even if the fit fails.
    mlflow.log_param('epsilon', PERSONALIZED_EPSILON)
    mlflow.log_param('minPts', PERSONALIZED_MIN_PTS)

    models = geoscan.fit(points_df)
    run_id = run.info.run_id

In [None]:
import tempfile

from geoscan import GeoscanPersonalizedModel

# `config` is not assigned in any earlier cell of this notebook. Use it when the
# environment provides it, otherwise save under a fresh scratch directory so the
# cell runs on a clean kernel.
if 'config' in globals():
    model_path = config['model']['path']
else:
    model_path = "{}/geoscan_personalized".format(tempfile.mkdtemp(prefix='geoscan_'))

# overwrite() lets the cell be re-run when a previous save already exists at the path.
# NOTE(review): assumes the model exposes the standard Spark ML `write()` API
# that `save(path)` is normally shorthand for — confirm for this model class.
models.write().overwrite().save(model_path)

# Reload from disk so the following cells work with the persisted model.
model_personalized = GeoscanPersonalizedModel.load(model_path)

In [None]:
# Export the personalized model's clusters as GeoJSON and display the result.
# The next cell filters this result by `user` and reads its `cluster` field.
geoJsons = model_personalized.toGeoJson()
display(geoJsons)

In [None]:
from pyspark.sql import functions as F

# User whose personalized clusters and transactions are plotted.
user = '9cafdb6d-9134-4ee8-bdf6-972ebc3af729'

# Fetch only this user's GeoJSON row instead of converting the whole result to
# pandas, and fail with a clear message when the user has no clusters.
user_row = geoJsons.filter(F.col('user') == user).select('cluster').first()
if user_row is None:
    raise ValueError("No personalized clusters found for user {}".format(user))
personalized_geojson = user_row['cluster']

# Only the coordinates are needed for the heat map; rows with a null coordinate
# are dropped so no NaN reaches it.
personalized_data = (
    points_df
    .filter(F.col('user') == user)
    .select('latitude', 'longitude')
    .na.drop()
    .toPandas()
)

nyc_personalized = folium.Map([40.75466940037548,-73.98365020751953], zoom_start=12, width='80%', height='100%')
# NOTE(review): newer folium releases may no longer ship the 'Stamen Toner'
# built-in tile set — confirm against the installed folium version.
folium.TileLayer('Stamen Toner').add_to(nyc_personalized)
nyc_personalized.add_child(plugins.HeatMap(personalized_data.to_numpy(), radius=8))
folium.GeoJson(personalized_geojson, name="geojson").add_to(nyc_personalized)
nyc_personalized

Reference
https://github.com/databricks-industry-solutions/geoscan-fraud
https://github.com/databricks-industry-solutions/geoscan-fraud/blob/main/01_geofraud_clustering.py#L174