# Spark notebook for computing Amphan Cyclone induced movement

### Installing and importing the Python/PySpark libraries

In [0]:
# install and import all the required Python/PySpark libraries
# Note: geospark must be installed in this cell, before the GeoSpark Scala libraries are registered in the cell below

!pip install geopandas
!pip install geospark
!pip install plotly
!pip install pymobility --upgrade

import os
import time
import calendar
from datetime import datetime, date
import pandas as pd
import geopandas as gp
import plotly.express as px

from pyspark.sql import functions as F

from mobility.odm.odm import OriginDestinationMigration
from mobility.odm.utils import get_net_migration_from_od_matrix, extract

### Importing the geospark libraries

In [0]:
%scala

// import the necessary libraries, packages for geospark

import com.vividsolutions.jts.geom.{Coordinate, Geometry, GeometryFactory}
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
import org.datasyslab.geospark.spatialRDD.SpatialRDD
import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
// register everything GeoSparkSQLRegistrator provides on this notebook's SQL context
// (the Python cell above notes that geospark has to be pip-installed before this step)
GeoSparkSQLRegistrator.registerAll(sqlContext)

// enable delta cache for performance optimization
// (set as a runtime configuration value on the active Spark session)
spark.conf.set("spark.databricks.io.cache.enabled", "true")

## Movement period: 2020/05/20 to 2020/05/27

### Defining the variables

In [0]:
# --- parameters for this run: period, admin level, input and output locations ---

# movement period, written as YYYY/MM/DD strings
from_date, to_date = "2020/05/20", "2020/05/27"

# admin level to which we are calculating the OD matrix
admin = "adm2"

# data location handed to OriginDestinationMigration
veraset_ind = "/mnt/CUBEIQ/esapv/India/delta_veraset_v3"

# location where the OD matrix and net migration file are exported
output_location = "/mnt/CUBEIQ/esapv/India/West Bengal/Amphan/200520_200527"

# administrative boundaries GeoPackage and the layer that is read from it
india_gpkg_file_location = "/dbfs/mnt/CUBEIQ/esapv/India/India_Administrative_Boundaries.gpkg"
gpkg_admin = "Admin3"

In [0]:
# load the configured layer of the India administrative boundaries GeoPackage
gdf = gp.read_file(
    india_gpkg_file_location,
    layer=gpkg_admin,
)

### OD matrices computation

In [0]:
# set up the OriginDestinationMigration helper for the chosen admin level
od_migration = OriginDestinationMigration(
    f'{admin}_code',
    veraset_ind,
    output_location,
    record_filter=2,
    spark=spark,
)

# keep only the records that fall inside our movement period
od_migration.sjr = extract(od_migration.sjr, from_date, to_date)

# keep only devices that appear in West Bengal (adm1_code '19') at least once:
# collect those device ids, then inner-join the records against them
seen_in_west_bengal = F.col('adm1_code') == '19'
relevant_devices = (
    od_migration.sjr
    .filter(seen_in_west_bengal)
    .select('device_id')
    .distinct()
)
od_migration.sjr = od_migration.sjr.join(relevant_devices, on='device_id', how='inner')

# compute the OD matrix for the period
od_matrix = od_migration.get_od_matrix(from_date, to_date)

# export it as CSV with a header row from a single partition, replacing any previous export
csv_writer = (
    od_matrix.coalesce(1)
    .write.format("com.databricks.spark.csv")
    .mode('overwrite')
    .option("header", "true")
)
csv_writer.save(od_migration.out_loc)

In [0]:
# read the exported CSV OD matrix file and convert it to net movement dataframe
# The part file written by Spark carries a run-specific id in its name, and the
# export above overwrites the folder on every run, so the file is looked up in
# the export folder instead of hard-coding the name produced by one past run.
export_dir = '/dbfs/' + output_location
part_files = sorted(
    name for name in os.listdir(export_dir)
    if name.startswith('part-') and name.endswith('.csv')
)
if len(part_files) != 1:
    raise FileNotFoundError(
        'expected exactly one part-*.csv file in {}, found {}'.format(export_dir, len(part_files))
    )
file_location = export_dir + '/' + part_files[0]
net_df = get_net_migration_from_od_matrix(file_location, 'origin', 'dest', 'count(1)', admin)

# just take West Bengal figures: rows whose adm2 code belongs to L1_CODE 19 in the boundaries layer
# (.copy() so the new columns are added to an independent frame rather than to a
# slice of net_df, which avoids pandas' SettingWithCopyWarning)
wb_adm2_codes = gdf[gdf['L1_CODE'] == 19]['L2_CODE'].drop_duplicates()
wb = net_df[net_df['adm2'].isin(wb_adm2_codes)].copy()

# net movement as a percentage of the total
wb['net_mvmt_pct'] = wb['net_mvmt'] / wb['total'] * 100

# attach the adm2 name by looking up each code in the boundaries layer
wb['adm2_name'] = wb['adm2'].map(dict(zip(gdf['L2_CODE'], gdf['L2_NAME'])))

## Export

In [0]:
# write the West Bengal net movement table to CSV
# NOTE(review): the path looks like a placeholder - replace it with the real export location
export_path = 'path_to_export.csv'
wb.to_csv(export_path)