# Spark notebook for analyzing agriculture-induced movement

### Installing and importing the Python/PySpark libraries

In [0]:
# Install the third-party Python/PySpark libraries this notebook needs (they are imported right below).
# Note that geospark must be installed here, before the Scala cell below registers the GeoSpark libraries.
# NOTE(review): no versions are pinned (and pymobility is force-upgraded), so the environment
# can differ between runs — consider pinning versions for reproducibility.

!pip install geopandas
!pip install geospark
!pip install plotly
!pip install pymobility --upgrade

import os
import time
import calendar
from datetime import datetime, date
import pandas as pd
import geopandas as gp
import plotly.express as px

from pyspark.sql import functions as F

from mobility.odm.odm import OriginDestinationMigration
from mobility.odm.utils import get_net_migration_from_od_matrix, extract

### Importing the geospark libraries

In [0]:
%scala

// Bring the GeoSpark / JTS classes into scope on the Scala side of the notebook.
// NOTE(review): the geometry, shapefile-reader, SpatialRDD and Adapter imports are not
// referenced in the cells visible here — confirm whether they are still needed.

import com.vividsolutions.jts.geom.{Coordinate, Geometry, GeometryFactory}
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
import org.datasyslab.geospark.spatialRDD.SpatialRDD
import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
// Register GeoSpark's SQL extensions on the notebook's SQLContext
// (see the GeoSparkSQLRegistrator documentation for exactly what gets registered).
GeoSparkSQLRegistrator.registerAll(sqlContext)

// enable delta cache for performance optimization
spark.conf.set("spark.databricks.io.cache.enabled", "true")

## Pre-plantation

### Defining the variables

In [0]:
# --- Analysis window for the pre-plantation section (May 2019) ---
from_date, to_date = "2019/05/01", "2019/05/31"

# Admin level at which the OD matrix is calculated
admin = "adm2"

# --- Storage locations ---
# Passed to OriginDestinationMigration below
# (presumably the Unacast device-location dataset for India — confirm)
unacast_ind = '/mnt/CUBEIQ/esapv/India/delta_unacast_v4'
# Folder to which the OD matrix and net migration file are exported
output_location = '/mnt/CUBEIQ/esapv/India/West Bengal/Agricultural/Aman/Plantation/Pre'

# --- Administrative boundaries ---
# GeoPackage file and the layer name handed to gp.read_file in the next cell
india_gpkg_file_location = '/dbfs/mnt/CUBEIQ/esapv/India/India_Administrative_Boundaries.gpkg'
gpkg_admin = 'Admin3'

In [0]:
# Load the administrative-boundary layer; its L1_CODE / L2_CODE / L2_NAME columns
# are used further down to select and label the districts.
gdf = gp.read_file(
    india_gpkg_file_location,
    layer=gpkg_admin,
)

### OD matrices computation

In [0]:
# Build the OriginDestinationMigration helper for the chosen admin level.
od_migration = OriginDestinationMigration(
    admin + '_code',
    unacast_ind,
    output_location,
    record_filter=6,
    spark=spark,
)

# Narrow the helper's `sjr` DataFrame to the analysis window.
od_migration.sjr = extract(od_migration.sjr, from_date, to_date)

# Devices with at least one record in adm1_code '19'
# (presumably West Bengal, going by the output path — confirm).
in_adm1_19 = F.col('adm1_code') == '19'
relevant_devices = (
    od_migration.sjr
    .filter(in_adm1_19)
    .select('device_id')
    .distinct()
)

# Keep every record (inside or outside that state) belonging to those devices.
od_migration.sjr = od_migration.sjr.join(relevant_devices, on='device_id', how='inner')

od_matrix = od_migration.get_od_matrix(from_date, to_date)

# Export the OD matrix as a single CSV part file (with header) to the helper's output location.
(
    od_matrix
    .coalesce(1)
    .write
    .format("com.databricks.spark.csv")
    .mode('overwrite')
    .option("header", "true")
    .save(od_migration.out_loc)
)

In [0]:
# Locate the OD-matrix CSV exported by the previous cell.
# Spark names the part file with a fresh task id / UUID on every write (and the previous
# cell writes with mode('overwrite')), so the file name must be discovered rather than
# hardcoded — otherwise the notebook breaks on every re-run.
# `output_location` is already absolute, so it is concatenated onto '/dbfs'
# (os.path.join would discard the '/dbfs' prefix).
od_dir = '/dbfs' + output_location
part_files = sorted(
    name for name in os.listdir(od_dir)
    if name.startswith('part-') and name.endswith('.csv')
)
if len(part_files) != 1:
    raise FileNotFoundError(
        'Expected exactly one part-*.csv file in {!r}, found {}: {}'.format(
            od_dir, len(part_files), part_files
        )
    )
file_location = od_dir + '/' + part_files[0]

# Derive net migration per admin unit from the exported OD matrix.
net_df = get_net_migration_from_od_matrix(file_location, 'origin', 'dest', 'count(1)', admin)

# adm2 codes that belong to L1_CODE 19, and a code -> name lookup from the boundary layer.
wb_adm2_codes = gdf.loc[gdf['L1_CODE'] == 19, 'L2_CODE'].drop_duplicates()
adm2_names = dict(zip(gdf['L2_CODE'], gdf['L2_NAME']))

# Take an explicit copy so the column assignments below modify an independent frame
# instead of a slice of `net_df` (avoids pandas' SettingWithCopyWarning).
wb = net_df[net_df['adm2'].isin(wb_adm2_codes)].copy()

# Net movement expressed as a percentage of the `total` column.
wb['net_mvmt_pct'] = wb['net_mvmt'] / wb['total'] * 100

# Human-readable district name for each adm2 code.
wb['adm2_name'] = wb['adm2'].map(adm2_names)

In [0]:
# Show the districts ordered ascending by net movement percentage (most negative first).
wb_ranked = wb.sort_values(by=['net_mvmt_pct'])
display(wb_ranked)

adm2,net_mvmt,total,net_mvmt_pct,adm2_name
342,-5385,74106,-7.266618087604243,Kolkata
344,90,12326,0.7301638812266753,Paschim Medinipur
338,114,15175,0.7512355848434926,Hugli
345,133,10321,1.2886348222071504,Purba Medinipur
341,246,16205,1.5180499845726627,Haora
331,37,2421,1.5282940933498554,Dakshin Dinajpur
335,350,19632,1.78280358598207,Barddhaman
339,115,6214,1.8506598004505956,Bankura
337,952,40530,2.3488773747841107,North Twenty Four Parganas
343,407,17308,2.3515137508666517,South Twenty Four Parganas


## Post-plantation

### Defining the variables

In [0]:
# --- Analysis window for the post-plantation section (August 2019) ---
from_date, to_date = "2019/08/01", "2019/08/31"

# Admin level at which the OD matrix is calculated
admin = "adm2"

# --- Storage locations ---
# Passed to OriginDestinationMigration below
# (presumably the Unacast device-location dataset for India — confirm)
unacast_ind = '/mnt/CUBEIQ/esapv/India/delta_unacast_v4'
# Folder to which the OD matrix and net migration file are exported
output_location = '/mnt/CUBEIQ/esapv/India/West Bengal/Agricultural/Aman/Plantation/Post'

# --- Administrative boundaries ---
# GeoPackage file and layer name (same values as in the pre-plantation section)
india_gpkg_file_location = '/dbfs/mnt/CUBEIQ/esapv/India/India_Administrative_Boundaries.gpkg'
gpkg_admin = 'Admin3'

### OD matrices computation

In [0]:
# Instantiate the OriginDestinationMigration helper for the post-plantation run.
od_migration = OriginDestinationMigration(
    admin + '_code',
    unacast_ind,
    output_location,
    record_filter=6,
    spark=spark,
)

# Restrict the helper's `sjr` DataFrame to the analysis window.
od_migration.sjr = extract(od_migration.sjr, from_date, to_date)

# Collect the distinct devices that have a record in adm1_code '19'
# (presumably West Bengal, going by the output path — confirm) ...
seen_in_adm1_19 = od_migration.sjr.filter(F.col('adm1_code') == '19')
relevant_devices = seen_in_adm1_19.select('device_id').distinct()

# ... and keep all records of exactly those devices.
od_migration.sjr = od_migration.sjr.join(relevant_devices, on='device_id', how='inner')

od_matrix = od_migration.get_od_matrix(from_date, to_date)

# Write the OD matrix as one CSV part file (with header) to the helper's output location.
csv_writer = (
    od_matrix
    .coalesce(1)
    .write
    .format("com.databricks.spark.csv")
    .mode('overwrite')
    .option("header", "true")
)
csv_writer.save(od_migration.out_loc)

In [0]:
import os

# Inspect the export folder to see which files Spark wrote for the OD matrix.
export_dir = '/dbfs/' + output_location
os.listdir(export_dir)

In [0]:
# Locate the OD-matrix CSV exported by the OD computation cell above.
# Spark names the part file with a fresh task id / UUID on every write (and that cell
# writes with mode('overwrite')), so the file name must be discovered rather than
# hardcoded — otherwise the notebook breaks on every re-run.
# `output_location` is already absolute, so it is concatenated onto '/dbfs'
# (os.path.join would discard the '/dbfs' prefix).
od_dir = '/dbfs' + output_location
part_files = sorted(
    name for name in os.listdir(od_dir)
    if name.startswith('part-') and name.endswith('.csv')
)
if len(part_files) != 1:
    raise FileNotFoundError(
        'Expected exactly one part-*.csv file in {!r}, found {}: {}'.format(
            od_dir, len(part_files), part_files
        )
    )
file_location = od_dir + '/' + part_files[0]

# Derive net migration per admin unit from the exported OD matrix.
net_df = get_net_migration_from_od_matrix(file_location, 'origin', 'dest', 'count(1)', admin)

# adm2 codes that belong to L1_CODE 19, and a code -> name lookup from the boundary layer.
wb_adm2_codes = gdf.loc[gdf['L1_CODE'] == 19, 'L2_CODE'].drop_duplicates()
adm2_names = dict(zip(gdf['L2_CODE'], gdf['L2_NAME']))

# Take an explicit copy so the column assignments below modify an independent frame
# instead of a slice of `net_df` (avoids pandas' SettingWithCopyWarning).
wb = net_df[net_df['adm2'].isin(wb_adm2_codes)].copy()

# Net movement expressed as a percentage of the `total` column.
wb['net_mvmt_pct'] = wb['net_mvmt'] / wb['total'] * 100

# Human-readable district name for each adm2 code.
wb['adm2_name'] = wb['adm2'].map(adm2_names)

In [0]:
# Show the districts ordered ascending by net movement percentage (most negative first).
wb_ranked = wb.sort_values(by=['net_mvmt_pct'])
display(wb_ranked)

adm2,net_mvmt,total,net_mvmt_pct,adm2_name
335,-1261,26067,-4.837534046879196,Barddhaman
339,-314,7125,-4.407017543859649,Bankura
341,-339,15676,-2.162541464659352,Haora
345,-216,11245,-1.9208537127612275,Purba Medinipur
340,-78,4913,-1.5876246692448606,Puruliya
338,-217,15897,-1.3650374284456186,Hugli
331,-36,2671,-1.347809809060277,Dakshin Dinajpur
329,-61,5013,-1.2168362258128866,Koch Bihar
337,-460,42132,-1.0918067027437577,North Twenty Four Parganas
344,-91,12686,-0.7173261863471544,Paschim Medinipur
