# Spark notebook for computing weekly movement matrices

### Installing and importing the Python/PySpark libraries

In [0]:
# install and import all the required Python/PySpark libraries
# Note: geospark must be installed in this cell, before the Scala libraries are registered in the cell below

!pip install geopandas
!pip install geospark
!pip install plotly
!pip install pymobility

import os
import time
import calendar
from datetime import datetime, date
import pandas as pd
import geopandas as gp
import plotly.express as px

from pyspark.sql import functions as F

from mobility.odm.odm import OriginDestinationMigration
from mobility.odm.utils import get_net_migration_from_od_matrix

### Importing the geospark libraries

In [0]:
%scala

// import the necessary libraries, packages for geospark

import com.vividsolutions.jts.geom.{Coordinate, Geometry, GeometryFactory}
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
import org.datasyslab.geospark.spatialRDD.SpatialRDD
import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
// Register GeoSpark's SQL extensions on this notebook's sqlContext.
// NOTE(review): presumably this is what makes the ST_* spatial functions available to Spark SQL — confirm against the GeoSpark docs.
GeoSparkSQLRegistrator.registerAll(sqlContext)

// enable delta cache for performance optimization
spark.conf.set("spark.databricks.io.cache.enabled", "true")

### Defining the variables

In [0]:
# Configuration for the whole notebook: admin level, date range, and input/output locations.
# Every later cell reads these names, so change them here rather than further down.
admin = "adm2" # admin level to which we are calculating OD matrix
# date range handed to get_od_matrices, written as YYYY/MM/DD
# NOTE(review): whether to_date is inclusive depends on the mobility library — confirm
from_date = "2020/08/26"
to_date = "2020/12/31"
# input location passed to OriginDestinationMigration (no '/dbfs' prefix, unlike the pandas/geopandas paths)
veraset_ind = '/mnt/CUBEIQ/esapv/India/delta_veraset_v3'
output_location = '/mnt/CUBEIQ/esapv/India/West Bengal/Weekly' # location where the OD matrix and net migration file are exported

# administrative boundaries GeoPackage and the layer to load from it
india_gpkg_file_location = '/dbfs/mnt/CUBEIQ/esapv/India/India_Administrative_Boundaries.gpkg'
# NOTE(review): the layer is 'Admin3' while admin above is "adm2" — confirm this is the intended boundary level
gpkg_admin = 'Admin3'

In [0]:
# load the configured boundary layer from the GeoPackage into a GeoDataFrame
gdf = gp.read_file(
  filename=india_gpkg_file_location,
  layer=gpkg_admin,
)

### Weekly OD matrices computation

In [0]:
# build the OriginDestinationMigration helper for the configured admin level,
# then ask it for the weekly OD matrices over the configured date range
admin_code = '{}_code'.format(admin)
od_migration = OriginDestinationMigration(
  admin_code,
  veraset_ind,
  output_location,
  record_filter=2,
  spark=spark,
)
od_migration.get_od_matrices(from_date, to_date, period='weekly')

### Weekly net movement figures computation

In [0]:
# compute net movement figures from the OD matrix of every weekly period
# (one sub-folder per period under <output_location>/Weekly/)
weekly_dir = '/dbfs' + output_location + '/Weekly/'

for period in sorted(os.listdir(weekly_dir)):
  period_dir = weekly_dir + period
  if not os.path.isdir(period_dir):
    # stray files sitting next to the period folders are not periods
    continue

  # pick the first part-*.csv file in the period folder as the OD matrix
  # NOTE(review): if a period was written as several part files only the first one is used — confirm the writer produces a single file
  od_filename = next(
    (name for name in os.listdir(period_dir) if name.startswith('part-') and name.endswith('.csv')),
    None,
  )
  if od_filename is None:
    raise FileNotFoundError('no part-*.csv OD matrix found in ' + period_dir)
  od_path = period_dir + '/' + od_filename

  # the helper reads the CSV itself from the path, so the matrix is not loaded separately here
  net_df = get_net_migration_from_od_matrix(od_path, 'origin', 'dest', 'count(1)', admin)
  net_df.to_csv(period_dir + '/net_migration.csv', index=False)

### Concatenate all the weekly net movement figures into a master dataframe

In [0]:
# build three wide master dataframes (one row per admin unit, one column per weekly period):
# net movement, net movement percentage, and total number of samples

master_df_nm = None
master_df_nmp = None
master_df_samples = None

weekly_dir = '/dbfs' + output_location + '/Weekly/'

for period in sorted(os.listdir(weekly_dir)):
  if not os.path.isdir(weekly_dir + period):
    # stray files sitting next to the period folders are not periods
    continue

  df = pd.read_csv(weekly_dir + period + '/net_migration.csv')
  # stored as a plain ratio (not multiplied by 100)
  df['net_mvmt_pct'] = df['net_mvmt'] / df['total']

  if master_df_nm is None:
    # the first period seeds the masters: keep every other column of the file and
    # turn the metric column into a column named after the period
    master_df_nm = df.drop(columns=['total', 'net_mvmt_pct']).rename(columns={'net_mvmt': period})
    master_df_nmp = df.drop(columns=['total', 'net_mvmt']).rename(columns={'net_mvmt_pct': period})
    master_df_samples = df.drop(columns=['net_mvmt_pct', 'net_mvmt']).rename(columns={'total': period})
  else:
    # later periods are looked up by admin unit, each master through its own key column;
    # units missing from this period get NaN
    # NOTE(review): admin units that do not appear in the first period never get a row — confirm this is acceptable
    master_df_nm[period] = master_df_nm[admin].map(dict(zip(df[admin], df['net_mvmt'])))
    master_df_nmp[period] = master_df_nmp[admin].map(dict(zip(df[admin], df['net_mvmt_pct'])))
    master_df_samples[period] = master_df_samples[admin].map(dict(zip(df[admin], df['total'])))

if master_df_nm is None:
  # fail here with a clear message instead of an AttributeError in a later cell
  raise FileNotFoundError('no weekly period folders found under ' + weekly_dir)

### Wide format to long format conversion

In [0]:
# convert all the master dataframes from wide format (one column per week) to long format
def _wide_to_long(wide_df, value_name):
  """Return `wide_df` reshaped to three columns: 'Week', the admin column, and `value_name`.

  The admin column name comes from the notebook-level `admin` setting, so the
  conversion keeps working when the admin level is changed in the config cell.
  """
  long_df = wide_df.set_index([admin]).unstack().reset_index()
  # after unstack/reset_index the columns are: former column label, admin unit, value
  long_df.columns = ['Week', admin, value_name]
  return long_df


long_df_nm = _wide_to_long(master_df_nm, 'net_mvmt')
long_df_nmp = _wide_to_long(master_df_nmp, 'net_mvmt_pct')
long_df_samples = _wide_to_long(master_df_samples, 'total')

### Export

In [0]:
# export all the long-format dataframes to their respective CSV files,
# next to the weekly results under the configured output location
export_dir = '/dbfs' + output_location
long_df_nm.to_csv(export_dir + '/long_df_nm.csv', index=False)
long_df_nmp.to_csv(export_dir + '/long_df_nmp.csv', index=False)
long_df_samples.to_csv(export_dir + '/long_df_samples.csv', index=False)