# Spark notebook for computing monthly movement matrices

### Installing and importing the Python/PySpark libraries

In [0]:
# install and import all the required Python/PySpark libraries
# Note that geospark must be installed here itself before registering Scala libraries below

!pip install geopandas
!pip install geospark
!pip install plotly
!pip install pymobility
!pip install xlrd
!pip install openpyxl

import os
import time
import calendar
from datetime import datetime, date
import pandas as pd
import geopandas as gp
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from pyspark.sql import functions as F

from mobility.odm.odm import OriginDestinationMigration
from mobility.odm.utils import get_net_migration_from_od_matrix, extract

### Importing the geospark libraries

In [0]:
%scala

// import the necessary libraries, packages for geospark:
// JTS geometry types, the GeoSpark shapefile reader, SpatialRDD, and the GeoSparkSQL adapter/registrator

import com.vividsolutions.jts.geom.{Coordinate, Geometry, GeometryFactory}
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
import org.datasyslab.geospark.spatialRDD.SpatialRDD
import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
// register GeoSparkSQL on this notebook's sqlContext
// (presumably makes the GeoSpark SQL functions/types available to later Spark queries -- confirm against the GeoSpark docs)
GeoSparkSQLRegistrator.registerAll(sqlContext)

// enable delta cache for performance optimization
spark.conf.set("spark.databricks.io.cache.enabled", "true")

### Defining the variables

In [0]:
# variables based on admin level, locations, and dates
# These are notebook-wide settings; later cells read them directly.
admin = "adm2" # admin level to which we are calculating OD matrix; NOTE: the concatenation cell further down reassigns this to 'adm1'
from_date = "2020/01/01"  # start of the date window passed to extract() and get_od_matrices()
to_date = "2020/12/31"  # end of the date window passed to extract() and get_od_matrices()
veraset_ind = '/mnt/CUBEIQ/esapv/India/delta_veraset_v3'  # second argument of OriginDestinationMigration; presumably the Veraset input table for India -- confirm
output_location = '/mnt/CUBEIQ/esapv/India/West Bengal/Monthly' # location where the OD matrix and net migration file are exported
# NOTE(review): later cells read from '/dbfs' + output_location + '/Monthly/', i.e. a 'Monthly' folder
# nested under a path that already ends in 'Monthly' -- confirm this nesting is intended.

india_gpkg_file_location = '/dbfs/mnt/CUBEIQ/esapv/India/India_Administrative_Boundaries.gpkg'  # GeoPackage read with geopandas below
gpkg_admin = 'Admin3'  # layer name passed to gp.read_file
# NOTE(review): the layer is 'Admin3' while admin is "adm2" and only L1_CODE/L2_CODE are used later -- confirm the layer choice.

In [0]:
# Load the administrative-boundaries layer named by gpkg_admin from the GeoPackage;
# a later cell reads its 'L2_CODE' and 'L1_CODE' columns to build adm_map.
gdf = gp.read_file(india_gpkg_file_location, layer=gpkg_admin)

### Monthly OD matrices computation

In [0]:
# create instance of OriginDestinationMigration and get spark OD matrices
# (admin + '_code' selects the admin-level code column; record_filter=6 is passed through as-is --
#  NOTE(review): its exact meaning is defined by the mobility library, confirm there)
od_migration = OriginDestinationMigration(admin + '_code', veraset_ind, output_location, record_filter=6, spark=spark)

# narrow the records with the project helper extract(), using the from_date/to_date window
od_migration.sjr = extract(od_migration.sjr, from_date, to_date)

# distinct devices that have at least one record with adm1_code == '19'
# (19 is also the adm1 code used for the West Bengal comparison at the end of the notebook)
relevant_devices = od_migration.sjr.filter(F.col('adm1_code') == '19').select('device_id').distinct()

# keep all records of those devices, including their records outside adm1_code '19'
od_migration.sjr = od_migration.sjr.join(relevant_devices, on='device_id', how='inner')

# NOTE(review): presumably writes one folder per month under output_location + '/Monthly/',
# which is where the following cells look for part-*.csv files -- confirm against the library
od_migration.get_od_matrices(from_date, to_date, period='monthly')

### Monthly net movement figures computation

In [0]:
# Lookup from L2_CODE to its L1_CODE, built from the boundaries layer; used below to roll
# OD origins/destinations up to state level. If an L2_CODE appears in several rows, the last row wins.
adm_map = dict(gdf[['L2_CODE', 'L1_CODE']].values)

In [0]:
# For every period folder: roll the OD matrix up to state level, save it, then derive and
# save the state-level net movement figures from that saved matrix.
monthly_root = '/dbfs' + output_location + '/Monthly/'

for period in sorted(os.listdir(monthly_root)):
  period_dir = monthly_root + period

  # the OD matrix is the first listed file named part-*.csv (presumably Spark's CSV output)
  part_files = [
    fname for fname in os.listdir(period_dir)
    if fname.startswith('part-') and fname.endswith('.csv')
  ]
  od_filename = part_files[0]

  # attach the parent (state) code of each origin and destination via adm_map
  od_df = pd.read_csv(period_dir + '/' + od_filename).assign(
    origin_state=lambda frame: frame['origin'].map(adm_map),
    dest_state=lambda frame: frame['dest'].map(adm_map),
  )

  # sum the pair counts per (origin_state, dest_state) and persist the state-level matrix
  state_od_path = period_dir + '/state_od_matrix.csv'
  state_counts = od_df.groupby(['origin_state', 'dest_state'])['count(1)'].sum()
  state_od_df = state_counts.reset_index()
  state_od_df.to_csv(state_od_path)

  # the project helper reads the matrix back from disk and returns the net movement frame
  state_net_df = get_net_migration_from_od_matrix(
    state_od_path, 'origin_state', 'dest_state', 'count(1)', 'adm1'
  )
  state_net_df.to_csv(period_dir + '/state_net_migration.csv', index=False)

### Concatenate all the monthly net movement figures into a master dataframe

In [0]:
# Build three wide "master" dataframes -- net movement, net movement percentage, and total
# number of samples -- each with one row per adm1 code and one column per period folder.
#
# Every period's state_net_migration.csv must provide the columns 'adm1', 'net_mvmt' and 'total'.
# The row set is fixed by the first period read: codes that only appear in later periods are
# not added, and codes missing from a later period get NaN for that period.
#
# NOTE: this reassigns the notebook-wide `admin` (set to "adm2" in the configuration cell);
# the long-format cell below relies on it being 'adm1', so re-run the configuration cell
# before re-running the OD matrix computation.
admin = 'adm1'

monthly_dir = '/dbfs' + output_location + '/Monthly/'
metric_columns = ['net_mvmt', 'net_mvmt_pct', 'total']
masters = {metric: None for metric in metric_columns}

for period in sorted(os.listdir(monthly_dir)):

  df = pd.read_csv(monthly_dir + '{}/state_net_migration.csv'.format(period))

  # net movement as a share of the total samples (a zero total yields inf/NaN, not an error)
  df['net_mvmt_pct'] = df['net_mvmt'] / df['total']

  for metric in metric_columns:
    if masters[metric] is None:
      # first period: keep everything except the two other metrics, and name the metric
      # column after the period
      other_metrics = [column for column in metric_columns if column != metric]
      masters[metric] = df.drop(columns=other_metrics).rename(columns={metric: period})
    else:
      # later periods: align on this master's own admin column, not on another master's
      period_values = dict(zip(df[admin], df[metric]))
      masters[metric][period] = masters[metric][admin].map(period_values)

# these stay None when there are no period folders, as before
master_df_nm = masters['net_mvmt']
master_df_nmp = masters['net_mvmt_pct']
master_df_samples = masters['total']

### Convert to long format

In [0]:
# Reshape each wide master dataframe (one column per period) into long format
# (one row per period/admin pair).
def _wide_to_long(master_df, value_name):
  """Return `master_df` unstacked into three columns: 'Week', the admin code, `value_name`.

  The 'Week' column holds the former column labels of `master_df`, i.e. the period names.
  """
  long_df = master_df.set_index([admin]).unstack().reset_index()
  long_df.columns = ['Week', admin, value_name]
  return long_df


long_df_nm = _wide_to_long(master_df_nm, 'net_mvmt')
long_df_nmp = _wide_to_long(master_df_nmp, 'net_mvmt_pct')
long_df_samples = _wide_to_long(master_df_samples, 'total')

### Compare monthly movement figures with CMIE figures

In [0]:
# Load the "West Bengal" sheet of the CMIE return-migration workbook, drop columns that are
# entirely empty, and name the unlabeled first column 'Date'.
cmie = (
    pd.read_excel(
        '/dbfs/FileStore/tables/return_migration_2021_06_09.xlsx',
        sheet_name="West Bengal",
        skiprows=1,
        engine='openpyxl',
    )
    .dropna(how='all', axis=1)
    .rename(columns={'Unnamed: 0': 'Date'})
)

cmie['Date'] = pd.to_datetime(cmie['Date'])

# Keep 2020 only. The explicit copy makes the column assignment below operate on an
# independent frame instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
cmie = cmie[cmie['Date'].dt.year == 2020].copy()

# Scale by 100 (presumably fractions -> percentages, matching the "%" axis labels used later).
pct_columns = ['All', 'Rural', 'Urban']
cmie[pct_columns] = 100 * cmie[pct_columns]

In [0]:
# Assemble the West Bengal series: one row per period with net movement, net movement
# percentage and total samples, then draw a quick overlay against the CMIE 'All' series.
wb_adm1_code = 19  # adm1 code selected for West Bengal in all three long frames

wb_nm = long_df_nm[long_df_nm['adm1'] == wb_adm1_code]
wb_nmp = long_df_nmp[long_df_nmp['adm1'] == wb_adm1_code]
wb_samples = long_df_samples[long_df_samples['adm1'] == wb_adm1_code]

# Merging on 'Week' leaves one adm1 column per input ('adm1_x', 'adm1_y', 'adm1');
# all of them hold the same code, so they are dropped.
wb = (
  wb_nm
  .merge(wb_nmp, on='Week')
  .merge(wb_samples, on='Week')
  .drop(['adm1_x', 'adm1_y', 'adm1'], axis=1)
)

wb['net_mvmt_pct'] = 100 * wb['net_mvmt_pct']
# period labels are expected in the form YYYY_MM_DD
wb['Week'] = pd.to_datetime(wb['Week'], format='%Y_%m_%d')

# `go` (plotly.graph_objects) is already imported in the first cell, so it is not re-imported here.
fig = go.Figure()
fig.add_traces([
  go.Scatter(x=wb['Week'], y=wb['net_mvmt_pct']),
  go.Scatter(x=cmie['Date'], y=cmie['All'])
])

In [0]:
# Dual-axis comparison built with plotly.graph_objects on a make_subplots figure:
# Veraset net movement percentage on the primary y-axis, CPHS returnees' percentage
# on the secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Primary axis: net movement percentage for West Bengal.
fig.add_trace(
    go.Scatter(
        opacity=0.8,
        x=wb['Week'],
        y=wb['net_mvmt_pct'],
        line={
            'color': 'rgb(0, 0, 255)'
        },
        name='Net movement percentage (%)',
    ),
    secondary_y=False,
)

# Secondary axis: CMIE/CPHS 'All' series.
# NOTE(review): the first row of the 2020 CMIE data is skipped via .iloc[1:] -- confirm why.
fig.add_trace(
    go.Scatter(
        opacity=0.8,
        x=cmie['Date'].iloc[1:],
        y=cmie['All'].iloc[1:],
        name="Returnees' Percentage (CPHS)",
    ),
    secondary_y=True,
)

# Both traces are named and shown in the legend, so no per-trace legend filtering is needed.
fig.update_layout(
    title="Movement in West Bengal",
    xaxis_title="Date",
    legend=dict(
        orientation='h',
        yanchor="bottom",
        y=1.01,
        xanchor="right",
        x=1
    ),
    width=1000,
    height=500
)

# Set y-axes titles (one per axis; these are the titles that end up on the figure)
fig.update_yaxes(title_text="Net movement percentage (%) -- <b>Veraset</b>", secondary_y=False)
fig.update_yaxes(title_text="Returnees' percentage (%) -- <b>CPHS</b>", secondary_y=True)

# fig.write_image("Charts/CMIE_WB.png", scale=15)
fig.show()