# Spark notebook for sample size statistics

### Installing and importing the Python/PySpark libraries

In [None]:
# Install and import all the required Python/PySpark libraries.
# Note: geospark must be installed in this cell, before the Scala libraries are registered below.

!pip install geopandas
!pip install geospark
!pip install plotly
!pip install pymobility

import time
import calendar
from datetime import datetime, date
import pandas as pd
import geopandas as gp
import plotly.express as px

from pyspark.sql import functions as F

from mobility.eda.mobility_summary import MobilitySummary
from mobility.odm.utils import extract

### Importing the geospark libraries

In [None]:
%scala

// Scala cell: bring the JTS geometry and GeoSpark classes into scope, register
// GeoSpark with the notebook's SQL context, and switch on the Databricks IO cache.
// NOTE(review): none of the imported classes other than GeoSparkSQLRegistrator are
// referenced in the cells visible here; they may be used further down the notebook.

import com.vividsolutions.jts.geom.{Coordinate, Geometry, GeometryFactory}
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
import org.datasyslab.geospark.spatialRDD.SpatialRDD
import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
// register GeoSpark's SQL extensions on the sqlContext provided by the notebook
GeoSparkSQLRegistrator.registerAll(sqlContext)

// enable the delta (IO) cache for performance optimization;
// the value is passed as the string "true", not a Boolean
spark.conf.set("spark.databricks.io.cache.enabled", "true")

### Defining the variables

In [None]:
# notebook-wide configuration constants

# path handed to MobilitySummary as its first argument in the cells below
INDIA_VERASET_DELTA_LOCATION = "/mnt/CUBEIQ/esapv/India/delta_veraset_v3"

# name of the admin layer
# NOTE(review): not referenced in the cells visible here - confirm it is still needed
ADMIN_LAYER_NAME = "Admin3"

### Records frequency computation

In [None]:
# Build the records-frequency table for West Bengal (adm1_code '19') over 2020
# and export it as a CSV file.

# instantiate the MobilitySummary object with None admin
# NOTE(review): the other cells also pass spark=spark; confirm the default is fine here
ms = MobilitySummary(INDIA_VERASET_DELTA_LOCATION, adm_col=None)

# filter for West Bengal
# the column has to be reached through ms.sjr: a bare `sjr` name is not defined
# in this notebook, so the previous `sjr.adm1_code` raised NameError
ms.sjr = ms.sjr.filter(ms.sjr.adm1_code == '19')

# extract data of just 2020
# NOTE(review): a standalone `extract` is also imported from mobility.odm.utils in the
# first cell; confirm that the method on MobilitySummary is the intended one
ms.sjr = ms.extract(ms.sjr, '2020/01/01', '2020/12/31')

# generate the frequency table
frequency_table = ms.get_frequency_map()

# convert the frequency table to pandas dataframe (collects the result to the driver)
f_df = frequency_table.toPandas()

# save to a CSV file, without the pandas index column
f_df.to_csv('/dbfs/mnt/CUBEIQ/esapv/India/West Bengal/frequency_map_2020.csv', index=False)

### Unique devices per admin computation (Annual)

In [None]:
# Annual (2020) number of unique devices per admin unit, restricted to West Bengal.

# column identifying the admin level the devices are counted at
admin = "adm2_code"

# MobilitySummary over the India Veraset data;
# a 6-records quality filter is used because the analysis window is a full year
ms = MobilitySummary(
    INDIA_VERASET_DELTA_LOCATION,
    admin,
    quality_filter=6,
    spark=spark,
)

# keep only the West Bengal rows (adm1_code '19')
west_bengal_only = ms.sjr.adm1_code == '19'
ms.sjr = ms.sjr.filter(west_bengal_only)

# unique devices per admin between the first and last day of 2020
adm_samples = ms.get_devices_per_admin("2020/01/01", "2020/12/31")

# bring the result to pandas and persist it; the CSV is the input for the charts
df = adm_samples.toPandas()
df.to_csv(
    '/dbfs/mnt/CUBEIQ/esapv/India/West Bengal/annual_unique_devices_per_admin_2020.csv',
    index=False,
)

### Unique devices per admin computation (Monthly)

In [None]:
# Monthly (January-December 2020) number of unique devices per admin unit,
# restricted to West Bengal, exported as a single CSV with a `month` column.

# admin level at which we are computing number of unique devices
admin = "adm2_code"

# create instance of MobilitySummary
# we are using 3 records filter as it is monthly analysis
ms = MobilitySummary(INDIA_VERASET_DELTA_LOCATION, admin, quality_filter=3, spark=spark)

# filtering for just West Bengal
ms.sjr = ms.sjr.filter(ms.sjr.adm1_code == '19')

# one pandas dataframe per month; they are concatenated once after the loop
# instead of re-copying a growing master dataframe on every iteration
monthly_frames = []

# looping over all the months, January to December
for i in range(1, 13):
  # monthrange returns (weekday of the 1st, number of days); only the day count is needed
  num_days = calendar.monthrange(2020, i)[1]
  from_date = f"2020/{i:02d}/01"
  to_date = f"2020/{i:02d}/{num_days}"

  # get number of unique devices per admin in the given interval range
  adm_samples = ms.get_devices_per_admin(from_date, to_date)

  # convert spark dataframe to pandas dataframe and tag it with its month number
  df = adm_samples.toPandas()
  df['month'] = i

  monthly_frames.append(df)

# pandas master dataframe holding all the months statistics
master_df = pd.concat(monthly_frames)

# export master dataframe
# NOTE(review): unlike the other exports this path has no 'West Bengal' folder - confirm intended
master_df.to_csv('/dbfs/mnt/CUBEIQ/esapv/India/monthly_unique_devices_per_admin_2020.csv', index=False)