In [None]:
"""
Tshikana Rasehlomi
e-mail: rasehlomi@gmail.com
DS_CODE CHALLENGE
QUESTION 2: INITIAL DATA TRANSFORMATION 
10-07-2022
"""

# Load required modules
#=================================================================================================================================
import pandas as pd
import geopandas as gpd
import time
import matplotlib.pyplot as plt

# Read the .geojson and .csv input files, respectively
#====================================================================================================
HEX_POLYGONS_URL = 'https://cct-ds-code-challenge-input-data.s3.af-south-1.amazonaws.com/city-hex-polygons-8.geojson'
SERVICE_REQUESTS_URL = 'https://cct-ds-code-challenge-input-data.s3.af-south-1.amazonaws.com/sr.csv.gz'

# Hex polygon layer (GeoJSON)
#====================================================================================================
hex_gjson = gpd.read_file(HEX_POLYGONS_URL)

# Service requests table (gzip-compressed CSV)
#====================================================================================================
sr_df = pd.read_csv(SERVICE_REQUESTS_URL, compression='gzip', header=0, sep=',', quotechar='\"')

# Convert the pandas dataframe to a geopandas geodataframe with one point per row,
# built from the longitude/latitude columns.
# The CRS is given as the authority string "EPSG:4326": the older {'init': 'epsg:4326'}
# dict form is deprecated by pyproj and emits a warning on current versions.
#======================================================================================================
sr_df_gpd = gpd.GeoDataFrame(
    sr_df,
    crs="EPSG:4326",
    geometry=gpd.points_from_xy(sr_df.longitude, sr_df.latitude),
)
sr_df_gpd.head()

In [None]:
# Overview map: hex polygon layer in grey with every service request drawn on top
#========================================================================
fig, ax = plt.subplots(figsize=(12, 10))

# Layers are drawn in order, so the service request points end up above the polygons
map_layers = (
    (hex_gjson, {"color": "Gray"}),
    (sr_df_gpd, {"markersize": 1}),
)
for layer, layer_style in map_layers:
    layer.plot(ax=ax, **layer_style)

ax.set_xlabel('Longitude ($^oE$)', fontsize=10)
ax.set_ylabel('Latitude ($^oS$)', fontsize=10)
plt.show()

In [None]:
# Check for missing values: anywhere in the dataframe, then in each coordinate column.
# The results are printed because a notebook cell only displays its final expression.
#====================================================================================================
print("any missing value in dataframe:", sr_df.isnull().values.any())
print("missing longitude values:", sr_df["longitude"].isnull().any())
print("missing latitude values:", sr_df["latitude"].isnull().any())

# If there are missing values, replace the latitude and longitude values with zero.
# The columns are assigned back explicitly: calling fillna(..., inplace=True) on
# `sr_df.longitude` acts on an intermediate Series and is not guaranteed to update
# the dataframe (it does not under pandas Copy-on-Write).
# NOTE(review): the point geometry of sr_df_gpd was built from these columns in an
# earlier cell, so this fill does not change any existing geometry - confirm whether
# rows without coordinates should be handled before the geodataframe is created.
#=====================================================================================================
coordinate_columns = ["longitude", "latitude"]
sr_df[coordinate_columns] = sr_df[coordinate_columns].fillna(0)

In [None]:
# Before spatially joining the two layers we need to answer the following question:
# are the layers in the same projection?
# The answer is printed because a bare expression in the middle of a cell is not displayed.
#===================================================================================================================
layers_share_crs = hex_gjson.crs == sr_df_gpd.crs
print("layers share a CRS:", layers_share_crs)  # False = No, True = Yes

# If they do NOT, re-project the service requests to the projection of the hex polygon layer
#===================================================================================================================
if not layers_share_crs:
    sr_df_gpd = sr_df_gpd.to_crs(hex_gjson.crs)

# Verify the (possibly new) CRS of the service requests
#===================================================================================================================
print(sr_df_gpd.crs)

# Verify that both layers now use the same projection (shown as the cell output)
#====================================================================================================================
sr_df_gpd.crs == hex_gjson.crs

In [None]:
# Spatially join the service requests to the H3 resolution level 8 hex polygons,
# recording how long the join takes and reporting any error it raises.
#=======================================================================================

# perf_counter is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()
try:
    # `predicate` replaces the deprecated `op` keyword of gpd.sjoin.
    # "within" matches each request point to the polygon that contains it.
    merged_service_requests = gpd.sjoin(sr_df_gpd, hex_gjson, predicate="within")
except Exception as exc:
    # Report what actually went wrong, then re-raise: continuing would only fail
    # later with a NameError because merged_service_requests was never assigned.
    print(f"spatial join failed: {type(exc).__name__}: {exc}")
    raise
else:
    print("dataframes successfully merged")
finally:
    end_time = time.perf_counter()
    print("Operation_time_to_merge_df: ",(end_time-start_time),"sec")

# Sample of rows from the merged dataframe
#=============================================================================================
merged_service_requests.head()

In [None]:
# Quick visualization of the merged dataframe
#=======================================================================================
# Create the axes in this cell. The original overlay call reused `ax` from an earlier
# cell's figure, so it never drew onto the figure shown here (and depended on that
# earlier cell having been run).
fig, ax = plt.subplots(figsize=(10, 6))

# All service requests go underneath in a neutral colour, so requests that were not
# matched to a polygon stay visible beside the joined points.
# NOTE(review): the original drew this layer after the merged layer; it is drawn first
# here so it does not hide the suburb colouring - confirm this is the intended overlay.
sr_df_gpd.plot(ax=ax, color="lightgray")

# Joined requests on top, coloured by official suburb
merged_service_requests.plot(column="official_suburb", cmap="Blues_r", ax=ax)

ax.set_xlabel('Longitude ($^oE$)', fontsize=10)
ax.set_ylabel('Latitude ($^oS$)', fontsize=10)
plt.show()

In [None]:
# Did the join add or drop any data points?
# Row count of the joined dataframe first, row count of the service request layer second.
#==============================================================================

merged_service_requests.shape[0], sr_df_gpd.shape[0]

In [None]:
# The merged dataframe has fewer rows than the service request geodataframe (sr_df_gpd).
# Plotting the layers on top of each other shows that some points fall outside the
# hex polygons, so the missing rows are not necessarily lost data.
# Visualize those data points:
#==============================================================================================
fig, ax = plt.subplots(figsize=(10, 6))

# Polygons first, then the request points in orange above them
hex_gjson.plot(ax=ax, color="Gray")
sr_df_gpd.plot(ax=ax, markersize=5, color='orange')

# `ax` is the only (and current) axes of this figure, so labels are set on it directly
ax.set_xlabel('Longitude ($^oE$)', fontsize=10)
ax.set_ylabel('Latitude ($^oS$)', fontsize=10)
plt.show()