# Get raw data from Google Cloud Storage

In [11]:
from google.cloud import storage
import json

In [12]:
# Open a Google Cloud Storage client and the bucket that holds the source data
storage_client = storage.Client()
bucket = storage_client.get_bucket('opengeokpi-bucket')

# Reference the departements JSON file inside the bucket
blob_departements = bucket.blob('refine/territory/departements.json')

# Fetch the blob payload first, then decode the JSON document into Python objects
raw_departements = blob_departements.download_as_string(client=None)
data_departements = json.loads(raw_departements)

# discretizer function

In [13]:
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Polygon
import numpy as np
import shapely.speedups
shapely.speedups.enable()
import descartes
import matplotlib.pyplot as plt
from geopandas.tools import sjoin

In [14]:
def discretize(geojson_to_discretize, code, length, wide):
    """Cut one territory into a regular grid and keep the cells whose centroid lies inside it.

    Parameters
    ----------
    geojson_to_discretize : dict
        GeoJSON-like mapping with a ``"features"`` list. The features must carry
        ``departements_code`` and ``departements_name`` properties. Coordinates are
        declared as EPSG:4326.
    code : str
        Value of ``departements_code`` identifying the territory to discretize.
    length : float
        Cell height, in CRS units (degrees for EPSG:4326). Must be > 0.
    wide : float
        Cell width, in CRS units (degrees for EPSG:4326). Must be > 0.

    Returns
    -------
    tuple
        ``(centroids_df_within, grid_df_within)`` where

        * ``centroids_df_within`` has columns ``ID``, ``departements_code``,
          ``departements_name`` and ``str_geom_centroid`` (WKT point), one row per
          cell centroid located within the selected territory;
        * ``grid_df_within`` has columns ``ID``, ``str_geom_grid`` (WKT polygon) and
          ``str_geom_centroid`` (WKT point) for those same cells.

    Raises
    ------
    ValueError
        If ``length`` or ``wide`` is not strictly positive, or if no feature
        matches ``code``.
    """
    # Validate the cell size up front: a zero or negative step cannot build a grid.
    if length <= 0 or wide <= 0:
        raise ValueError(
            "length and wide must be strictly positive, got length={!r}, wide={!r}".format(length, wide)
        )

    ######################### prepare data to discretize

    # convert geojson to a GeoDataFrame and declare its CRS
    gdf_data = gpd.GeoDataFrame.from_features(geojson_to_discretize["features"])
    gdf_data = gdf_data.set_crs(epsg=4326)

    # keep only the requested territory (was hard-coded to '75', ignoring `code`)
    gdf_data_subset = gdf_data[gdf_data.departements_code == code]
    if gdf_data_subset.empty:
        raise ValueError("no feature found with departements_code == {!r}".format(code))

    ######################## generate grid polygons

    # bounding box of the selected territory
    xmin, ymin, xmax, ymax = gdf_data_subset.total_bounds

    # cell origins: columns go west -> east, rows go north -> south
    cols = list(np.arange(xmin, xmax, wide))
    rows = list(np.arange(ymin, ymax, length))
    rows.reverse()

    # One rectangle per (column, row) pair; the iteration order fixes the cell IDs.
    # NOTE(review): each cell extends DOWNWARD from `y` (y -> y - length), so the grid
    # spans [ymin - length, last row value] and may leave a strip below `ymax`
    # uncovered. Left unchanged to keep existing cell IDs stable — confirm intent.
    polygons = [
        Polygon([(x, y), (x + wide, y), (x + wide, y - length), (x, y - length)])
        for x in cols
        for y in rows
    ]
    grid = gpd.GeoDataFrame({'geometry': polygons})
    grid = grid.set_crs(epsg=4326)

    # 1-based cell identifier
    grid['ID'] = grid.index + 1

    ###################### generate grid centroids

    grid_centroid = grid.copy()

    # centre point of every cell
    grid_centroid['Center_point'] = grid_centroid['geometry'].centroid

    # WKT text versions of the cell polygon and of its centroid
    grid_centroid['str_geom_grid'] = grid_centroid.geometry.apply(wkt.dumps)
    grid_centroid['str_geom_centroid'] = grid_centroid.Center_point.apply(wkt.dumps)

    grid_centroid_df = grid_centroid[['ID', 'str_geom_grid', 'str_geom_centroid']]

    ########################### select points in polygons

    # GeoDataFrame whose active geometry is the centroid point
    centroid = grid_centroid[['Center_point', 'ID']]
    centroid = gpd.GeoDataFrame(centroid, geometry='Center_point')

    # keep the centroids located within the selected territory
    # NOTE(review): recent geopandas releases rename `op` to `predicate`; switch the
    # keyword when the environment is upgraded.
    pointInPolys = sjoin(centroid, gdf_data_subset, op = 'within')

    # WKT text version of the matched centroids
    pointInPolys['str_geom_centroid'] = pointInPolys.Center_point.apply(wkt.dumps)

    centroids_df_within = pointInPolys[['ID', 'departements_code', 'departements_name', 'str_geom_centroid']]

    # keep the grid cells whose centroid was matched
    mask = grid_centroid_df['ID'].isin(centroids_df_within.ID)
    grid_df_within = grid_centroid_df[mask]

    ########################## return results
    return (centroids_df_within, grid_df_within)

# Discretize departement '75' into cells 0.001 degrees high and 0.00125 degrees wide
centroids, grid = discretize(data_departements, '75', 0.001, 0.00125)




In [15]:
# Preview the first rows of the centroids kept inside the territory
centroids.head(n=5)

Unnamed: 0,ID,departements_code,departements_name,str_geom_centroid
46,47,75,Paris,POINT (2.2247940000000006 48.8553649999999067)
47,48,75,Paris,POINT (2.2247940000000006 48.8543649999999090)
48,49,75,Paris,POINT (2.2247940000000006 48.8533649999999113)
129,130,75,Paris,POINT (2.2260439999999999 48.8593649999998902)
130,131,75,Paris,POINT (2.2260439999999999 48.8583649999999068)


# load results in bigquery

In [17]:
from google.cloud import bigquery

In [18]:
# Name of the existing BigQuery dataset that receives the results
dataset_id = 'opengeokpi'

# Connect to BigQuery (US location) and report which project the client resolved to
client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))

# Look up the existing dataset (API request); no project is given, so the
# client's own project is used
dataset = client.get_dataset(dataset_id)

Client creating using default project: test-rstudio-gcp


## load centroids in bigquery

In [20]:
# Load the centroids dataframe into the `centroids_df` table.
# WRITE_TRUNCATE replaces previous content, so re-running this cell does not
# append duplicate rows (load jobs append to an existing table by default).
table_ref = dataset.table("centroids_df")
load_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    centroids, table_ref, location="US", job_config=load_config
)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

# Build `centroids_geo`: same rows, with the WKT string parsed by ST_GEOGFROMTEXT.
# The source table name comes from the table reference rather than a hard-coded
# project id, so the cell keeps working when the default project changes.
sql = """
    SELECT
  ST_GEOGFROMTEXT(str_geom_centroid) AS geom_centroids,
  ID,
  departements_code,
  departements_name
FROM
  `{}.{}.{}` ;
""".format(table_ref.project, table_ref.dataset_id, table_ref.table_id)

# Destination table; WRITE_TRUNCATE lets the query overwrite a table left by a
# previous run instead of failing because it already exists.
table_ref = dataset.table("centroids_geo")
job_config = bigquery.QueryJobConfig(
    destination=table_ref,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)


# Start the query, passing in the extra configuration.
query_job = client.query(sql, location="US", job_config=job_config)

query_job.result()  # Waits for the query to finish
print("Query results loaded to table {}".format(table_ref.path))

Loaded dataframe to /projects/test-rstudio-gcp/datasets/opengeokpi/tables/centroids_df
Query results loaded to table /projects/test-rstudio-gcp/datasets/opengeokpi/tables/centroids_geo


## load grid in bigquery

In [22]:
# Load the grid dataframe into the `grid_df` table.
# WRITE_TRUNCATE replaces previous content, so re-running this cell does not
# append duplicate rows (load jobs append to an existing table by default).
table_ref = dataset.table("grid_df")
load_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
job = client.load_table_from_dataframe(
    grid, table_ref, location="US", job_config=load_config
)

job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))

# Build `grid_geo`: same rows, with the WKT polygon parsed by ST_GEOGFROMTEXT.
# The source table name comes from the table reference rather than a hard-coded
# project id, so the cell keeps working when the default project changes.
sql = """
    SELECT
  ST_GEOGFROMTEXT(str_geom_grid) AS geom_grid,
  ID
FROM
  `{}.{}.{}` ;
""".format(table_ref.project, table_ref.dataset_id, table_ref.table_id)

# Destination table; WRITE_TRUNCATE lets the query overwrite a table left by a
# previous run instead of failing because it already exists.
table_ref = dataset.table("grid_geo")
job_config = bigquery.QueryJobConfig(
    destination=table_ref,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)


# Start the query, passing in the extra configuration.
query_job = client.query(sql, location="US", job_config=job_config)

query_job.result()  # Waits for the query to finish
print("Query results loaded to table {}".format(table_ref.path))

Loaded dataframe to /projects/test-rstudio-gcp/datasets/opengeokpi/tables/grid_df
Query results loaded to table /projects/test-rstudio-gcp/datasets/opengeokpi/tables/grid_geo
