## 1_download_VIDA_S2grid_datasets
### This notebook downloads the VIDA building-footprint S2 partitions that cover the given polygons, in Parquet format

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings
#### Below is a configuration template: prepare it outside of this notebook, then copy & paste the whole content between the triple quotes into the next cell's input field
    """
    {
    "COS_ENDPOINT_URL": "https://s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "COUNTRY_NAME": "India",
    "VIDA_PARQUET_BUCKET": "parquets",
    "UTILS_BUCKET": "xxx"
    }
    """


In [1]:
# Read notebook configuration
import getpass
import json

# The config holds the COS API key, so it is read through a hidden prompt
# instead of being stored in the notebook.
config_str = getpass.getpass('Enter your prepared config: ')

# Tolerate a paste that still carries the template's surrounding triple quotes:
# the JSON object itself starts with '{' and ends with '}', so stripping quote
# characters from both ends cannot damage it.
config = json.loads(config_str.strip().strip('"\'').strip())

# Warn early about keys that later cells read unconditionally.
_missing_keys = [key for key in ("COS_ENDPOINT_URL", "COS_APIKEY", "UTILS_BUCKET") if key not in config]
if _missing_keys:
    print(f'Warning: config is missing keys used by this notebook: {_missing_keys}')

In [2]:
# !pip install s2cell

In [16]:
# Import necessary libraries
import requests
import os
from botocore.client import Config
import ibm_boto3
from botocore import UNSIGNED
from botocore.handlers import disable_signing
import boto3
import s2cell
import geopandas as gpd
import pandas as pd
import shapely
from tqdm import tqdm

from pyproj import Geod
import plotly.express as px
import json
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import time
import io
import pickle

In [4]:
# Country name -> three-letter ISO code; extend this mapping when another country is needed
country_mapper = dict(Kenya='KEN', India='IND')

# Name of the pickle file written once all the selected S2 partitions are uploaded.
# It stores the selected region names together with their S2 grid IDs.
regions_pickle = 'regions_S2_ids.pkl'

In [6]:
# Initialise the IBM COS client and copy the helper files this notebook needs
# from the utils bucket into the local working directory.

# The S3 client expects a full URL; add the scheme when the config holds only a host name.
cos_endpoint_url = config["COS_ENDPOINT_URL"]
if not cos_endpoint_url.startswith(("http://", "https://")):
    cos_endpoint_url = "https://" + cos_endpoint_url

cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              # honour the configured IAM endpoint, falling back to the public one
                              ibm_auth_endpoint=config.get("COS_AUTH_ENDPOINT_URL", "https://iam.cloud.ibm.com/oidc/token"),
                              config=Config(signature_version='oauth'),
                              endpoint_url=cos_endpoint_url)

utils_to_download = ['india_state.geojson']

# Fetch each file directly by key: an unpaginated bucket listing would miss the
# file as soon as the bucket holds more objects than one listing page returns.
for name in utils_to_download:
    try:
        streaming_body_1 = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=name)['Body']
        print("Copying to localStorage :  " + name)
        # Read the payload before opening the target so a failed read leaves no empty file behind.
        payload = streaming_body_1.read()
        with open(name, 'wb') as file:
            file.write(payload)

    except Exception as e:
        print('Error occured: ', e)

Copying to localStorage :  india_state.geojson


In [47]:
def download_country_parquet_by_S2_Grid(country:str, directory:str, S2_grid: str, target_bucket=None, remove_after_upload:bool=False):
    '''
        Download one VIDA geoparquet S2 partition from data.source.coop and,
        optionally, upload it to a COS bucket.

        Input arguments:
            1. country -> country name, can be Kenya or India; if your desired country isn't present in this function,
            just add a new "Country name":"Country ISO CODE" pair to the 'country_mapper' dictionary below
            2. directory -> target directory where the parquet will be saved (created when missing)
            3. S2_grid -> S2 grid ID of the partition; it is inserted as-is into the download URL and the file name
            4. target_bucket -> (optional) if a string is given, the downloaded parquet is uploaded to that bucket
            through the notebook-level `cos_client`
            5. remove_after_upload -> (optional) if True, the local file is deleted after a successful upload

        Returns:
            'OK' when the partition was downloaded and saved locally, otherwise the S2_grid value
            that failed, so the caller can collect failed IDs. An upload failure is only printed
            and does not change the return value.

        Raises:
            KeyError when `country` is not present in the 'country_mapper' dictionary.
    '''

    # countries ISO mapper - Add new countries if needed
    country_mapper = {
        'Kenya': 'KEN',
        'India': "IND"
    }
    if country not in country_mapper:
        raise KeyError(f'Unknown country "{country}", add it to the country_mapper dictionary')
    country_iso = country_mapper[country]

    # make sure the target directory exists before spending time on the download
    if os.path.isdir(directory):
        print(f'\033[92mDirectory: "{directory}" exists')
    else:
        print('\033[93mTarget directory not exists, creating...')
        try:
            os.makedirs(directory, exist_ok=True)
            print(f'\033[92mDirectory "{directory}" successfully created')
        except Exception as e:
            print(f"\033[91mError occurred while creating directory {directory} \n Error: {str(e)}")
            # without a directory the file cannot be saved, so report this partition as failed
            return S2_grid

    # assemble final url
    url = f'https://data.source.coop/vida/google-microsoft-open-buildings/geoparquet/by_country_s2/country_iso={country_iso}/{S2_grid}.parquet'

    # report the file size (informational only, a failure here does not stop the download)
    try:
        response = requests.head(url, allow_redirects=True)
        size = response.headers.get('content-length')
        if size is None:
            print('FILE SIZE: unknown')
        else:
            # size in megabytes
            print('FILE SIZE: {:.2f} MB'.format(int(size) / float(1 << 20)))
    except Exception as e:
        print(f'Headers retrieval error {e}')

    # download the file, retrying once after a short pause when the request itself raises
    response = None
    for attempt in (1, 2):
        try:
            response = requests.get(url)
            break
        except Exception as e:
            if attempt == 1:
                print(f'\033[91mDownload error {e}')
                print('\033[93mWait and try again to download..')
                time.sleep(5)
            else:
                print(f'\033[91mSecond attempt download error {e}')
                return S2_grid

    if response.status_code != 200:
        print("\033[91mFailed to download the file.")
        return S2_grid

    filename = f"{country}_{S2_grid}.parquet"
    file_path = os.path.join(directory, filename)
    with open(file_path, "wb") as file:
        file.write(response.content)
    print(f"\033[92mFile: {filename} downloaded successfully!")

    if isinstance(target_bucket, str):
        try:
            cos_client.upload_file(
                Filename=file_path,
                Bucket=target_bucket,
                Key=filename,
                ExtraArgs={'ContentDisposition': 'attachment'}
            )

            # the local copy is removed only once the upload went through
            if remove_after_upload:
                try:
                    os.remove(file_path)
                except Exception:
                    print(f"\033[91mFailed to remove {file_path}")

            print(f'\033[92mFile {filename} successfully uploaded to the COS {target_bucket} bucket')
        except Exception as e:
            print(f"\033[91mFailed upload file to the bucket {target_bucket}. Error: {e}")

    return 'OK'
    
   

In [48]:
# List every S2-grid partition key published for India in the public 'vida' bucket on data.source.coop.
SENTINEL2_BUCKET_NAME = 'vida'  # variable name kept as-is; the value is the VIDA bucket name
partial_tile_name = 'google-microsoft-open-buildings/geoparquet/by_country_s2/country_iso=IND'

# Start from an empty listing so a failed request cannot leave an undefined
# (fresh kernel) or stale (re-run) `bucket_objects` for the following cells.
bucket_objects = []

try:
    S3client = boto3.client('s3', region_name='us-west-2', config=Config(signature_version=UNSIGNED), endpoint_url='https://data.source.coop')

    # One list_objects_v2 call returns at most 1000 keys, so walk all pages.
    listed_keys = []
    paginator = S3client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=SENTINEL2_BUCKET_NAME, Prefix=partial_tile_name):
        # 'Contents' is absent from a page that holds no objects
        listed_keys.extend(obj['Key'] for obj in page.get('Contents', []))

    # publish the result only when the whole listing succeeded
    bucket_objects = listed_keys

except Exception as e:
    print(f'S2 grid partition obtaining error occurred: {e}')

In [49]:
# obtain coordinates of S2 points

# Keep only keys shaped like ".../<S2 cell id>.parquet"; any other object under
# the prefix is skipped instead of crashing the integer conversion.
S2_ids = []
for object_key in bucket_objects:
    stem, extension = os.path.splitext(object_key.split('/')[-1])
    if extension == '.parquet' and stem.isdigit():
        S2_ids.append(int(stem))

# S2 cell id -> coordinates returned by s2cell for that cell
S2_points = {cell_id: s2cell.cell_id_to_lat_lon(cell_id) for cell_id in S2_ids}


In [50]:
# State boundaries read from the local 'india_state.geojson' file
india_states_df = gpd.read_file('india_state.geojson')


def _state_geometry(state_name):
    """Return the geometry of the first row whose NAME_1 equals `state_name`."""
    matching_rows = india_states_df[india_states_df.NAME_1 == state_name]
    return matching_rows.geometry.iloc[0]


# Region name -> list of polygons that make up the region.
# For Tamil Nadu, Kerala and Assam only the last part of the multi-part geometry is used.
regions_polygons = {
    'Madhya Pradesh': [_state_geometry('Madhya Pradesh')],
    'South-India': [
        _state_geometry('Tamil Nadu').geoms[-1],
        _state_geometry('Kerala').geoms[-1],
    ],
    'East-India': [
        _state_geometry('Jharkhand'),
        _state_geometry('Nagaland'),
        _state_geometry('Mizoram'),
        _state_geometry('Assam').geoms[-1],
    ],
}

regions_polygons

{'Madhya Pradesh': [<POLYGON ((78.365 26.869, 78.367 26.863, 78.37 26.858, 78.375 26.847, 78.381...>],
 'South-India': [<POLYGON ((80.076 13.527, 80.076 13.526, 80.079 13.529, 80.087 13.527, 80.08...>,
  <POLYGON ((74.996 12.788, 75 12.783, 75.004 12.786, 75.005 12.785, 75.004 12...>],
 'East-India': [<POLYGON ((87.6 25.315, 87.607 25.311, 87.614 25.316, 87.623 25.311, 87.626 ...>,
  <POLYGON ((95.214 26.937, 95.217 26.934, 95.226 26.934, 95.229 26.931, 95.23...>,
  <POLYGON ((92.801 24.419, 92.804 24.419, 92.807 24.42, 92.809 24.419, 92.809...>,
  <POLYGON ((95.952 27.942, 95.952 27.939, 95.952 27.937, 95.958 27.937, 95.95...>]}

In [51]:
# One row per (region, polygon) pair for the original outlines
default_boundaries = [[region, polygon] for region, polygons in regions_polygons.items() for polygon in polygons]

df_1 = gpd.GeoDataFrame(default_boundaries, columns=['region', 'geometry'])
df_1['boundary_type'] = 'default boundary'

# Expanded copy of the outlines, used to also catch S2 points lying just outside a region
df_2 = df_1.copy()
scale_ratio = 1 # adjust this parameter to cover all necessary S2 points in below map
df_2['geometry'] = df_2['geometry'].apply(lambda x: x.buffer(scale_ratio, join_style=0))

df_2['boundary_type'] = 'expanded boundary'


# Build every S2 point geometry once (x = second coordinate, y = first coordinate)
# instead of rebuilding it for each polygon.
S2_point_geometries = {
    S2_id: shapely.Point(coords[1], coords[0]) for S2_id, coords in S2_points.items()
}

# filter all S2 ids within expanded polygons
points_within_polygons = {}

for row in df_2.itertuples():
    for S2_id, S2_point in S2_point_geometries.items():
        if row.geometry.contains(S2_point):
            points_within_polygons[S2_id] = S2_point

print(f'S2 point amount {len(points_within_polygons)}')


# save for further visualization
df_2.to_file('expanded_boundaries.json', driver='GeoJSON')


result_df = pd.concat([
    df_1,
    df_2,
    pd.DataFrame([[i, 'point'] for i in points_within_polygons.values()], columns=['geometry', 'boundary_type'])
])

# `convert_dtype` is deprecated in pandas and True is its default, so it is simply omitted.
result_df['longitude'] = result_df['geometry'].apply(lambda g: g.centroid.x)
result_df['latitude'] = result_df['geometry'].apply(lambda g: g.centroid.y)


the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.



S2 point amount 75



the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.


the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.



In [52]:
# for visual control of expanded states polygons

# Read the expanded boundaries back as plain GeoJSON; the context manager
# closes the file handle once it is parsed.
with open('expanded_boundaries.json') as geojson_file:
    geojson_2 = json.load(geojson_file)

# S2 points that fall inside the expanded polygons
fig = px.scatter_mapbox(result_df[result_df.boundary_type == 'point'], lat="latitude", lon="longitude", hover_name="boundary_type", hover_data=["boundary_type"],
                        zoom=3, height=800, mapbox_style='open-street-map')

fig.update_traces(marker={"size": 10, "color": "#34eb98", "symbol": "circle"})

# Draw the expanded polygons as a filled layer underneath the points
fig.update_layout(mapbox_layers=[{
            "name": "expanded",
            "below": 'traces',
            "sourcetype": "geojson",
            "type": "fill",
            "color": "blue",
            "source": geojson_2
        },
    ])

fig.show()

In [None]:
# Download (and upload to COS) every partition whose S2 point lies inside an expanded polygon
S2_target_ids = list(points_within_polygons.keys())

# S2 ids whose partition could not be downloaded
download_errors = []

for i in tqdm(S2_target_ids, total=len(S2_target_ids), desc='Downloading partitions'):

    state = download_country_parquet_by_S2_Grid('India', 'parquets', int(i), target_bucket='vida-s2-partitions', remove_after_upload=True)

    # The downloader hands back the S2 id only when it failed; a run without
    # failure yields None (or 'OK'), which must not be recorded as an error.
    if state is not None and state != 'OK':
        download_errors.append(state)
    print('_'*60)

In [57]:
# Map every region name to the S2 ids whose point lies inside one of the
# region's expanded polygons, then persist the mapping locally and in COS.

# Build each S2 point geometry once (x = second coordinate, y = first coordinate)
region_cell_points = {
    S2_id: shapely.Point(coords[1], coords[0]) for S2_id, coords in S2_points.items()
}

regions_S2_ids = {}
for row in df_2.itertuples():
    region_ids = regions_S2_ids.setdefault(row.region, [])

    for S2_id, cell_point in region_cell_points.items():
        # the expanded polygons of one region can overlap, so an id is stored only once per region
        if row.geometry.contains(cell_point) and S2_id not in region_ids:
            region_ids.append(S2_id)

with open(regions_pickle, 'wb') as f:
    pickle.dump(regions_S2_ids, f)


# upload to bucket
try:
    cos_client.upload_file(Filename=regions_pickle, Bucket=config["UTILS_BUCKET"], Key=regions_pickle)
except Exception as e:
    # report the actual error type rather than the base Exception class
    print(f'{type(e).__name__}: {e}')
else:
    print('Pickle file with S2 regions succesfully uploaded')

Pickle file with S2 regions succesfully uploaded
