# Prepare vector data
## Purpose
In this notebooks we will clean and simplify the vector data that will be using for zonal stats.
## Setup
### Library import
**Setup software libraries** 

In [17]:
import pandas as pd
import geopandas as gpd

### Utils

**prepare_vector_data**

In [18]:
def prepare_vector_data(iso: str = None, tolerance: float = None, level: int = 1) -> dict[str, pd.DataFrame()]:
    if iso:
        bboxs = pd.read_csv('../../data/mbtiles/country_bbox.csv', converters={"bbox": literal_eval})
        bbox = bboxs[bboxs['gid_0'] == iso].bbox.iloc[0]
    else:
        bbox = None
        
    # Read Political boundaries:
    print('Reading Political boundaries')
    gdf_pb = gpd.read_file('../../data/mbtiles/political_boundaries/political_boundaries.shp', bbox = bbox)
    # Select level 0 areas
    gdf_pb_0 = gdf_pb[gdf_pb['level'] == 0]
    # Select level 1 areas
    gdf_pb = gdf_pb[gdf_pb['level'] == level]
    #Simplify geometries
    if tolerance:
        gdf_pb['geometry'] = gdf_pb['geometry'].apply(lambda x: x.simplify(tolerance)) 
    # Add area in ha
    gdf_pb['area_ha'] = gdf_pb['geometry'].to_crs({'init': 'epsg:6933'}).map(lambda p: p.area / 10**4)    
    gdf_pb = gdf_pb[['name_0', 'gid_0', 'name_1', 'gid_1', 'level', 'bbox', 'area_ha', 'id', 'id_0', 'geometry']]
    
    # Read Landforms
    print('Reading Landforms')
    gdf_land = gpd.read_file('../../data/mbtiles/ne_10m_geography_regions/ne_10m_geography_regions.shp', bbox = bbox)
    # Select level 1 areas
    gdf_land = gdf_land[gdf_land['level'] == level]
    # Add area in ha
    gdf_land = gdf_land.set_crs(epsg=4326, allow_override=True)
    gdf_land = gdf_land.to_crs("EPSG:4326")
    if iso == None:
        gdf_land['area_ha'] = gdf_land['geometry'].to_crs({'init': 'epsg:6933'}).map(lambda p: p.area / 10**4)   
        gdf_land = gdf_land[['featurecla', 'name', 'region', 'ne_id', 'level', 'bbox', 'area_ha', 'id', 'id_0', 'geometry']]
    else:
        gdf_land = gdf_land[['featurecla', 'name', 'region', 'ne_id', 'level', 'bbox', 'id', 'id_0', 'geometry']]
    
    # Read Biomes
    print('Reading Biomes')
    gdf_bio = gpd.read_file('../../data/mbtiles/ecoregions_by_biome/ecoregions_by_biome.shp', bbox = bbox)
    # Select level 1 areas
    gdf_bio = gdf_bio[gdf_bio['level'] == level]    
    # Add area in ha
    gdf_bio = gdf_bio.set_crs(epsg=4326, allow_override=True)
    gdf_bio = gdf_bio.to_crs("EPSG:4326")
    if iso == None:
        gdf_bio['area_ha'] = gdf_bio['geometry'].to_crs({'init': 'epsg:6933'}).map(lambda p: p.area / 10**4) 
        gdf_bio = gdf_bio[['biome_name', 'biome_num', 'eco_name', 'eco_biome_', 'eco_id', 'level', 'bbox', 'area_ha', 'id', 'id_0', 'geometry']]
    else:
        gdf_bio = gdf_bio[['biome_name', 'biome_num', 'eco_name', 'eco_biome_', 'eco_id', 'level', 'bbox', 'id', 'id_0', 'geometry']]
    
    # Read Hydrological basins
    print('Reading Hydrological basins')
    gdf_hb = gpd.read_file('../../data/mbtiles/hydrological_basins/hydrological_basins.shp', bbox = bbox)
    # Select level 1 areas
    gdf_hb = gdf_hb[gdf_hb['level'] == level]    
    #Make valid geometries
    gdf_hb['geometry'] = gdf_hb['geometry'].apply(lambda x: x.buffer(0))
    # Add area in ha
    gdf_hb = gdf_hb.set_crs(epsg=4326, allow_override=True)
    gdf_hb = gdf_hb.to_crs("EPSG:4326")
    if iso == None:
        gdf_hb['area_ha'] = gdf_hb['geometry'].to_crs({'init': 'epsg:6933'}).map(lambda p: p.area / 10**4) 
        gdf_hb = gdf_hb[['maj_bas', 'maj_name', 'maj_area', 'sub_bas', 'sub_name', 'sub_area', 'level', 'bbox', 'area_ha', 'id', 'id_0', 'geometry']]
    else:
        gdf_hb = gdf_hb[['maj_bas', 'maj_name', 'maj_area', 'sub_bas', 'sub_name', 'sub_area', 'level', 'bbox', 'id', 'id_0', 'geometry']]

    vector_data = {f'political_boundaries_{str(level)}': gdf_pb, f'landforms_{str(level)}': gdf_land, f'biomes_{str(level)}': gdf_bio, f'hydrological_basins_{str(level)}': gdf_hb}
    
    if iso:
        print('Intersecting areas with the selected country')
        gdf_pb = gdf_pb[gdf_pb['gid_0'] == iso]
            
        vector_data[f'political_boundaries_{str(level)}'] = gdf_pb
        
        gdf_pb_0 = gdf_pb_0[gdf_pb_0['gid_0'] == iso]
        country = gdf_pb_0[gdf_pb_0['level'] == 0]['geometry'].iloc[0].buffer(0)
        
        for data_name in list(vector_data.keys())[1:]:
            print(data_name)
            gdf = intersect_areas(vector_data[data_name], country)
            gdf['area_ha'] = gdf['geometry'].to_crs({'init': 'epsg:6933'}).map(lambda p: p.area / 10**4)
            vector_data[data_name] = gdf
                    
    # Set index
    for data_name in list(vector_data.keys()):
        vector_data[data_name] = vector_data[data_name].reset_index(drop=True).reset_index()
        
    # Exceptions 
    # change bboxes
    if level == 1 and iso == None:
        # Alaska
        vector_data['political_boundaries_1'].at[1707,'bbox'] = '[-179.1506, 51.2097, -125, 72.6875]'
    
   
    return vector_data

## Vector data

### Level 1 geometries
**Read data**

In [19]:
# Read vector data
print('Reading  vector data.')
vector_data = prepare_vector_data(iso=None, tolerance=0.075, level=1)

Reading  vector data.
Reading Political boundaries


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Reading Landforms


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Reading Biomes


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Reading Hydrological basins


  in_crs_string = _prepare_from_proj_string(in_crs_string)


**Save data**

In [20]:
for name in vector_data.keys():
    vector_data[name].to_file(f"../../data/processed/vector_data/{name}.geojson", driver='GeoJSON', index=False)

### Level 0 geometries
**Read data**

In [21]:
# Read vector data
print('Reading  vector data.')
vector_data = prepare_vector_data(iso=None, tolerance=0.075, level=0)

Reading  vector data.
Reading Political boundaries


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Reading Landforms


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Reading Biomes


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Reading Hydrological basins


  in_crs_string = _prepare_from_proj_string(in_crs_string)


**Save data**

In [22]:
for name in vector_data.keys():
    vector_data[name].to_file(f"../../data/processed/vector_data/{name}.geojson", driver='GeoJSON', index=False)