# Introduction
This notebook uses Google Open Building dataset for Indonesia in order to showcase the following data science skills:
1. to extract large geospatial datasets from Google Open Buildings
2. to transform datasets 
3. to load datasets for the further analysis

# Setup

In [1]:
!pip install geopandas
!pip install matplotlib



In [3]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import random
import gzip
from tqdm import tqdm
from shapely import wkt

# Data

Google Open Building dataset can be found and downloaded at: https://sites.research.google/open-buildings/

For a easier data access, I chose to download region-specific datasets, upload them on a personal Google Drive, then access them by mounting the Drive. 

### (Optional) Mounting Google Drive for data access

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

# Extract the Data

In [None]:
all_files = [['/content/gdrive/MyDrive/DataDive/indonesia1.csv.gz',
              '/content/gdrive/MyDrive/DataDive/indonesia2.csv.gz',
              '/content/gdrive/MyDrive/DataDive/indonesia3.csv.gz',
              '/content/gdrive/MyDrive/DataDive/indonesia4.csv.gz']]

### Sampling

As some Open Building datasets are too large to process on notebooks, I decided to randomly sample rows. The code below randomly sample rows from gzipped CSV files and reads them as a dataframe. 

In [None]:
#read 10000 random rows
def sample_rows_from_csv_gz(filename: str, n: int = 10000) -> pd.DataFrame:
    """
    Randomly sample n rows from a gzipped CSV file.

    Parameters:
    - filename (str): Path to the gzipped CSV file.
    - n (int): Number of rows to sample. Default is 10,000.

    Returns:
    - pd.DataFrame: A dataframe containing the sampled rows.
    """

    # Determine the total number of rows in the file
    with gzip.open(filename, "rt") as f:
        total_rows = sum(1 for row in tqdm(f, desc="Counting rows"))

    # If there are fewer than n rows, return all rows
    if total_rows <= n:
        return pd.read_csv(filename, compression='gzip')

    # Calculate the number of rows to skip
    skip_rows = total_rows - n
    # Randomly select rows to skip
    skipped_rows = random.sample(range(1, total_rows), skip_rows)

    # Read the CSV file with progress updates and concatenate the chunks
    chunks = pd.read_csv(filename, compression='gzip', skiprows=skipped_rows, iterator=True, chunksize=1000)
    df = pd.concat(tqdm(chunks, total=n//1000, desc="Loading data"), ignore_index=True)

    return df

Using the function above, we iterate through the files to randomly sample 10,000 rows from each gzipped CSV files, then append them into one dataframe for a country. 

In [None]:
if __name__ == "__main__":
    # Declare dataframe
    df = pd.DataFrame()
    # Get list of files from the local files
    all_files = ['/content/gdrive/MyDrive/DataDive/indonesia1.csv.gz',
                 '/content/gdrive/MyDrive/DataDive/indonesia2.csv.gz',
                 '/content/gdrive/MyDrive/DataDive/indonesia3.csv.gz',
                 '/content/gdrive/MyDrive/DataDive/indonesia4.csv.gz']
    # Iterate through the files and append to current dataframe
    for file in all_files:
      df = df.append(sample_rows_from_csv_gz(file))
    print(df)

### Confidence Level 

As the Open Buildings data is subject to both omission and commission errors, it is recommended to choose the confidence score threshold at which buildings are filtered out. 

In [None]:
# Filter data to include confidence level of 85% or above
high_confidence = 0.85 #Can be adjusted
df = df[df['confidence'] >= high_confidence

### Visualization 

The visualization below shows the geospatial distribution of the dataframe, df. 

In [None]:
df['geometry'] = df['geometry'].apply(wkt.loads)
df = data.set_geometry('geometry')
df.boundary.head()
    
#Plot all Google Open Buildings Coverage
fig, ax = plt.subplots(figsize=(10, 10))
df[:100000].boundary.plot(ax=ax, color='blue')
ax.set_title("Indonesia Building Footprints")
plt.show()

# Transform the Data

### Reverse Geocoding

Nominatim API is used to convert geographic coordinates into a human-readable address and associate building footprint data with specific geographic locations and building attributes such as type and class. More information on the Nominatim API can be found here: https://nominatim.org/release-docs/latest/api/Overview/ 

In [None]:
chosen_country = 'Indonesia' #Can be adjusted

# Define a function to get an address from given latitude and longitude using the Nominatim API.
def get_address_from_coords(lat, lon):
    # Base URL for the Nominatim reverse geocoding API
    base_url = "https://nominatim.openstreetmap.org/reverse"
    # Parameters to be passed with the API request
    params = {
        "format": "json",
        "lat": lat,
        "lon": lon
    }
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json() # Parse the JSON response to get the address data
        return data
    except requests.RequestException as e:
        # Handle any exceptions related to the request, such as timeouts, connectivity issues, etc.
        print(f"Error fetching data for lat={lat}, lon={lon}. Error: {e}")
        return None

In [None]:
# Entry point for the script execution
if __name__ == "__main__":
    # Display the number of buildings that we are going to fetch data for
    print(f"Finding administrative divisions for {len(df)} buildings \n\n")

    # Loop through each building in the dataframe
    for index, row in df.iterrows():
        # Fetch the human-readable address for the given coordinates of the building
        address = get_address_from_coords(row.latitude, row.longitude)

        # If an address is found and it is located in the chosen country, update the dataframe with the relevant details
        if address and address.get('address', {}).get('country') == chosen_country:
            df.at[index, 'Country'] = address['address'].get('country', '')
            df.at[index, 'Province'] = address['address'].get('province', '')
            df.at[index, 'County'] = address['address'].get('county', '')
            df.at[index, 'Country_code'] = address['address'].get('country_code', '')
            df.at[index, 'State'] = address['address'].get('state', '')
            df.at[index, 'ISO3166-2-lvl4'] = address['address'].get('ISO3166-2-lvl4', '')
            df.at[index, 'Type'] = address.get('type', '')
            df.at[index, 'Class'] = address.get('class', '')


        # Print progress for each row to track the process
        print(f"Row <------> {index+1} written <------> {address}")
        2
        # Pause for 1 second (Nominatim's usage policy)
        time.sleep(1)

### Extract Building Height

# Load the Data