<a href="https://colab.research.google.com/github/WRFitch/fyp/blob/main/src/fyp_data_import_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Import Pipeline
It's a pipeline that imports the data I need. 

### TODO
- Implement image splitting into 100px squares
- Import CO2 dataset
- Implement a normal coding standard for variable names, method names, etc
- Extract unnecessary methods into normal python files and import where necessary. 

## Setup
*   Import necessary libraries
*   Set up Earth Engine authentication and mount google drive  


In [1]:
import ee
import folium

from google.colab import drive
from osgeo import gdal
from PIL import Image
from pprint import pprint

In [None]:
# Authenticate with Earth Engine, then initialise the client library so the
# ee.* calls in the cells below can be used.
ee.Authenticate()
ee.Initialize()

In [None]:
# Mount Google Drive at /content/drive; drive_path (defined in the dataset
# import cell) points inside this mount.
drive.mount('/content/drive')

# Print the installed folium version for reference.
print(folium.__version__)

# Dataset import

### Import the following datasets into Google Drive

*   [Sentinel-2 Satellite photography](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR)
*   [Sentinel-5 Precursor Data](https://developers.google.com/earth-engine/datasets/catalog/sentinel)
  *   [Carbon Monoxide](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_CO)
  *   [Formaldehyde](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_HCHO)
  *   [Nitrogen Dioxide](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_NO2)
  *   [Ozone](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_O3)
  *   [Sulphur Dioxide](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_SO2)
  *   [Methane](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_CH4)
*   [ODIAC Fossil Fuel CO2 Emissions](https://db.cger.nies.go.jp/dataset/ODIAC/DL_odiac2019.html)

In [9]:
# Earth Engine username, used to import classified image into ee assets folder
USERNAME = 'wrfitch'
# Output location built from the username.
# NOTE(review): not referenced elsewhere in this notebook - confirm it is still needed.
OUTPUT_DIR = USERNAME + "/out/"

# Define image collections for each dataset to be used
# s2 is the Sentinel-2 surface reflectance collection; the s5_* collections are
# the Sentinel-5P offline level-3 products, one per gas.
s2 = ee.ImageCollection("COPERNICUS/S2_SR")
s5_CO = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_CO")
s5_HCHO = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_HCHO")
s5_NO2 = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_NO2")
s5_O3 = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_O3")
s5_SO2 = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_SO2")
s5_CH4 = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_CH4")
# TODO import CO2 dataset

# Band selected from each Sentinel-5P collection when the per-gas images are built.
CO_band = 'CO_column_number_density'
HCHO_band = 'tropospheric_HCHO_column_number_density'
NO2_band = 'tropospheric_NO2_column_number_density'
O3_band = 'O3_column_number_density'
SO2_band = 'SO2_column_number_density'
CH4_band = 'CH4_column_volume_mixing_ratio_dry_air'

# Define dataset boundaries for Britain, London and Uxbridge.
# Each polygon is a single ring of vertices; the pairs appear to be
# [longitude, latitude] in degrees - TODO confirm.
# TODO work out polygon segmentation algo - there's probably a clever algorithm for this, but I could also just iterate
#      through simple squares that fit my bandwidth and storage constraints. I run out of memory when using the gbr
#      polygon anyway, so an iterative approach is necessary.

# Used as the spatial filter when the composites are built.
great_britain = ee.Geometry.Polygon(
        [[[-1.836112801004015, 59.808076330562756],
          [-8.779472176004015, 58.82140293049428],
          [-7.988456551004015, 55.71069203454839],
          [-11.196464363504015, 54.42753859549109],
          [-11.328300301004015, 50.967746003015044],
          [-9.526542488504015, 50.77361752815123],
          [-6.274589363504015, 51.81776248652293],
          [-5.395683113504015, 51.21615275310099],
          [-6.582206551004015, 49.56332371186494],
          [-3.110526863504015, 49.904165426606255],
          [1.240059073995985, 50.80139967619036],
          [2.426582511495985, 52.33095407387208],
          [1.767402823995985, 53.4183511305661],
          [0.5369340739959849, 53.44453305344514],
          [-1.616386238504015, 56.32474216074427],
          [-0.7814253010040151, 57.805828290000164]]])

# Export region for the table exports and the single-image GeoTIFF export.
london = ee.Geometry.Polygon(
        [[[-1.0666833726431624, 51.89360084338857],
          [-0.9321008531119124, 51.38908166135181],
          [-0.18503054061191238, 51.08470683562287],
          [0.4741491468881076, 51.193274483099074],
          [0.9822668226693576, 51.60282356474035],
          [0.2269567640756076, 52.071221592742454]]])

# Smaller export region used for the tiled GeoTIFF export.
uxbridge = ee.Geometry.Polygon(
        [[[-0.6005789665912831, 51.48903144928163],
          [-0.27304905936472057, 51.49751989601588],
          [-0.27648228690378307, 51.63069551948597],
          [-0.6143118767475331, 51.63069551948597]]]);

# Could the start and end dates be shifted or focused on one area, so emissions can be monitored across the seasons?
# Would that even be useful?
# Date range passed to filterDate() for every collection.
start_date = '2020-01-01'
end_date = '2020-12-31'
# Colour ramp used for every Sentinel-5P visualisation layer.
vis_palette = ['black', 'blue', 'purple', 'cyan', 'green', 'yellow', 'red']

# Root of the mounted Google Drive; geotiffToPng joins export folder names onto it.
drive_path = "/content/drive/MyDrive/"

### Visualise Data

In [5]:
# Import datasets 
# TODO analyse whether these min/max values are valid, recalibrate for highest variance where necessary. Separate values
#      may be necessary for different samples - for example, the perfect calibration for the UK won't work on the world. 
# TODO analyse whether it makes sense to analyse these on a highly localised level

def maskS2clouds(image):
  """Mask out cloud and cirrus pixels of a Sentinel-2 image, then rescale it.

  Used as a pre-filter to remove clouds - they can be added back in as data
  points from Sentinel-5 if necessary.

  Args:
    image: image that carries a 'QA60' band.

  Returns:
    The image with flagged pixels masked and every band divided by 10000.
  """
  qa_band = image.select('QA60')

  # Bit 10 flags clouds, bit 11 flags cirrus.
  no_cloud = qa_band.bitwiseAnd(1 << 10).eq(0)
  no_cirrus = qa_band.bitwiseAnd(1 << 11).eq(0)

  # A pixel is kept only when both flags are zero, indicating clear conditions.
  clear_sky = no_cloud.And(no_cirrus)

  masked = image.updateMask(clear_sky)
  return masked.divide(10000)

def _study_period_mean(collection, band):
  """Return the mean of `band` over start_date..end_date for images touching great_britain."""
  in_period = collection.filterDate(start_date, end_date)
  over_britain = in_period.filterBounds(great_britain)
  return over_britain.select(band).mean()


def _palette_map_id(img, lower, upper):
  """Return a map id for `img` drawn with vis_palette between `lower` and `upper`."""
  return img.getMapId({'palette': vis_palette, 'min': lower, 'max': upper})


# High-resolution satellite photograph: median of the cloud-masked images that
# report less than 20% cloudy pixels.
s2_img = (
  s2.filterDate(start_date, end_date)
    .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
    .filterBounds(great_britain)
    .map(maskS2clouds)
    .median()
)
s2_id = s2_img.getMapId({'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 0.3})

# Carbon monoxide
# Minmax scale is a bit off - recalibrate for Britain
CO_img = _study_period_mean(s5_CO, CO_band)
CO_id = _palette_map_id(CO_img, 0, 0.05)

# Formaldehyde
# Minmax scale is a bit off - recalibrate for Britain
HCHO_img = _study_period_mean(s5_HCHO, HCHO_band)
HCHO_id = _palette_map_id(HCHO_img, 0.0, 0.0003)

# Nitrogen Dioxide
NO2_img = _study_period_mean(s5_NO2, NO2_band)
NO2_id = _palette_map_id(NO2_img, 0.0, 0.0002)

# Ozone
O3_img = _study_period_mean(s5_O3, O3_band)
O3_id = _palette_map_id(O3_img, 0.12, 0.15)

# Sulphur Dioxide
SO2_img = _study_period_mean(s5_SO2, SO2_band)
SO2_id = _palette_map_id(SO2_img, 0.0, 0.0005)

# Methane
CH4_img = _study_period_mean(s5_CH4, CH4_band)
CH4_id = _palette_map_id(CH4_img, 1750, 1900)

In [6]:
# For easier iteration down the line. I know I'm not supposed to, but google can't tell me what to do, even if it's a good idea!
# Both lists use the same gas order (CO, HCHO, NO2, O3, SO2, CH4), so entry i of
# ghg_ids is the map id of entry i of ghg_imgs.
ghg_imgs = [CO_img, HCHO_img, NO2_img, O3_img, SO2_img, CH4_img]
ghg_ids = [CO_id, HCHO_id, NO2_id, O3_id, SO2_id, CH4_id]

In [7]:
# Visualise data on a Folium map
# Attribution has to stay earthengine.google.com, since that's where these maps came from.
_EE_ATTRIBUTION = 'Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>'


def _add_ee_layer(target, map_id, label, **tile_options):
  """Add the Earth Engine tiles behind `map_id` to `target` as an overlay named `label`."""
  folium.TileLayer(
      tiles = map_id['tile_fetcher'].url_format,
      attr = _EE_ATTRIBUTION,
      overlay = True,
      name = label,
      **tile_options
  ).add_to(target)


# NOTE: `map` shadows the Python builtin for the rest of the notebook session;
# the name is kept so anything that already refers to it keeps working.
map = folium.Map(location = [51.5, 0.1], prefer_canvas = True)

layerOpacity = 0.5

# The satellite composite gets no explicit opacity; every gas layer is drawn
# semi-transparent on top of it.
_add_ee_layer(map, s2_id, 'satellite photography median composite ')
_add_ee_layer(map, CO_id, 'Carbon Monoxide', opacity = layerOpacity)
_add_ee_layer(map, HCHO_id, 'Formaldehyde', opacity = layerOpacity)
_add_ee_layer(map, NO2_id, 'Nitrogen Dioxide', opacity = layerOpacity)
_add_ee_layer(map, O3_id, 'Ozone', opacity = layerOpacity)
_add_ee_layer(map, SO2_id, 'Sulphur Dioxide', opacity = layerOpacity)
_add_ee_layer(map, CH4_id, 'Methane', opacity = layerOpacity)

map.add_child(folium.LayerControl())
map

### Export Data

Exports as unmarked .csv tables and GeoTIFF images. 

##### TODO
- Figure out how to get and access location data from fastai. Would writing it out into the filename work? 
  - If I need to experiment with this KML format that's fine. 
- Update filepath definitions once training schema is defined 
- Ensure the hi-res image segmentation is behaving correctly - no overlaps! 

In [27]:
# All export methods export to the google drive defined above 
# TODO A lot of these methods reuse scaling. Investigate whether this is necessary and remove where possible. 
# TODO rewrite export filename prefixes so they aren't garbage
def exportTable(table, scale):
  """Start an Earth Engine batch task that writes `table` to Google Drive as CSV.

  Args:
    table: the collection to export.
    scale: resolution the table was sampled at; only used to build the task
      description ("<scale>m_res_csv_export") and the Drive folder ("<scale>m").
  """
  scale_text = str(scale)
  csv_task = ee.batch.Export.table.toDrive(
    collection = table,
    description = scale_text + 'm_res_csv_export',
    folder = scale_text + "m",
    fileFormat = "CSV"
  )
  csv_task.start()

def exportTableFromImage(image, polygon, scale):
  """Sample `image` inside `polygon` at `scale` and export the samples as one CSV table.

  Args:
    image: image to sample.
    polygon: region handed to sample().
    scale: sampling scale, also used by exportTable() to name the task and folder.
  """
  sampled_table = sample(image, polygon, scale)
  exportTable(sampled_table, scale)

# TODO reevaluate image export options - description needs coordinates
def exportGeotiff(image, polygon, scale):
  """Start an Earth Engine batch task exporting `image` to Google Drive as a GeoTIFF.

  Args:
    image: image to export.
    polygon: region of the image to export.
    scale: export scale; also used to build the task description
      ("<scale>m_scale_img") and the Drive folder ("<scale>m").
  """
  # No formatOptions are passed. The patch-dimension options dict this function
  # used to build was never handed to the export call, so it has been removed.
  # NOTE(review): its keys (patchDimensions, maxFileSize, compressed) look like
  # TFRecord export options rather than GeoTIFF ones - confirm before reinstating.
  ee.batch.Export.image.toDrive(
    description = str(scale) + 'm_scale_img',
    fileFormat = 'GeoTIFF',
    folder = str(scale) + "m",
    image = image,
    # maxPixels is just so it lets me export london at 10m/px. Dividing the
    # dataset into 1km squares is the next step.
    maxPixels = 10e9,
    region = polygon,
    scale = scale
  ).start()
  
# Does this really need to exist?
def sample(img, region, scale):
  """Sample `img` over `region` at `scale`, keeping geometries on the result.

  Thin wrapper around img.sampleRegions() that always passes geometries=True.
  """
  sampled = img.sampleRegions(collection = region, geometries = True, scale = scale)
  return sampled

def exportSqKmGeotiffList(image, polygon, scale):
  """Start a batch task exporting `image` within `polygon` as a set of GeoTIFF files.

  The intent is to break the export into 100px^2 1km images. As written, the
  only tiling control passed is fileDimensions = [6400, 6400].
  NOTE(review): confirm that this produces the intended tile size.

  Args:
    image: image to export.
    polygon: region of the image to export.
    scale: export scale; also used to build the task description
      ("<scale>m_scale_img_list") and the Drive folder name.
  """
  scale_text = str(scale)
  tile_task = ee.batch.Export.image.toDrive(
      image = image,
      region = polygon,
      scale = scale,
      maxPixels = 10e9,
      fileFormat = "GeoTIFF",
      description = scale_text + "m_scale_img_list",
      # NOTE(review): this folder name contains a "/" - check whether Drive
      # exports treat it as a nested path or as one literal folder name.
      folder = scale_text + "m/sqkm_png_exports",
      # shardSize = 100, # shardSize is deprecated on cloud API. This is not documented.
      # Depending on resolution, images could be exported as a 256x image, split
      # into 2 128x's, split into quarters and upscaled via fast.ai. This really
      # overcomplicates things, though.
      fileDimensions = [6400, 6400]
  )
  tile_task.start()

In [None]:
# Take a sample of the image at the points given.
# TODO Save a small partition in google drive, then work on getting the next via a thread.
#      This should also start the training process, then delete the small partition in the drive.
# Scales to export at; only 1000 is currently enabled.
sizes = {1000}#, 500, 100, 50, 10}

for scale in sizes:
  print("this should only be run once, when setting up")
  # The triple-quoted block below is a bare string expression, so the export
  # calls inside it are disabled and do not run.
  """
  exportTableFromImage(CO_img, london, scale)
  exportTableFromImage(HCHO_img, london, scale)
  exportTableFromImage(NO2_img, london, scale)
  exportTableFromImage(O3_img, london, scale)
  exportTableFromImage(SO2_img, london, scale)
  exportTableFromImage(CH4_img, london, scale)
  """

In [30]:
# Pretty-print the Earth Engine batch tasks with their states, to check on export progress.
pprint(ee.batch.Task.list())

[<Task EXPORT_IMAGE: 10m_scale_img_list (READY)>,
 <Task EXPORT_IMAGE: 10m_scale_img (READY)>,
 <Task EXPORT_IMAGE: 10m_scale_img_list (FAILED)>,
 <Task EXPORT_IMAGE: 10m_scale_img_list (FAILED)>,
 <Task EXPORT_IMAGE: 10m_scale_img_list (FAILED)>,
 <Task EXPORT_IMAGE: 10m_scale_img (FAILED)>,
 <Task EXPORT_IMAGE: 10m_scale_img (FAILED)>,
 <Task EXPORT_IMAGE: 10m_scale_img (FAILED)>,
 <Task EXPORT_IMAGE: 1000m_scale_img (FAILED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_csv_export (COMPLETED)>,
 <Task EXPORT_FEATURES: 1000m_res_c

In [28]:
# Scales to export the satellite composite at; only 10 is currently enabled.
sizes = {10}#, 100, 50, 10} 1000, 500, 

for scale in sizes:
  # this should only be run once, when setting up
  # Each call starts a new Earth Engine export task for the london polygon.
  exportGeotiff(s2_img, london, scale)

In [29]:
# Start the tiled export of the satellite composite for the uxbridge polygon at scale 10.
#exportGeotiff(s2_img, uxbridge, 10)
exportSqKmGeotiffList(s2_img, uxbridge, 10)

In [None]:
#exportGeotiff(s2_img, great_britain, 1000)

# Data processing

### TODO
- ~~Convert GeoTIFF to PNG~~
- Reorganise datasets for fast.ai retraining. 2-400 images were sufficient for object recognition; maybe double that for top-down satellite photos?


In [None]:
def geotiffToPng(tif_path):
  """Convert every GeoTIFF under drive_path/tif_path to a PNG using selected bands.

  There seems to be a limited range of functional bands, including only 3
  being available for a non-transparent image. The bands also display in
  black-and-white when displayed individually.

  The directory tree is walked recursively. Each file whose extension is
  ".tif" (case-insensitive) is translated with GDAL into
  "<stem>band_432.png" next to the source file, using bands 4, 3 and 2 with
  Byte output and -scale.

  Args:
    tif_path: folder to search, relative to drive_path.
  """
  # Imported locally because the notebook's import cell does not import os.
  import os

  #TODO remap to ARGB to get more defined brightness data
  translate_options = " ".join([
    '-ot Byte',
    '-of PNG',
    '-b 4',
    '-b 3',
    '-b 2',
    '-scale'
  ])

  search_root = os.path.join(drive_path, tif_path)
  print(search_root)

  for root, dirs, files in os.walk(search_root, topdown=False):
    for name in files:
      tif_file = os.path.join(root, name)
      print(tif_file)
      stem, extension = os.path.splitext(tif_file)
      if extension.lower() != ".tif":
        continue

      # Check for the file that is actually written below, not "<stem>.png".
      png_file = stem + 'band_432.png'
      if os.path.isfile(png_file):
        print("A png file already exists for %s" % name)
        # Skipping is deliberately left out while this is under development,
        # so the existing PNG is regenerated. I'll clean things up manually
        # till then.

      # Read from the file as found on disk, so ".TIF" sources work on
      # case-sensitive filesystems.
      gdal.Translate(
        png_file,
        tif_file,
        options=translate_options
      )

# Convert the GeoTIFFs in the "1000m" export folder (relative to drive_path) to PNGs.
geotiffToPng("1000m")