<a href="https://colab.research.google.com/github/WRFitch/fyp/blob/main/src/fyp_data_import_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Import Pipeline

This code is still under construction, and is therefore very very bad in places. 

### TODO
- Import CO2 dataset
- Extract unnecessary methods into normal python files and import where necessary. 
- Remove unnecessary variable changes where necessary - this stacks up all the JSON, making everything harder than it needs to be. 
  - Actually, it might not
- Change unnecessary image imports to feature imports 
- Figure out a way of iterating through existing images and displaying the area currently covered by my dataset on a map. 
- Define and import other regions of interest - stick to cities and suburbs for now, since that will have the best health data. Converting this to include rural or rocky areas is an increase in feature set. 
- Figure out how accurate the image exports are
  - Are the points definitely centered on the given coordinates? 
  - Is there a way of standardising lighting? 
- Add file indexing into one CSV with all our existing latlong exports, so we're not constantly querying the filesystem. 
- Begin exporting commonalities across notebooks, and incorporate a single pipeline that can be employed in a demonstration script

## Setup
*   Import necessary libraries
*   Set up Earth Engine authentication and mount Google Drive  


In [1]:
import csv
import json
import os
from pprint import pprint

import ee
# TODO test not importing this and still use eec.map 
import folium
from google.colab import drive
from osgeo import gdal
from PIL import Image

In [None]:
# Authenticate with Earth Engine and initialise the client library, then
# mount Google Drive at /content/drive.
ee.Authenticate()
ee.Initialize()
drive.mount('/content/drive')

In [3]:
# Start from /content and delete any existing `fyp` directory found there.
%cd /content
%rm -rf fyp

/content


In [None]:
# Import FYP repo so we can access fyputil common library 
# The clone is made from /content, so the repo ends up in /content/fyp.
%cd /content
!git clone https://github.com/WRFitch/fyp.git

In [None]:
# Import fyputil library
# The modules are imported by bare name, so change into the fyputil directory first.
# NOTE(review): presumably this relies on the notebook's working directory being
# on the import path - confirm.
%cd fyp/src/fyputil
import constants as c
import ee_constants as eec
import ee_utils as eeutil
import fyp_utils as fyputil
# Return to /content once the imports are done.
%cd /content

# Dataset import

### Import the following datasets into Google Drive

*   [Sentinel-2 Satellite photography](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR)
*   [Sentinel-5 Precursor Data](https://developers.google.com/earth-engine/datasets/catalog/sentinel)
  *   [Carbon Monoxide](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_CO)
  *   [Formaldehyde](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_HCHO)
  *   [Nitrogen Dioxide](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_NO2)
  *   [Ozone](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_O3)
  *   [Sulphur Dioxide](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_SO2)
  *   [Methane](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S5P_OFFL_L3_CH4)
*   [ODIAC Fossil Fuel CO2 Emissions](https://db.cger.nies.go.jp/dataset/ODIAC/DL_odiac2019.html)

##### Import necessary variables

In [6]:
# Local aliases for the regions of interest defined in ee_constants (eec):
# Great Britain, London, Uxbridge, the Millennium Dome, Greenwich and both hemispheres.
# NOTE(review): presumably these are Earth Engine geometries - confirm in ee_constants.
great_britain = eec.great_britain
london = eec.london
uxbridge = eec.uxbridge
millennium_dome = eec.millennium_dome
greenwich = eec.greenwich
w_hemisphere = eec.west_hemisphere
e_hemisphere = eec.east_hemisphere

In [11]:
# TODO move this into a CONSTANTS.py file
# Earth engine username, used to import classified image into ee assets folder
USERNAME = 'wrfitch'
# Output location derived from the username above.
OUTPUT_DIR = USERNAME + "/out/"

# Define collections for each dataset to be used 
# (s2 plus one s5 entry per pollutant, all taken from ee_constants)
s2 = eec.s2
s5_CO = eec.s5_CO
s5_HCHO = eec.s5_HCHO
s5_NO2 = eec.s5_NO2
s5_O3 = eec.s5_O3
s5_SO2 = eec.s5_SO2
s5_CH4 = eec.s5_CH4
#TODO import CO2 dataset

# Per-pollutant band values from constants (c); later cells use them to build
# the per-band CSV filenames.
CO_band = c.CO_band
HCHO_band = c.HCHO_band
NO2_band = c.NO2_band
O3_band = c.O3_band
SO2_band = c.SO2_band
CH4_band = c.CH4_band

# NOTE(review): presumably the date range and colour palette used to build the
# images and map in ee_constants - confirm.
start_date = eec.start_date
end_date = eec.end_date
vis_palette = eec.vis_palette

# Path components from constants (c). Later cells combine them both with
# os.path.join(drive_path, ...) and by plain concatenation (drive_path + export_dir).
drive_path = c.drive_path
export_dir = c.export_dir
geotiff_dir = c.geotiff_dir

In [13]:
# Import datasets 
# TODO analyse whether it makes sense to analyse these on a highly localised level
# TODO figure out whether this object duplication is an antipattern that will 
#      eat all my RAM 
# Local aliases for the per-dataset image objects defined in ee_constants.
s2_img = eec.s2_img
CO_img = eec.CO_img
HCHO_img = eec.HCHO_img
NO2_img = eec.NO2_img
O3_img = eec.O3_img
SO2_img = eec.SO2_img
CH4_img = eec.CH4_img

# Matching *_id values from ee_constants, one per dataset.
s2_id = eec.s2_id
CO_id = eec.CO_id
HCHO_id = eec.HCHO_id
NO2_id = eec.NO2_id
O3_id = eec.O3_id
SO2_id = eec.SO2_id
CH4_id = eec.CH4_id

# For easier iteration down the line. I know I'm not supposed to, but google 
# can't tell me what to do, even if it's a good idea!
# Both lists use the same order (CO, HCHO, NO2, O3, SO2, CH4); s2 is not included.
ghg_imgs = [CO_img, HCHO_img, NO2_img, O3_img, SO2_img, CH4_img]
ghg_ids = [CO_id, HCHO_id, NO2_id, O3_id, SO2_id, CH4_id]

### Visualise Data

In [8]:
# Bare expression as the last line of the cell, so the notebook displays its value.
# NOTE(review): presumably a folium map built in ee_constants - confirm.
eec.map 

### Export Data

Exports as .csv tables and GeoTIFF images. 

#### Define export methods

In [None]:
# All export methods export to the google drive defined above 
# TODO add checking for existing files in these methods.

def exportTable(table, scale, folder="no_export_folder", desc="no_desc"):
  """Start an Earth Engine batch task that exports `table` to Google Drive as a CSV.

  table:  passed to the export as its `collection`.
  scale:  not used by this function; kept so existing callers keep working.
  folder: passed to the export as its `folder`.
  desc:   passed to the export as its `description`.

  Returns nothing; the task is started and not tracked any further here.
  """
  csv_task = ee.batch.Export.table.toDrive(
    collection = table,
    folder = folder,
    description = desc,
    fileFormat = "CSV"
  )
  csv_task.start()

def exportTableFromImage(image, polygon, scale, folder="no_export_folder", desc="no_desc"):
  """Export one table of the given image, at the scale and dimensions specified.

  The image is first sampled over `polygon` with sample(), and the resulting
  collection is handed to exportTable() together with `scale`, `folder` and `desc`.
  """
  sampled_table = sample(image, polygon, scale)
  exportTable(sampled_table, scale, folder, desc)
  
  
# TODO why is this called sample() ?
def sample(img, region, scale):
  """Sample `img` into a feature collection and return it.

  Calls img.sampleRegions() with `region` as the collection, the given `scale`,
  and geometries = True so each sampled feature keeps its geometry.
  """
  sampled = img.sampleRegions(
      collection = region,
      scale = scale,
      geometries = True
  )
  return sampled

# TODO reevaluate image export options - description needs coordinates
def exportGeotiff(image, polygon, scale, folder="no_export_folder", desc="no_desc"):
  """Export one GeoTIFF image of the given image, at the scale and dimension specified.

  Starts an Earth Engine batch task that writes `image`, limited to `polygon`,
  to the Google Drive folder `folder` under the description `desc`, using the
  EPSG:3857 CRS.

  maxPixels is just so it lets me export london at 10m/px. Dividing the dataset
  into 1km squares is the next step.
  """
  geotiff_task = ee.batch.Export.image.toDrive(
    image = image,
    region = polygon,
    scale = scale,
    crs = 'EPSG:3857',
    fileFormat = 'GeoTIFF',
    # formatOptions = image_export_options,
    maxPixels = 10e9,
    folder = folder,
    description = desc
  )
  geotiff_task.start()

In [None]:
#pprint(ee.batch.Task.list())

#### Exporting CSVs

This method of getting the data is very very stupid, but also it does exactly what I need. 

In [None]:
# Only once this is completed can you move forward and get pictures from these spreadsheets.
# Each CSV is named after the id of the first band reported by the image's getInfo().
# NOTE: the export call below is commented out, so as written this loop only
# computes csv_name and starts no export.
for ghg_img in ghg_imgs:
  csv_name = ghg_img.getInfo().get('bands')[0].get('id')
  #exportTableFromImage(ghg_img,london, 1000, export_dir, csv_name)

#### Getting Images From CSV Data

In [None]:
# Passes the SO2 csv under c.data_dir, together with s2_img, to ee_utils.getImgsFromCsv.
# NOTE(review): the helper is defined in ee_utils and not visible here - presumably
# it exports one image per coordinate row of the csv; confirm.
eeutil.getImgsFromCsv(f"{c.data_dir}/{c.SO2_band}.csv", s2_img)

# Data processing

In [None]:
def rmArtifact(artifact_path, rmTif = False, rmXml = False):
  """Remove a single old geoTIFF image or xml conversion artifact.

  artifact_path: path of the file to consider.
  rmTif:         delete the file if its extension is .tif (case-insensitive).
  rmXml:         delete the file if its extension is .xml (case-insensitive).

  Does nothing when both flags are off, when the path is not an existing
  regular file, or when the extension is not one that was selected.
  """
  if not (rmTif or rmXml) or not os.path.isfile(artifact_path):
    return

  # Collect the extensions the caller asked to delete.
  wanted = []
  if rmTif: wanted.append(".tif")
  if rmXml: wanted.append(".xml")

  _, ext = os.path.splitext(artifact_path)
  if ext.lower() not in wanted:
    return

  print(f"removing {artifact_path}")
  os.remove(artifact_path)

def rmConversionArtifacts(path, rmTif = False, rmXml = False):
  """Apply rmArtifact() to every file found under drive_path/path, recursively.

  path:  directory to clean, joined onto drive_path.
  rmTif: forwarded to rmArtifact(); delete .tif files.
  rmXml: forwarded to rmArtifact(); delete .xml files.
  """
  # No point checking all these files if we're not going to do anything 
  if rmTif or rmXml:
    parent_path = os.path.join(drive_path, path)
    print(parent_path)

    for folder, _subdirs, filenames in os.walk(parent_path, topdown=True):
      for filename in filenames:
        rmArtifact(os.path.join(folder, filename), rmTif, rmXml)

def geotiffToPng(tif_path, rm_artifacts = False):
  """Convert every GeoTIFF found under drive_path/tif_path to a PNG next to it.

  tif_path:     directory to search, joined onto drive_path and walked recursively.
  rm_artifacts: when True, remove the source GeoTIFF after a successful conversion.

  A file is skipped when a PNG with the same base name already exists either
  beside it or in {drive_path}{export_dir}/png. Files whose conversion fails
  are reported and left untouched, so a failed conversion never deletes its source.
  """
  # Define rgb bands and file extension
  # (output bands 4, 3, 2 in that order, byte output type, PNG driver, default rescaling)
  options_list = [
    '-ot Byte',
    '-of PNG',
    '-b 4',
    '-b 3',
    '-b 2',
    '-scale'
  ]
  options_string = " ".join(options_list)
  parent_path = os.path.join(drive_path, tif_path)

  # Recursively walk through all files (this has to be simpler)
  for root, dirs, files in os.walk(parent_path, topdown=False):
    for name in files:
      full_path = os.path.join(root, name)
      print(full_path)
      path, extension = os.path.splitext(full_path)

      if extension.lower() != ".tif": continue

      filename = os.path.basename(path)
      png_path = path + ".png"
      if os.path.isfile(png_path) or os.path.isfile(f"{drive_path}{export_dir}/png/{filename}.png"):
        print(f"A png file already exists for {full_path}")
        continue

      # Read from full_path rather than path + '.tif': the extension check above
      # is case-insensitive, so the real file may be named e.g. 'X.TIF'.
      converted = gdal.Translate(
        png_path,
        full_path,
        options = options_string
      )
      # gdal.Translate returns None on failure unless GDAL exceptions are enabled.
      if converted is None:
        print(f"Failed to convert {full_path} to PNG")
        continue
      # Drop the dataset reference so GDAL flushes and closes the new PNG.
      converted = None

      print(f"Converted {filename} from GeoTIFF to PNG")
      if rm_artifacts: rmArtifact(full_path, True, True)

def moveFilesByExtension(src, dest, extension):
  """Move files from src to dest if they have the correct extension.

  src:       source directory, joined onto drive_path and walked recursively.
  dest:      destination directory, joined onto drive_path.
  extension: extension to match, including the dot (e.g. ".tif"); compared
             case-insensitively.

  The sub-directory layout below src is reproduced below dest, and destination
  directories are created as needed. Raises OSError if a move fails.
  """
  parent_path = os.path.join(drive_path, src)
  dest_root = os.path.join(drive_path, dest)
  dest_root_abs = os.path.abspath(dest_root)
  wanted = extension.lower()
  print(parent_path)

  for root, dirs, files in os.walk(parent_path, topdown=True):
    # If dest is nested inside src, don't descend into it and move files twice.
    dirs[:] = [d for d in dirs
               if os.path.abspath(os.path.join(root, d)) != dest_root_abs]

    # Rebuild the path relative to the source root instead of using str.replace,
    # which would also rewrite any other occurrence of `src` in the full path.
    rel_dir = os.path.relpath(root, parent_path)
    target_dir = os.path.normpath(os.path.join(dest_root, rel_dir))

    for name in files:
      if os.path.splitext(name)[1].lower() != wanted: continue
      os.makedirs(target_dir, exist_ok=True)
      os.rename(os.path.join(root, name), os.path.join(target_dir, name))

# TODO currently unused, since the rest of the code checks for duplicates before initiating an export. 
def deDupe(path):
  """Report files under drive_path/path whose names do not end in a number.

  For each file, the text after the last "_" with its final four characters
  stripped must parse as a float; otherwise the file is reported as removed.
  The actual deletion is commented out, so nothing is deleted.
  """
  print("removing duplicates")
  parent_path = os.path.join(drive_path, path)
  print(parent_path)

  for root, _subdirs, filenames in os.walk(parent_path, topdown=True):
    for name in filenames:
      # TODO update to check if the filename fits the intended format for coordinate images. 
      trailing_part = name.split("_")[-1]
      numeric_text = trailing_part[:-4]
      try:
        float(numeric_text)
      except Exception:
        print(f"removing {name}")
        #os.remove(os.path.join(root, name))

# TODO add file indexing into one CSV with all our latlong exports.

In [None]:
# add de-duplication back in once it actually does something. I've managed to avoid dupes so far anyway, so it's 
# unnecessary until it becomes an actual problem. YAGNI! 
# TODO parameterise filepaths in constants.py
#deDupe(f"{drive_path}{geotiff_dir}")
# Order matters: convert the GeoTIFFs to PNG in place, move the .tif and .png
# files into their export sub-directories, then delete leftover .xml files
# from the GeoTIFF directory.
geotiffToPng(geotiff_dir, rm_artifacts=False)
moveFilesByExtension(geotiff_dir, f"{export_dir}/geotiff", ".tif")
moveFilesByExtension(geotiff_dir, f"{export_dir}/png", ".png")
rmConversionArtifacts(geotiff_dir, rmTif=False, rmXml=True)

#deDupe(f"{drive_path}{export_dir}/png")
#deDupe(f"{drive_path}{export_dir}/geotiff")

In [None]:
# Cleaning up if things go a bit wrong
# Moves any .png files out of the geotiff export folder into the png folder,
# deletes .xml files left in the geotiff export folder, then moves any .tif
# files still in geotiff_dir into the geotiff export folder.
moveFilesByExtension(f"{export_dir}/geotiff", f"{export_dir}/png", ".png")
rmConversionArtifacts(f"{export_dir}/geotiff", rmTif=False, rmXml=True)
moveFilesByExtension(geotiff_dir, f"{export_dir}/geotiff", ".tif")

In [None]:
def parseCsvCoords(csv_path):
  """Extract longitude and latitude values from the .geo column of a csv.

  Rewrites the file at csv_path in place, appending a "longitude" and a
  "latitude" column to every row. The values are the first and second entries
  of the "coordinates" list in the row's .geo JSON.

  The .geo column is located by its header name; if the header has no ".geo"
  entry the third column is used. A csv whose header already contains
  "longitude" is left untouched, as is an empty file.

  Raises whatever json/csv/os raise on malformed rows or I/O failure; in that
  case the original csv is not replaced.
  """
  print(csv_path)
  test_suffix = "_parsing_coordinates.csv"
  tmp_path = csv_path + test_suffix

  with open(csv_path, 'r', newline='') as read_obj:
    csv_reader = csv.reader(read_obj, delimiter=",")

    header = next(csv_reader, None)
    if header is None:
      # Empty file: nothing to parse.
      return
    # Check the header before creating the temporary file, so that an
    # already-processed csv does not leave an empty temporary file behind.
    if "longitude" in header:
      print(f"{csv_path} has already been processed - exiting...")
      return

    geo_index = header.index(".geo") if ".geo" in header else 2

    with open(tmp_path, 'w', newline='') as write_obj:
      csv_writer = csv.writer(write_obj, delimiter=",")
      csv_writer.writerow(header + ["longitude", "latitude"])

      for row in csv_reader:
        # Could string parsing be made more efficient? 
        # This method only needs to run once per csv so optimisation isn't that important. 
        coords = json.loads(row[geo_index]).get("coordinates")
        csv_writer.writerow(row + [coords[0], coords[1]])

  # Swap the rewritten file into place in one step (both files are closed by now).
  os.replace(tmp_path, csv_path)

In [None]:
# TODO add these all into one csv file as part of import pipeline. 
# Add longitude/latitude columns to each per-band csv in the export directory.
for band_name in (CO_band, HCHO_band, NO2_band, O3_band, SO2_band, CH4_band):
  parseCsvCoords(f"{drive_path}{export_dir}/{band_name}.csv")