In [1]:
from pathlib import Path

import geopandas as gpd
import pandas as pd

import ee

In [2]:
# Root directory of the test project.
PROJECT_ROOT = Path('/home/rhamilton/code/cnwi/test_data/aoi_NS')
# Source data lives in the `data` directory directly under the project root.
SOURCE_DATA = PROJECT_ROOT.joinpath('data')


In [3]:
## On init, prepare output/data/raw as the destination for the raw data

# Build the output tree top-down. `output` is already rooted at PROJECT_ROOT,
# so the nested directories are derived from it directly instead of
# re-prefixing PROJECT_ROOT (joining an absolute path onto PROJECT_ROOT simply
# yields that absolute path again).
output = PROJECT_ROOT / "output"
data = output / "data"
raw = data / "raw"

# parents=True creates output/ and output/data/ as needed; exist_ok=True keeps
# the cell safe to re-run when the directories are already there.
raw.mkdir(parents=True, exist_ok=True)




In [4]:
from shutil import copytree

# Copy the source data tree into output/data/raw. dirs_exist_ok=True allows
# the target directory to exist already, so the cell can be re-run.
copytree(src=SOURCE_DATA, dst=raw, dirs_exist_ok=True)


PosixPath('/home/rhamilton/code/cnwi/test_data/aoi_NS/output/data/raw')

# 1: Create the Data Manifest
- Point to the data source

In [5]:
# Create the manifest from the raw data directory.
from cnwi.cnwilib.data import Manifest

# The Manifest is pointed at `raw` (output/data/raw); create() then builds it.
# NOTE(review): what create() collects is defined in cnwi.cnwilib.data - confirm there.
manifest = Manifest(raw)
manifest.create()



<cnwi.cnwilib.data.Manifest at 0x7feaf72b1c10>

In [6]:
# Export the manifest as reference data: it is written to
# output/manifests/manifest.csv.
ref_dir = output.joinpath("manifests")
ref_dir.mkdir(exist_ok=True)
manifest.save(ref_dir.joinpath("manifest.csv"))

<cnwi.cnwilib.data.Manifest at 0x7feaf72b1c10>

# 2: Process the Data in the Data Manifest
- Data Engineering happens in this step
- export of manifest to disk
- export of data lookup to disk
- export of features and regions to disk

In [7]:
from cnwi.cnwilib.data import ManifestProcessor

# Run the processing step over the manifest. The results are read back from
# `proc` in later cells (proc.training, proc.regions).
proc = ManifestProcessor(manifest)
proc.process()


In [8]:

# Preview the first rows of the processed training features.
proc.training.head()


Unnamed: 0,class_name,geometry,type,region_id
0,1,POINT (-62.66561 45.66387),1,122
1,1,POINT (-63.62492 45.79504),1,122
2,1,POINT (-62.59665 45.56264),1,122
3,1,POINT (-64.05334 45.86434),1,122
4,1,POINT (-64.22656 46.01314),1,122


In [9]:
# Save the processed training data and regions to disk under
# output/data/processed.
proc_dir = data / "processed"
# exist_ok=True replaces the check-then-create pattern and matches how the
# manifests directory is created; parents=True covers a missing output/data.
proc_dir.mkdir(parents=True, exist_ok=True)

proc.save_training(where=proc_dir, fname='features.geojson', driver='GeoJSON')
proc.save_regions(where=proc_dir, fname='regions.geojson', driver='GeoJSON')


<cnwi.cnwilib.data.ManifestProcessor at 0x7fec4caaf350>

# 3. Data Transformation
- Compress the shapefile to a zip file

In [12]:
from cnwi.cnwilib.data import features2Zip

# Both exports are grouped by region and written to the processed directory.
zip_options = dict(groupby_col='region_id', where=proc_dir)

# Training features: no file_prefix is passed, so the function's default applies.
features2Zip(gdf=proc.training, **zip_options)

# Regions: same options, with an explicit 'regions' file prefix.
features2Zip(gdf=proc.regions, file_prefix='regions', **zip_options)

# 4. Upload data to asset
- You have two options here:
    - upload directly to a new asset
    - upload to Cloud Storage, then ingest from Cloud Storage into the asset store
- This example assumes you are uploading to a new asset.
- First, create a staging area (an asset folder) for your data.
- Then upload all the files in the zipped directory to the staging area before moving to the next step.

In [7]:
# Create the staging area (an Earth Engine asset folder) for the data.
# NOTE(review): left commented out, presumably because the folder only needs
# to be created once - uncomment and run if it does not exist yet.
# ! earthengine create folder projects/ee-nwrc-geomatics/assets/cnwi

After creating the staging area, manually upload the processed zip files to it.


In [13]:
# Both modules are already imported in the first cell; they are re-imported
# here so this cell also runs on its own.
import pandas as pd
import ee

# Earth Engine asset folder used as the staging area for the uploaded files.
ASSET_ROOT = 'projects/ee-nwrc-geomatics/assets/cnwi'

ee.Initialize()

# List the assets currently in the staging folder and tabulate the records
# found under the 'assets' key of the response.
server_assets = ee.data.listAssets({'parent': ASSET_ROOT})
server_df = pd.DataFrame(server_assets['assets'])
server_df

Unnamed: 0,type,name,id,updateTime
0,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:06:50.431786Z
1,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:09:14.404835Z
2,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:09:36.218640Z
3,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:07.712464Z
4,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:14.911408Z
5,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:40.991216Z
6,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:46.263744Z
7,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:11:02.586896Z
8,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/regions...,projects/ee-nwrc-geomatics/assets/cnwi/regions...,2024-01-02T17:12:04.963964Z
9,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/regions...,projects/ee-nwrc-geomatics/assets/cnwi/regions...,2024-01-02T17:12:22.956626Z


In [18]:
# Split each asset path into its parent folder (`root`) and final component
# (`asset_name`), e.g. '.../cnwi/features_122' -> 'cnwi', 'features_122'.
path_parts = server_df['name'].str.split('/')
server_df['root'] = path_parts.str[-2]
server_df['asset_name'] = path_parts.str[-1]

# Asset names follow '<data_type>_<region_id>'. The vectorised .str accessor
# yields NaN for a name without an underscore instead of raising IndexError,
# so one unexpected asset in the folder does not break the whole listing.
# region_id is kept as a string, exactly as it appears in the asset name.
name_parts = server_df['asset_name'].str.split('_')
server_df['data_type'] = name_parts.str[0]
server_df['region_id'] = name_parts.str[1]

server_df.to_csv(output / 'server_assets.csv', index=False)

Unnamed: 0,type,name,id,updateTime,root,asset_name
0,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:06:50.431786Z,cnwi,features_122
1,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:09:14.404835Z,cnwi,features_123
2,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:09:36.218640Z,cnwi,features_124
3,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:07.712464Z,cnwi,features_125
4,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:14.911408Z,cnwi,features_126
5,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:40.991216Z,cnwi,features_127
6,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:10:46.263744Z,cnwi,features_128
7,TABLE,projects/ee-nwrc-geomatics/assets/cnwi/feature...,projects/ee-nwrc-geomatics/assets/cnwi/feature...,2023-12-22T15:11:02.586896Z,cnwi,features_129


In [None]:
# For sampling, only the training data is needed.