# Waste generation and scenario planning transfer

Files are stored on S3 as gzip-compressed CSV files (`.csv.gz`).

This notebook loads the files and saves them locally as a partitioned CSV dataset.

In [None]:
%%html
<style>
table {float:left}
</style>

| Document info | |
| --- | --- | 
| Area of interest: | Cape Town |
| Planning type: | All REL type producers |
| Prepared by: | Waste Labs (wastelabs.co) |
| Prepared for: | Johan W. Joubert |
| Contact: | elias@wastelabs.co |

In [None]:
%reload_kedro
import pandas as pd
import geopandas as gpd
import numpy as np
import boto3
import yaml
from GPSOdyssey import Kepler

# Let pandas render up to 1000 rows and 1000 columns before truncating a frame.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

from IPython.core.interactiveshell import InteractiveShell

# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

2022-05-30 20:55:24,748 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-05-30 20:55:24,811 - root - INFO - ** Kedro project Demand estimation and waste collection routing optimisation for the City of Cape Town
2022-05-30 20:55:24,812 - root - INFO - Defined global variable `context`, `session` and `catalog`
2022-05-30 20:55:24,821 - root - INFO - Registered line magic `run_viz`


## Custom modules

In [None]:
def load_credentials(
    credentials_path="../conf/local/credentials.yml", credentials_key="dev_s3"
):
    """Read S3 client credentials from a YAML credentials file.

    Args:
        credentials_path: Path to the YAML credentials file. The default is
            resolved relative to the current working directory.
        credentials_key: Top-level entry of the file to read.

    Returns:
        The ``client_kwargs`` value stored under ``credentials_key``.

    Raises:
        FileNotFoundError: If ``credentials_path`` does not exist.
        KeyError: If ``credentials_key`` or its ``client_kwargs`` entry is
            missing from the file.
    """
    with open(credentials_path, "r") as read_file:
        # safe_load only builds plain Python objects (same as Loader=SafeLoader).
        credentials = yaml.safe_load(read_file)
    return credentials[credentials_key]["client_kwargs"]

def get_s3_bucket_session(s3_cred, bucket="project-rdi-cpt-public"):
    """Return a boto3 S3 ``Bucket`` resource created with the given key pair.

    Args:
        s3_cred: Mapping with ``aws_access_key_id`` and
            ``aws_secret_access_key`` entries.
        bucket: Name of the bucket to wrap.

    Returns:
        The ``Bucket`` resource for ``bucket``, obtained from a session that
        uses the supplied credentials.
    """
    authenticated_session = boto3.Session(
        aws_access_key_id=s3_cred["aws_access_key_id"],
        aws_secret_access_key=s3_cred["aws_secret_access_key"],
    )
    return authenticated_session.resource("s3").Bucket(bucket)


def get_latest_bucket_files(my_bucket, path):
    """List the objects stored under a key prefix of a bucket.

    Args:
        my_bucket: Bucket-like object whose ``objects.filter(Prefix=...)``
            yields items exposing ``key``, ``size`` and ``last_modified``.
        path: Key prefix to list.

    Returns:
        A DataFrame with one row per listed object and the columns
        ``filename``, ``size`` and ``last_modified``, in the order the bucket
        yielded them. No sorting or filtering by modification time is applied.
        The columns are present even when nothing matches the prefix.
    """
    columns = ["filename", "size", "last_modified"]
    records = [
        {"filename": obj.key, "size": obj.size, "last_modified": obj.last_modified}
        for obj in my_bucket.objects.filter(Prefix=path)
    ]
    # Naming the columns explicitly keeps the schema stable for an empty listing,
    # so callers can select `filename` without hitting a KeyError.
    return pd.DataFrame(records, columns=columns)


def get_file(bucket, file, s3_cred):
    """Read one CSV object from S3 into a DataFrame.

    Args:
        bucket: Name of the S3 bucket.
        file: Object key inside the bucket.
        s3_cred: Mapping with ``aws_access_key_id`` and
            ``aws_secret_access_key`` entries.

    Returns:
        The result of ``pd.read_csv`` for ``s3://<bucket>/<file>``.
    """
    s3_uri = f"s3://{bucket}/{file}"
    # Credentials are handed to the filesystem layer through storage_options.
    storage_options = {
        "key": s3_cred["aws_access_key_id"],
        "secret": s3_cred["aws_secret_access_key"],
    }
    return pd.read_csv(s3_uri, storage_options=storage_options)

In [None]:
# Read the S3 key pair from the credentials file, then wrap the bucket
# (the default bucket name of `get_s3_bucket_session` is used).
s3_cred = load_credentials()
my_bucket = get_s3_bucket_session(s3_cred)

In [None]:
%%time
%reload_kedro

files = get_latest_bucket_files(my_bucket, path="data/01_raw/synthetic_populations/cpt/")
for i in range(files.shape[0]):
    file = files.iloc[i]["filename"]
    print(f"\n{i + 1} of {files.shape[0] + 1}: {file}")
    scenario_file = get_file(bucket="project-rdi-cpt-public", file=file, s3_cred=s3_cred)
    parition_key = file[file.rfind("/") + 1:].replace(".csv.gz", "")
    catalog.save("syn_pop_scenarios_local", {parition_key:scenario_file})

2022-05-17 11:14:02,231 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-05-17 11:14:02,303 - root - INFO - ** Kedro project Demand estimation and waste collection routing optimisation for the City of Cape Town
2022-05-17 11:14:02,305 - root - INFO - Defined global variable `context`, `session` and `catalog`
2022-05-17 11:14:02,314 - root - INFO - Registered line magic `run_viz`

1 of 101:data/01_raw/synthetic_populations/cpt/households_001.csv.gz
2022-05-17 11:14:05,097 - kedro.io.data_catalog - INFO - Saving data to `syn_pop_scenarios_local` (PartitionedDataSet)...

2 of 101:data/01_raw/synthetic_populations/cpt/households_002.csv.gz
2022-05-17 11:14:05,986 - kedro.io.data_catalog - INFO - Saving data to `syn_pop_scenarios_local` (PartitionedDataSet)...

3 of 101:data/01_raw/synthetic_populations/cpt/households_003.csv.gz
2022-05-17 11:14:06,591 - kedro.io.data_catalog - INFO - Saving data to `syn_pop_scenarios_local

In [None]:
%%time
%reload_kedro

files = get_latest_bucket_files(my_bucket, path="data/01_raw/waste_generation_scenarios/cpt/")
for i in range(files.shape[0]):
    file = files.iloc[i]["filename"]
    print(f"\n{i + 1} of {files.shape[0] + 1}: {file}")
    scenario_file = get_file(bucket="project-rdi-cpt-public", file=file, s3_cred=s3_cred)
    parition_key = file[file.rfind("/") + 1:].replace(".csv.gz", "")
    catalog.save("waste_gen_scenarios_local", {parition_key:scenario_file})

2022-05-17 11:25:02,595 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-05-17 11:25:02,667 - root - INFO - ** Kedro project Demand estimation and waste collection routing optimisation for the City of Cape Town
2022-05-17 11:25:02,668 - root - INFO - Defined global variable `context`, `session` and `catalog`
2022-05-17 11:25:02,676 - root - INFO - Registered line magic `run_viz`

1 of 101:data/01_raw/waste_generation_scenarios/cpt/households_001.csv.gz
2022-05-17 11:25:06,047 - kedro.io.data_catalog - INFO - Saving data to `waste_gen_scenarios_local` (PartitionedDataSet)...

2 of 101:data/01_raw/waste_generation_scenarios/cpt/households_002.csv.gz
2022-05-17 11:25:07,096 - kedro.io.data_catalog - INFO - Saving data to `waste_gen_scenarios_local` (PartitionedDataSet)...

3 of 101:data/01_raw/waste_generation_scenarios/cpt/households_003.csv.gz
2022-05-17 11:25:08,138 - kedro.io.data_catalog - INFO - Saving data to `wast