In [16]:
import bw_processing as bwp
import bw2data as bd
import bw2io as bi
import numpy as np
import pandas as pd
import csv
from fs.zipfs import ZipFS
from collections import defaultdict
import uuid

In [2]:
# Restore the Brightway project from a local backup archive.
# The call returns the name of the restored project (shown in the cell output).
bi.restore_project_directory("/Users/ajakobs/Documents/data/hybrid-eco-exio/CH_household_hybrid_project_eco391_ex381_march_28.tar.gz")

Restoring project backup archive - this could take a few minutes...


'eco-ex-March-2023'

In [3]:
# Switch to the project by name so the database lookups below run against it.
bd.projects.set_current('eco-ex-March-2023')

In [4]:
# Path to the CSV with per-household consumption data used as functional-unit samples.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere unchanged.
fu_file = '/Users/ajakobs/Documents/CCL/consumption_model_ch/consumption_model_ch/data/functional_units/habe20152017_hh_prepared_imputed.csv'

In [5]:
# Parse the CSV numerically, then drop the first row (header labels) and the
# first column (row IDs) so only the data values remain.
arr = np.genfromtxt(fu_file, delimiter=',')[1:, 1:]

`columns` are the string labels for the different consumption items.

`rows` are the individual household string IDs.

In [6]:
# Read the file once, inside a context manager, so the handle is closed
# deterministically. `newline=''` is what the csv module expects for files
# it parses itself.
with open(fu_file, newline='') as fu_handle:
    fu_reader = csv.reader(fu_handle)
    # Header row without its first cell (the label of the ID column).
    columns = next(fu_reader)[1:]
    # First cell of every remaining row is the row (household) ID.
    rows = [row[0] for row in fu_reader]

Map each cluster code to the list of household IDs (as strings) that belong to it.

In [7]:
# Load the clustering table and collect, per cluster code, the household IDs
# (converted to strings) assigned to that cluster.
df = pd.read_csv('/Users/ajakobs/Documents/CCL/scripts/notebooks/write_files/ccl_dev/households_clustering_total.csv')
cluster_mapping = defaultdict(list)
for code, household in df[['cluster_code', 'HaushaltID']].itertuples(index=False, name=None):
    cluster_mapping[code].append(str(household))

In [8]:
# Handle on the consumption database, plus a lookup from each activity's
# `code` to its integer node id.
consump = bd.Database('swiss consumption 1.0')
code_mapping = dict((activity['code'], activity.id) for activity in consump)

In [9]:
# Alphabetically sorted names of every activity that represents either the
# Swiss average household or one of the household clusters.
fu_list = sorted(
    act['name']
    for act in consump
    if act['name'].startswith(('ch hh average', 'household cluster'))
)
len(fu_list)

107

In [10]:
def clean_name(string):
    """Shorten a functional-unit name to a compact label.

    Removes, in this order, the fragments ``"household cluster "``,
    ``", years 151617"``, ``"consumption"`` and ``"disaggregated"``, then
    strips leading/trailing whitespace. Whitespace left *inside* the string
    by the removals is kept as-is.
    """
    label = string
    for fragment in ("household cluster ", ", years 151617", "consumption", "disaggregated"):
        label = label.replace(fragment, "")
    return label.strip()

In [23]:
def create_samples_datapackage(func_unit_names, filename, data_arr, database_name='swiss consumption 1.0'):
    """Write a zipped datapackage holding household consumption samples.

    For every functional unit one persistent ``technosphere_matrix`` array is
    added: each array row is a consumption item (an input of the functional
    unit), each array column is one household sample.

    Besides its arguments, the function reads the notebook-level names
    ``rows``, ``columns``, ``cluster_mapping`` and ``code_mapping``.

    Parameters
    ----------
    func_unit_names : iterable of str
        Names of activities in ``database_name``. Names starting with
        ``'ch hh average consumption'`` use every household in ``data_arr``.
        Any other name is read as
        ``'household cluster <code> consumption, years 151617'`` and is limited
        to the households listed under ``<code>`` in ``cluster_mapping``.
    filename : str
        Path of the zip archive to create.
    data_arr : numpy.ndarray
        2-D array indexed ``[household, item]`` and aligned with ``rows`` and
        ``columns``. It is never modified.
    database_name : str, optional
        Database in which the functional-unit activities are looked up.

    Raises
    ------
    AssertionError
        If a functional unit has a technosphere input whose code is not one
        of the labels in ``columns``.
    """
    dp = bwp.create_datapackage(
        fs=ZipFS(filename, write=True),
        name="Array datapackage for Swiss household consumption",
        sum_intra_duplicates=True,
        sum_inter_duplicates=False,
    )

    # Everything below depends only on `columns` / `code_mapping`, not on the
    # functional unit, so it is computed once instead of once per iteration.
    known_columns = set(columns)
    col_mask = np.array([col_label in code_mapping for col_label in columns], dtype=bool)
    input_ids = [code_mapping[col_label] for col_label in columns if col_label in code_mapping]

    # Iterate over activity names from the database
    for func_unit_name in func_unit_names:
        node = bd.get_node(database=database_name, name=func_unit_name)

        if func_unit_name.startswith('ch hh average consumption'):
            # Average household: keep all samples.
            samples = data_arr
        else:
            # NOTE(review): any name that does not match the average prefix is
            # treated as a cluster name; an unrecognised name selects no rows.
            cluster_code = func_unit_name.replace("household cluster", "").replace("consumption, years 151617", "").strip()
            # `.get` avoids inserting empty entries into the shared defaultdict,
            # and the set makes each membership test O(1).
            members = set(cluster_mapping.get(cluster_code, ()))
            row_mask = np.array([elem in members for elem in rows], dtype=bool)
            samples = data_arr[row_mask, :]
            print(func_unit_name, row_mask.shape, row_mask.sum())

        # Boolean-mask indexing returns a new array, so `data_arr` is never
        # aliased or modified and no explicit copy is needed.
        samples = samples[:, col_mask]

        # Make sure there aren't any other exchanges which we don't have in our sampled data
        for exc in node.technosphere():
            assert exc.input['code'] in known_columns, (
                "Input {!r} of {!r} has no column in the sampled data".format(
                    exc.input['code'], func_unit_name
                )
            )

        indices = np.array(
            [(input_id, node.id) for input_id in input_ids],
            dtype=bwp.INDICES_DTYPE
        )

        dp.add_persistent_array(
            matrix="technosphere_matrix",
            indices_array=indices,
            # The random UUID suffix keeps resource names unique.
            name="Array for {}".format(clean_name(func_unit_name)+str(uuid.uuid4())),
            # Transpose as inputs are rows and samples are columns
            data_array=samples.T,
            # All inputs; need to have signs flipped in technosphere matrix
            flip_array=np.ones(len(indices), dtype=bool)
        )

    dp.finalize_serialization()

In [24]:
# Build the datapackage for all functional units from the full sample array.
create_samples_datapackage(
    func_unit_names=fu_list,
    filename='..//write_files/ccl_dev/household_archetypes.zip',
    data_arr=arr,
)

household cluster 1 consumption, years 151617 (9955,) 1861
household cluster 2 consumption, years 151617 (9955,) 2017
household cluster 3 consumption, years 151617 (9955,) 2057
household cluster 4 consumption, years 151617 (9955,) 2049
household cluster 5 consumption, years 151617 (9955,) 1771
household cluster 6 consumption, years 151617 (9955,) 200
household cluster A consumption, years 151617 (9955,) 1592
household cluster A1 consumption, years 151617 (9955,) 309
household cluster A2 consumption, years 151617 (9955,) 293
household cluster A3 consumption, years 151617 (9955,) 316
household cluster A4 consumption, years 151617 (9955,) 311
household cluster A5 consumption, years 151617 (9955,) 363
household cluster Aa consumption, years 151617 (9955,) 336
household cluster Ab consumption, years 151617 (9955,) 163
household cluster Ac consumption, years 151617 (9955,) 264
household cluster Ad consumption, years 151617 (9955,) 223
household cluster Af consumption, years 151617 (9955,) 42