In [2]:
import os
import shutil
import csv
import json
import glob

import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font

## Utilities

In [99]:
def copy_file(source_path, destination_dir):
    """
    Copy file from sourch path to target directory. 
    Use for copying disparate datasets to centralized directory.
    """
    destination_dir = os.path.expanduser(destination_dir)
    os.makedirs(destination_dir, exist_ok=True)
    filename = os.path.basename(source_path)
    destination_path = os.path.join(destination_dir, filename)
    
    if os.path.exists(destination_path):
        print(f"{filename} already exists in {destination_dir}")
        return False
    try:
        shutil.copy2(source_path, destination_dir)
        return True
    except Exception:
        print(f"Error copying {filename}")
        return False

def move_file(source_path, destination_dir):
    """
    Move file from sourch path to target directory. 
    Use for moving disparate datasets to centralized directory.
    """
    destination_dir = os.path.expanduser(destination_dir)
    os.makedirs(destination_dir, exist_ok=True)
    filename = os.path.basename(source_path)
    destination_path = os.path.join(destination_dir, filename)

    if os.path.exists(destination_path):
        print(f"{filename} already exists in {destination_dir}")
        return False
    try:
        shutil.move(source_path, destination_dir)
        return True
    except Exception:
        print(f"Error moving {filename}")
        return False

def to_snake_case(name):
    """
    Many datasets and folders containing spaces and special characters.
    Snake case for easier handling.
    """
    name = name.replace('.', '_').replace('-', '_')
    name = ''.join(c.lower() if c.isalnum() or c == '_' else '_' for c in name)
    while '__' in name:
        name = name.replace('__', '_')
    return name.strip('_')

def map_census_aliases(file_path, column_mapping_dict):
    """
    Census datasets are double headered. 2 part function to handle this:
    1. Map column aliases to column codes and add dict mapping to corresponding 
    dataset dict entry.
    2. Create a multiindex pandas dataframe using codes and aliases 

    Returns:
    Multiindex pandas dataframe using codes and aliases
    """
    file_path = os.path.expanduser(file_path)
    
    # Map aliases to column codes using csv module to properly handle quoted fields
    with open(file_path, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        codes = next(csv_reader)  # First row contains codes
        names = next(csv_reader)  # Second row contains names
        # Update provided dictionary directly
        column_mapping_dict.update(dict(zip(names, codes)))
    
    # Read data using column aliases
    df = pd.read_csv(file_path, skiprows=[0])
    
    # Create MultiIndex columns with names as primary level
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(codes, names)),
        names=['Code', 'Alias']
    )
    
    return df

def census_drop_cols(df, cols_to_drop):
    """
    Drop columns from multiindex dataframes based on alias header.
    Drop listed estimate columns and corresponding margin of error columns.
    Drop Empty columns.
    
    Returns:
    Multiindex pandas dataframe with estimate and margin of error columns dropped.
    """
    column_names = df.columns.names
    
    aliases = df.columns.get_level_values('Alias')
    codes = df.columns.get_level_values('Code')
    
    # Create a mapping of aliases to their original columns
    alias_to_col = {}
    for col, alias in zip(df.columns, aliases):
        # Remove quotes if present
        clean_alias = alias.strip('"')
        alias_to_col[clean_alias] = col
    
    # Initialize columns to drop
    columns_to_drop = set()
    
    # Drop specified columns and their margin of error pairs
    for remove_col in cols_to_drop:
        # Look for exact matches in cleaned aliases
        if remove_col in alias_to_col:
            columns_to_drop.add(alias_to_col[remove_col])
            
            # If this is an estimate column, find and drop corresponding margin of error
            if remove_col.startswith('Estimate!!'):
                margin_col = 'Margin of Error!!' + remove_col[len('Estimate!!'):]
                if margin_col in alias_to_col:
                    columns_to_drop.add(alias_to_col[margin_col])
    
    # Drop only columns that have both missing/empty headers AND no values
    for col, alias, code in zip(df.columns, aliases, codes):
        # Check for missing or empty headers in either level
        has_empty_header = (pd.isna(alias) or pd.isna(code) or 
                          alias.strip('"').strip() == '' or code.strip('"').strip() == '')
        
        has_values = not df[col].isna().all()
        
        if has_empty_header and not has_values:
            columns_to_drop.add(col)
    
    # Convert set back to list for dropping
    columns_to_drop = list(columns_to_drop)
    
    print(fr"\Dropping {len(columns_to_drop)} columns ({len(columns_to_drop)//2} pairs)")
    
    cleaned_df = df.drop(columns=columns_to_drop)
    cleaned_df.columns.names = column_names
    
    return cleaned_df

def export_census_csv(df, output_dir, filename, overwrite=False):
    """
    Export a multiindex dataframe to a single header csv (Aliases, drop Codes header).
    If overwrite, then overwrites existing files with same name, else throw error
    """
    try:
        output_path = os.path.join(os.path.expanduser(output_dir), filename)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Check if file exists and we're not overwriting
        if os.path.exists(output_path) and not overwrite:
            raise FileExistsError(f"File {filename} already exists and overwrite=False")
        
        # Get both column headers levels
        aliases = df.columns.get_level_values('Alias')
        cleaned_aliases = [alias.replace('"', '').replace('"', '').strip() 
                           if isinstance(alias, str) else alias for alias in aliases]

        temp_df = df.copy()
        
        # Write to csv using csv module to properly handle commas in column names
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Write only the aliases row
            writer.writerow(cleaned_aliases)
            
            temp_df.columns = cleaned_aliases
            temp_df.to_csv(f, index=False, header=False)
            
        print(f"Exported to {output_path}")
        return True
        
    except Exception as e:
        print(f"Error exporting CSV: {str(e)}")
        return False

In [3]:
path_head = os.path.expanduser("~/Desktop/Nextcloud/SCOVI Project/")
notes_wb_path = os.path.join(path_head, "Metrics/Notes/JPL_CCSVI_all_fields.xlsx")

In [4]:
notes_wb = load_workbook(notes_wb_path)
sheet = notes_wb.active

In [5]:
file_blocks = {}
current_file_name = None
cols_not_to_drop = ["Geography", "Geographic Area Name"] 

# Iterate through rows to find blocks for each csv file
for row in sheet.iter_rows(min_row=1, max_col=3, values_only=False):
    cell_value = row[0].value
    is_bold = row[0].font.bold if row[0].font else False
    
    # Detect file block by bold file name
    if is_bold and cell_value:
        original_name = cell_value.strip()
        current_file_name = to_snake_case(original_name)
        file_blocks[current_file_name] = {
            'original_name': original_name,
            'cols_to_drop': [],
            'code_to_alias_column_mappings': {},
            'original_file_path': '',
            'centralized_file_dir': '' 
        }
        continue
    
    # Check for columns with a "Subfield to keep" value
    if current_file_name and any(col.value for col in row):
        
        col_name = row[0].value
        subfield_value = row[1].value
        # Track columns with no value in the subfields to keep column
        if col_name and not subfield_value:
            if col_name not in cols_not_to_drop:
                file_blocks[current_file_name]['cols_to_drop'].append(col_name.strip())

In [6]:
list(file_blocks.items())

[('age_of_structure',
  {'original_name': 'Age of Structure',
   'cols_to_drop': ['Estimate!!Total:',
    'Estimate!!Total:!!Built 2020 or later',
    'Estimate!!Total:!!Built 2010 to 2019',
    'Estimate!!Total:!!Built 2000 to 2009',
    'Estimate!!Total:!!Built 1990 to 1999'],
   'code_to_alias_column_mappings': {},
   'original_file_path': '',
   'centralized_file_dir': ''}),
 ('aggregate_number_of_vehicles_available_by_tenure',
  {'original_name': 'Aggregate number of vehicles available by tenure',
   'cols_to_drop': ['Estimate!!Aggregate number of vehicles available:!!Owner occupied',
    'Estimate!!Aggregate number of vehicles available:!!Renter occupied'],
   'code_to_alias_column_mappings': {},
   'original_file_path': '',
   'centralized_file_dir': ''}),
 ('health_insurance_coverage_by_age',
  {'original_name': 'Health insurance coverage by age',
   'cols_to_drop': ['Estimate!!Total:',
    'Estimate!!Total:!!Under 19 years:',
    'Estimate!!Total:!!Under 19 years:!!With one ty

In [7]:
exposures_path_head = os.path.expanduser("~/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures")
central_path_head = os.path.expanduser("~/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/JPL")
cleaned_path_head = os.path.expanduser("~/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data")
json_dir_path = os.path.expanduser("~/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/census-jsons")

### Age of Structure

In [8]:
key = 'age_of_structure'
alias = key
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Higher risk housing and infrastructure/Age of Structure/Age of Structure/ACSDT5Y2022.B25034-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

# file_blocks['age_of_structure']['code_to_alias_column_mappings'] = column_mapping

age_of_structure_df = pd.read_csv(file)
age_of_structure_df.shape

(1084, 25)

In [9]:
# alias_df.head(2)

In [10]:
alias_df.shape

(1083, 25)

In [11]:
# list(file_blocks.items())[0]

In [12]:
# file_blocks['age_of_structure']['cols_to_drop']

In [13]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 11 columns (5 pairs)


(1083, 14)

In [14]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/age_of_structure.csv


True

In [15]:
age_of_structure_df.head(5)

Unnamed: 0,GEO_ID,NAME,B25034_001E,B25034_001M,B25034_002E,B25034_002M,B25034_003E,B25034_003M,B25034_004E,B25034_004M,...,B25034_007M,B25034_008E,B25034_008M,B25034_009E,B25034_009M,B25034_010E,B25034_010M,B25034_011E,B25034_011M,Unnamed: 24
0,Geography,Geographic Area Name,Estimate!!Total:,Margin of Error!!Total:,Estimate!!Total:!!Built 2020 or later,Margin of Error!!Total:!!Built 2020 or later,Estimate!!Total:!!Built 2010 to 2019,Margin of Error!!Total:!!Built 2010 to 2019,Estimate!!Total:!!Built 2000 to 2009,Margin of Error!!Total:!!Built 2000 to 2009,...,Margin of Error!!Total:!!Built 1970 to 1979,Estimate!!Total:!!Built 1960 to 1969,Margin of Error!!Total:!!Built 1960 to 1969,Estimate!!Total:!!Built 1950 to 1959,Margin of Error!!Total:!!Built 1950 to 1959,Estimate!!Total:!!Built 1940 to 1949,Margin of Error!!Total:!!Built 1940 to 1949,Estimate!!Total:!!Built 1939 or earlier,Margin of Error!!Total:!!Built 1939 or earlier,
1,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,601,110,0,12,24,21,33,28,...,80,87,39,17,18,5,6,9,14,
2,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,340,71,0,12,0,12,77,38,...,6,57,70,28,14,29,16,76,36,
3,1500000US150010201003,Block Group 3; Census Tract 201; Hawaii County...,728,152,0,12,46,38,140,94,...,37,70,30,141,73,67,71,49,23,
4,1500000US150010201004,Block Group 4; Census Tract 201; Hawaii County...,420,101,0,12,31,26,0,12,...,25,56,37,45,29,24,22,144,55,


## No longer listed in JPL?

### Aggregate income

In [16]:
# key = 'aggregate_income_deficit_dollars_in_the_past_12_months_for_families_by_family_type'
# alias = "aggregate_income"
# original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Low income levels/aggregate_income/ACSDT5Y2022.B17011-Data.csv"
# centralized_file_dir = os.path.join(central_path_head, alias)

# # Update file_blocks entry
# file_blocks[key]['original_file_path'] = original_file_path
# file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # # Copy the file to a centralized directory
# # copy_file(original_file_path, centralized_file_dir)

# # Build path to centralized directory path
# file_name = os.path.basename(original_file_path)
# file = os.path.join(centralized_file_dir, file_name)

# cols_to_drop = file_blocks[key]['cols_to_drop']
# column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

# alias_df = map_census_aliases(file, column_mapping_dict)

# aggregate_income_df = pd.read_csv(file)
# aggregate_income_df.shape

In [17]:
# alias_df.shape

In [18]:
# test = census_drop_cols(alias_df, cols_to_drop)
# test.shape

In [19]:
# test.head(2)

In [20]:
# export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

### Aggregate number of vehicles

In [21]:
key = "aggregate_number_of_vehicles_available_by_tenure"
alias = "aggregate_vehicles"
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Mobility constraints/ACSDT5Y2022.B25046-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

aggregate_vehicles_df = pd.read_csv(file)
aggregate_vehicles_df.shape

(1084, 9)

In [22]:
alias_df.shape

(1083, 9)

In [23]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 5 columns (2 pairs)


(1083, 4)

In [24]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,B25046_001E,B25046_001M
Alias,Geography,Geographic Area Name,Estimate!!Aggregate number of vehicles available:,Margin of Error!!Aggregate number of vehicles available:
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,813,182
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,505,106


In [25]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/aggregate_vehicles.csv


True

### Health insurance

In [26]:
key = "health_insurance_coverage_by_age"
alias = "health_insurance"
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Lack of social security and insurance/Health insurance coverage by age/ACSDT5Y2022.B27010-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

health_insurance_df = pd.read_csv(file)
health_insurance_df.shape

(1084, 135)

In [27]:
alias_df.shape

(1083, 135)

In [28]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 125 columns (62 pairs)


(1083, 10)

In [29]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,B27010_017E,B27010_017M,B27010_033E,B27010_033M,B27010_050E,B27010_050M,B27010_066E,B27010_066M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!Under 19 years:!!No health insurance coverage,Margin of Error!!Total:!!Under 19 years:!!No health insurance coverage,Estimate!!Total:!!19 to 34 years:!!No health insurance coverage,Margin of Error!!Total:!!19 to 34 years:!!No health insurance coverage,Estimate!!Total:!!35 to 64 years:!!No health insurance coverage,Margin of Error!!Total:!!35 to 64 years:!!No health insurance coverage,Estimate!!Total:!!65 years and over:!!No health insurance coverage,Margin of Error!!Total:!!65 years and over:!!No health insurance coverage
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,0,12,42,49,40,35,0,12
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,7,10,42,57,7,8,7,9


In [30]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/health_insurance.csv


True

# SKIP

### Household income 12 month

In [31]:
# key = "household_income_in_the_past_12_month"
# alias = "household_income"
# original_file_path = exposures_path_head + r"Table2/Data Describing Unsafe Conditions/Low Income Levels/household_income/ACSDT5Y2022.B19001-Data.csv"
# centralized_file_dir = os.path.join(central_path_head, alias)

# # Update file_blocks entry
# file_blocks[key]['original_file_path'] = original_file_path
# file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # # Copy the file to a centralized directory
# # copy_file(original_file_path, centralized_file_dir)

# # Build path to centralized directory path
# file_name = os.path.basename(original_file_path)
# file = os.path.join(centralized_file_dir, file_name)

# cols_to_drop = file_blocks[key]['cols_to_drop']
# column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

# alias_df = map_census_aliases(file, column_mapping_dict)

# household_income_df = pd.read_csv(file)
# household_income_df.shape

In [32]:
# alias_df.shape

In [33]:
# test = census_drop_cols(alias_df, cols_to_drop)
# test.shape

In [34]:
# test.head(2)

In [35]:
# export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

### Households with a computer

In [36]:
key = "households_with_a_computer"
alias = "households_w_computer"
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Language and communication barriers/households_w_computer/ACSDT5Y2022.B28010-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

household_computer_df = pd.read_csv(file)
household_computer_df.shape

(1084, 17)

In [37]:
alias_df.shape

(1083, 17)

In [38]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 13 columns (6 pairs)


(1083, 4)

In [39]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,B28010_007E,B28010_007M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!No Computer,Margin of Error!!Total:!!No Computer
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,116,49
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,30,37


In [40]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/households_w_computer.csv


True

### Income

In [41]:
# Cannot find file

### Internet subscription

In [42]:
key = "internet_subscription_in_household"
alias = "internet_subscription"
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Language and communication barriers/internet_subscription/ACSDT5Y2022.B28011-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

internet_sub_df = pd.read_csv(file)
internet_sub_df.shape

(1084, 19)

In [43]:
alias_df.shape

(1083, 19)

In [44]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 15 columns (7 pairs)


(1083, 4)

In [45]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,B28011_008E,B28011_008M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!No Internet access,Margin of Error!!Total:!!No Internet access
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,160,56
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,45,39


In [46]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/internet_subscription.csv


True

### Limited English speaking

In [47]:
key = "limited_english_speaking_households"
alias = "limited_english_speaking"
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Language and communication barriers/limited_english_speaking/ACSDT5Y2022.C16002-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

limited_eng_df = pd.read_csv(file)
limited_eng_df.shape

(1084, 31)

In [48]:
alias_df.shape

(1083, 31)

In [49]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 21 columns (10 pairs)


(1083, 10)

In [50]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,C16002_004E,C16002_004M,C16002_007E,C16002_007M,C16002_010E,C16002_010M,C16002_013E,C16002_013M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!Spanish:!!Limited English speaking household,Margin of Error!!Total:!!Spanish:!!Limited English speaking household,Estimate!!Total:!!Other Indo-European languages:!!Limited English speaking household,Margin of Error!!Total:!!Other Indo-European languages:!!Limited English speaking household,Estimate!!Total:!!Asian and Pacific Island languages:!!Limited English speaking household,Margin of Error!!Total:!!Asian and Pacific Island languages:!!Limited English speaking household,Estimate!!Total:!!Other languages:!!Limited English speaking household,Margin of Error!!Total:!!Other languages:!!Limited English speaking household
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,0,12,0,12,53,36,0,12
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,0,12,0,12,4,5,0,12


In [51]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/limited_english_speaking.csv


True

### Living Arrangements

In [52]:
key = "living_arragements_including_living_alone_by_sex_and_relationship"
alias = "living_arrangements"
original_file_path = exposures_path_head + r"Table1/Data Describing Unsafe Conditions/Historical an other disparities_race or origin_and_gender/living_arrangements/ACSDT5Y2022.B09019-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

living_arrangements_df = pd.read_csv(file)
living_arrangements_df.shape

(1084, 55)

In [53]:
alias_df.shape

(1083, 55)

In [54]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 49 columns (24 pairs)


(1083, 6)

In [55]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,B09019_005E,B09019_005M,B09019_008E,B09019_008M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!In households:!!Householder:!!Male:!!Living alone,Margin of Error!!Total:!!In households:!!Householder:!!Male:!!Living alone,Estimate!!Total:!!In households:!!Householder:!!Female:!!Living alone,Margin of Error!!Total:!!In households:!!Householder:!!Female:!!Living alone
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,79,36,49,30
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,31,18,46,40


In [56]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/living_arrangements.csv


True

# SKIP

### Persons in poverty

In [57]:
# key = "persons_in_poverty"
# alias = key
# original_file_path = exposures_path_head + r"Table2/Data Describing Unsafe Conditions/Low Income Levels/persons_in_poverty/ACSDT5Y2022.C17002-Data.csv"
# centralized_file_dir = os.path.join(central_path_head, alias)

# # Update file_blocks entry
# file_blocks[key]['original_file_path'] = original_file_path
# file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # # Copy the file to a centralized directory
# # copy_file(original_file_path, centralized_file_dir)

# # Build path to centralized directory path
# file_name = os.path.basename(original_file_path)
# file = os.path.join(centralized_file_dir, file_name)

# cols_to_drop = file_blocks[key]['cols_to_drop']
# column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

# alias_df = map_census_aliases(file, column_mapping_dict)

# persons_in_poverty_df = pd.read_csv(file)
# persons_in_poverty_df.shape

In [58]:
# alias_df.shape

In [59]:
# test = census_drop_cols(alias_df, cols_to_drop)
# test.shape

In [60]:
# test.head(2)

In [61]:
# export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

### Persons under 5 & 65

In [62]:
key = "persons_under_5_65_years_table"
alias = "person_under_5_65"
original_file_path = "All Exposures/Table2/Data Describing Unsafe Conditions/Special Group at Risk/person_under_5_65/ACSDT5Y2022.B01001-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

persons_under_5_65_df = pd.read_csv(file)
persons_under_5_65_df.shape

(1084, 101)

In [63]:
alias_df.shape

(1083, 101)

In [64]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 59 columns (29 pairs)


(1083, 42)

In [65]:
test.head(2)

Code,"﻿""GEO_ID""",NAME,B01001_002E,B01001_002M,B01001_003E,B01001_003M,B01001_004E,B01001_004M,B01001_005E,B01001_005M,...,B01001_045E,B01001_045M,B01001_046E,B01001_046M,B01001_047E,B01001_047M,B01001_048E,B01001_048M,B01001_049E,B01001_049M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!Male:,Margin of Error!!Total:!!Male:,Estimate!!Total:!!Male:!!Under 5 years,Margin of Error!!Total:!!Male:!!Under 5 years,Estimate!!Total:!!Male:!!5 to 9 years,Margin of Error!!Total:!!Male:!!5 to 9 years,Estimate!!Total:!!Male:!!10 to 14 years,Margin of Error!!Total:!!Male:!!10 to 14 years,...,Estimate!!Total:!!Female:!!67 to 69 years,Margin of Error!!Total:!!Female:!!67 to 69 years,Estimate!!Total:!!Female:!!70 to 74 years,Margin of Error!!Total:!!Female:!!70 to 74 years,Estimate!!Total:!!Female:!!75 to 79 years,Margin of Error!!Total:!!Female:!!75 to 79 years,Estimate!!Total:!!Female:!!80 to 84 years,Margin of Error!!Total:!!Female:!!80 to 84 years,Estimate!!Total:!!Female:!!85 years and over,Margin of Error!!Total:!!Female:!!85 years and over
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,766,193,37,46,39,36,76,47,...,0,12,44,33,54,40,8,11,21,22
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,301,88,9,8,9,8,26,16,...,9,9,8,10,5,6,5,6,29,37


In [66]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/person_under_5_65.csv


True

### Persons under 5

In [67]:
# Skipping per JPL notes as a duplicate of persons under 5 & 65

### Population in group quarters

In [68]:
key = "population_in_group_quarters"
alias = "population_group_quarters"
original_file_path = "All Exposures/Table1/Data Describing Unsafe Conditions/Higher risk housing and infrastructure/Population in group quarters/DECENNIALPL2020.P5-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

group_quarters_df = pd.read_csv(file)
group_quarters_df.shape

(1084, 12)

In [69]:
alias_df.shape

(1083, 12)

In [70]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 0 columns (0 pairs)


(1083, 12)

In [71]:
test.head(2)

Code,﻿GEO_ID,NAME,P5_001N,P5_002N,P5_003N,P5_004N,P5_005N,P5_006N,P5_007N,P5_008N,P5_009N,P5_010N
Alias,Geography,Geographic Area Name,!!Total:,!!Total:!!Institutionalized population:,!!Total:!!Institutionalized population:!!Correctional facilities for adults,!!Total:!!Institutionalized population:!!Juvenile facilities,!!Total:!!Institutionalized population:!!Nursing facilities/Skilled-nursing facilities,!!Total:!!Institutionalized population:!!Other institutional facilities,!!Total:!!Noninstitutionalized population:,!!Total:!!Noninstitutionalized population:!!College/University student housing,!!Total:!!Noninstitutionalized population:!!Military quarters,!!Total:!!Noninstitutionalized population:!!Other noninstitutional facilities
0,1500000US150010201001,"Block Group 1, Census Tract 201, Hawaii County...",0,0,0,0,0,0,0,0,0,0
1,1500000US150010201002,"Block Group 2, Census Tract 201, Hawaii County...",0,0,0,0,0,0,0,0,0,0


In [72]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/population_group_quarters.csv


True

# SKIP

### Race origin

In [73]:
# key = "race_origin"
# alias = key
# original_file_path = "All Exposures/Table1/Data Describing Unsafe Conditions/Historical an other disparities_race or origin_and_gender/race_origin/ACSDT5Y2022.B02001-Data.csv"
# centralized_file_dir = os.path.join(central_path_head, alias)

# # Update file_blocks entry
# file_blocks[key]['original_file_path'] = original_file_path
# file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # # Copy the file to a centralized directory
# # copy_file(original_file_path, centralized_file_dir)

# # Build path to centralized directory path
# file_name = os.path.basename(original_file_path)
# file = os.path.join(centralized_file_dir, file_name)

# cols_to_drop = file_blocks[key]['cols_to_drop']
# column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

# alias_df = map_census_aliases(file, column_mapping_dict)

# race_origin_df = pd.read_csv(file)
# race_origin_df.shape

In [74]:
# alias_df.shape

In [75]:
# test = census_drop_cols(alias_df, cols_to_drop)
# test.shape

In [76]:
# test.head(2)

In [77]:
# export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

### Tenure

In [78]:
key = "tenure"
alias = key
original_file_path = "All Exposures/Table1/Data Describing Unsafe Conditions/Higher risk housing and infrastructure/Tenure/ACSDT5Y2022.B25003-Data.csv"
centralized_file_dir = os.path.join(central_path_head, alias)

# Update file_blocks entry
file_blocks[key]['original_file_path'] = original_file_path
file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # Copy the file to a centralized directory
# copy_file(original_file_path, centralized_file_dir)

# Build path to centralized directory path
file_name = os.path.basename(original_file_path)
file = os.path.join(centralized_file_dir, file_name)

cols_to_drop = file_blocks[key]['cols_to_drop']
column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

alias_df = map_census_aliases(file, column_mapping_dict)

tenure_df = pd.read_csv(file)
tenure_df.shape

(1084, 8)

In [79]:
alias_df.shape

(1083, 8)

In [80]:
test = census_drop_cols(alias_df, cols_to_drop)
test.shape

\Dropping 4 columns (2 pairs)


(1083, 4)

In [81]:
test.head(2)

Code,﻿GEO_ID,NAME,B25003_003E,B25003_003M
Alias,Geography,Geographic Area Name,Estimate!!Total:!!Renter occupied,Margin of Error!!Total:!!Renter occupied
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,249,68
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,41,20


In [82]:
export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

Exported to /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/tenure.csv


True

In [83]:
file_blocks.keys()

dict_keys(['age_of_structure', 'aggregate_number_of_vehicles_available_by_tenure', 'health_insurance_coverage_by_age', 'households_with_a_computer', 'internet_subscription_in_household', 'limited_english_speaking_households', 'living_arragements_including_living_alone_by_sex_and_relationship', 'income_share_of_fpl', 'persons_under_5_65_years_table', 'population_in_group_quarters', 'race_origin', 'tenure', '2022_census_hawaiian_homelands'])

# SKIP

### Types of Health Insurance

In [84]:
# key = "types_of_health_insurance_by_age"
# alias = "health_insurance_types"
# original_file_path = "All Exposures/Table1/Data Describing Unsafe Conditions/Lack of social security and insurance/Health insurance coverage by age/ACSDT5Y2022.B27010-Data.csv"
# centralized_file_dir = os.path.join(central_path_head, alias)

# # Update file_blocks entry
# file_blocks[key]['original_file_path'] = original_file_path
# file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # # Copy the file to a centralized directory
# # copy_file(original_file_path, centralized_file_dir)

# # Build path to centralized directory path
# file_name = os.path.basename(original_file_path)
# file = os.path.join(centralized_file_dir, file_name)

# cols_to_drop = file_blocks[key]['cols_to_drop']
# column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

# alias_df = map_census_aliases(file, column_mapping_dict)

# health_insurance_df = pd.read_csv(file)
# health_insurance_df.shape

In [85]:
# alias_df.shape

In [86]:
# test = census_drop_cols(alias_df, cols_to_drop)
# test.shape

In [87]:
# test.head(2)

In [88]:
# export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

# SKIP

### 2022 Census Hawaiian Homelands

In [89]:
# key = "2022_census_hawaiian_homelands"
# alias = key
# original_file_path = "All Exposures/Table1/Data Describing Unsafe Conditions/Historical an other disparities_race or origin_and_gender/2022_census_hawaiian_homelands/ACSST5Y2022.S0601-Data.csv"
# centralized_file_dir = os.path.join(central_path_head, alias)

# # Update file_blocks entry
# file_blocks[key]['original_file_path'] = original_file_path
# file_blocks[key]['centralized_file_dir'] = centralized_file_dir

# # # Copy the file to a centralized directory
# # copy_file(original_file_path, centralized_file_dir)

# # Build path to centralized directory path
# file_name = os.path.basename(original_file_path)
# file = os.path.join(centralized_file_dir, file_name)

# cols_to_drop = file_blocks[key]['cols_to_drop']
# column_mapping_dict = file_blocks[key]['code_to_alias_column_mappings']

# alias_df = map_census_aliases(file, column_mapping_dict)

# hawaiian_homelands_2022_df = pd.read_csv(file)
# hawaiian_homelands_2022_df.shape

In [90]:
# alias_df.shape

In [91]:
# test = census_drop_cols(alias_df, cols_to_drop)
# test.shape

In [92]:
# test.head(2)

In [93]:
# export_census_csv(test, cleaned_path_head, f"{alias}.csv", True)

## ============================================================

In [94]:
# import glob

# csv_files = glob.glob(os.path.join(cleaned_path_head, "*.csv"))

# # Process each CSV file
# for csv_file in csv_files:
#     print(f"Processing {csv_file}...")
#     try:
#         census_csv_to_json(csv_file, json_dir_path)
#     except Exception as e:
#         print(f"Error processing {csv_file}: {e}")

Processing /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/internet_subscription.csv...
JSON at /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/census-jsons/internet_subscription.json
Processing /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/aggregate_income.csv...
JSON at /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/census-jsons/aggregate_income.json
Processing /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/living_arrangements.csv...
JSON at /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/census-jsons/living_arrangements.json
Processing /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data/limited_english_speaking.csv...
JSON at /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/census-jsons/limited_english_speaking.json
Proc

JSON at /Users/andyyu/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/census-jsons/all_census.json


## Leaflet JSON 

In [4]:
# This version processes multiple csvs into a single master json
def census_csvs_to_master_json(csv_directory, json_path):
    """
    Process all census CSV files of a given directory into a single master JSON file.
    """       
    csv_directory = os.path.expanduser(csv_directory)
    json_path = os.path.expanduser(json_path)
    
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    
    csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))
    
    metrics = {}
    
    for csv_file in csv_files:
        csv_filename = os.path.basename(csv_file)
        census_metric_name = os.path.splitext(csv_filename)[0]
        
        try:
            na_values = ["", "-", "**", "null"]
            df = pd.read_csv(csv_file, na_values=na_values)
            
            for _, row in df.iterrows():
                geo_id = row['Geography']
                # Remove the "1500000US" prefix
                if isinstance(geo_id, str) and 'US' in geo_id:
                    geo_id = geo_id.split('US')[1]
                
                geo_name = row['Geographic Area Name']
                
                # Split geographic name
                geo_name_parts = [geo_name_part.strip() for geo_name_part in geo_name.split(';')]
                
                if geo_id not in metrics:
                    metrics[geo_id] = {
                        "block_group": geo_name_parts[0],
                        "census_tract": geo_name_parts[1],
                        "county": geo_name_parts[2],
                        "state": geo_name_parts[3],
                        "metrics": {}
                    }

                # Group metrics by csv
                if census_metric_name not in metrics[geo_id]["metrics"]:
                    metrics[geo_id]["metrics"][census_metric_name] = {}
                
                # Add all estimate columns
                for col in df.columns:
                    if col == 'Geography' or col == 'Geographic Area Name' or col.startswith('Margin of Error'):
                        continue
                    
                    field_name = col.replace('Estimate!!Total:!!', '')
                    if pd.isna(row[col]):
                        metrics[geo_id]["metrics"][census_metric_name][field_name] = None
                        continue
                                            
                    value = int(row[col])
                    
                    metrics[geo_id]["metrics"][census_metric_name][field_name] = value
                    
        except Exception as e:
            print(f"Error processing {csv_file}: {e}")
            continue
    
    # Write to JSON file
    with open(json_path, 'w') as f:
        json.dump(metrics, f, indent=2)
    
    print(f"JSON at {json_path}")
    return metrics

# # This version processes each csv into individual json files
# # This version also lacks handling of null values (e.g. aggregate_vehicles)
# def census_csv_to_json(csv_path, json_dir_path="./"):
#     """
#     Process a cleaned census csv into a GeoID-to-values JSON lookup file.
#     """   
#     csv_path = os.path.expanduser(csv_path)
#     json_dir_path = os.path.expanduser(json_dir_path)
    
#     os.makedirs(json_dir_path, exist_ok=True)
    
#     csv_filename = os.path.basename(csv_path)
#     json_filename = os.path.splitext(csv_filename)[0] + ".json"
#     json_path = os.path.join(json_dir_path, json_filename)
    
#     df = pd.read_csv(csv_path)
    
#     metrics = {}
    
#     for _, row in df.iterrows():
#         geo_id = row['Geography']
#         # Remove the "1500000US" prefix
#         if isinstance(geo_id, str) and 'US' in geo_id:
#             geo_id = geo_id.split('US')[1]
        
#         geo_name = row['Geographic Area Name']
        
#         # Split geographic name
#         geo_name_parts = [geo_name_part.strip() for geo_name_part in geo_name.split(';')]
        
#         metrics[geo_id] = {
#             "block_group": geo_name_parts[0],
#             "census_tract": geo_name_parts[1],
#             "county": geo_name_parts[2],
#             "state": geo_name_parts[3],
#             "metric": {}
#         }
        
#         # Add all estimate columns
#         for col in df.columns:
#             if col == 'Geography' or col == 'Geographic Area Name' or col.startswith('Margin of Error'):
#                 continue
            
#             # Clean up field name
#             field_name = col.replace('Estimate!!Total:!!', '')
#             value = int(row[col])
            
#             # Add to lookup
#             metrics[geo_id]["metric"][field_name] = value
    
#     # Write to JSON file
#     with open(json_path, 'w') as f:
#         json.dump(metrics, f, indent=2)
    
#     print(f"JSON at {json_path}")
#     return metrics

def generate_dataset_params(csv_directory, json_path):
    csv_directory = os.path.expanduser(csv_directory)
    json_path = os.path.expanduser(json_path)

    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    
    csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))
    
    dataset_params = {}
    
    for csv_file in csv_files:
        csv_filename = os.path.basename(csv_file)
        key = os.path.splitext(csv_filename)[0]
        
        dataset_params[key] = {
            'metricName': '',
            'metricLabel': '',
            'columnThresholds': {},
        }
        
        na_values = ["", "-", "**", "null"]
        df = pd.read_csv(csv_file, na_values=na_values)
        
        for col in df.columns:
            if col == 'Geography' or col == 'Geographic Area Name' or col.startswith('Margin of Error'):
                continue

            field_name = col.replace('Estimate!!Total:!!', '')
            
            # This section calculates quantiles using pandas qcut rounded to nearest factor of 5
            numeric_col = pd.to_numeric(df[col])
            
            labels, edges = pd.qcut(numeric_col, q=10, labels=False, retbins=True, duplicates='drop')
            
            # Round edges to nearest factor of 5 directly in the loop
            rounded_edges = [5 * round(edge/5) for edge in edges]

            dataset_params[key]['columnThresholds'][field_name] = {
                'thresholds': rounded_edges,
                'colors': ['#FFEDA0', '#FED976', '#FEB24C', '#FD8D3C', '#FC4E2A', '#E31A1C', '#BD0026', '#800026', '#5A0018', '#3A000F']
            }
            
    with open(json_path, 'w') as f:
        json.dump(dataset_params, f, indent=2)
        
    print(dataset_params)

    return dataset_params

In [None]:
json_output_path = os.path.join(json_dir_path,"all_census.json") 

metrics = census_csvs_to_master_json(cleaned_path_head, json_output_path)

In [6]:
cleaned_path_head = os.path.expanduser("~/Desktop/Nextcloud/SCOVI Project/Metrics/All Exposures/census/cleaned-data")

json_dir_path = os.path.expanduser("~/Desktop")
json_output_path = os.path.join(json_dir_path,"census_datasets_info.json") 

dataset_params = generate_dataset_params(cleaned_path_head, json_output_path)

11
