In [1]:
import duckdb

In [2]:
duckdb.sql("""
create or replace table Dirty as select row_number() over () as csv_row_id, * from read_csv('Battery_List_Data_ADA.2026-02-11.csv');
describe Dirty;
""")

┌────────────────────────────────────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│                      column_name                       │ column_type │  null   │   key   │ default │  extra  │
│                        varchar                         │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────────────────────────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ csv_row_id                                             │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ Manufacturer Name                                      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Brand1                                                 │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Model Number                                           │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Technology                                             │ VARCHAR     │ YES     │ NULL    │ NUL

In [3]:
duckdb.sql("""
create or replace table EntityCode as select * from read_csv('../EntityCodes.csv');
describe EntityCode;
""")

┌────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│      column_name       │ column_type │  null   │   key   │ default │  extra  │
│        varchar         │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ EntityCode             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ ProdMfr_Value__cleaned │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ ProdMfr_Value__cec     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└────────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

### ProdMfr

In [4]:
ob_prod_mfr = duckdb.sql("""
select csv_row_id, "Manufacturer Name" as ProdMfr_Value from Dirty;
""")
ob_prod_mfr.show(max_width=120)

┌────────────┬────────────────────────────────────────────────────┐
│ csv_row_id │                   ProdMfr_Value                    │
│   int64    │                      varchar                       │
├────────────┼────────────────────────────────────────────────────┤
│          1 │ Alpha ESS Co., Ltd.                                │
│          2 │ Altenergy Power System Inc.                        │
│          3 │ Altenergy Power System Inc.                        │
│          4 │ Altenergy Power System Inc.                        │
│          5 │ Altenergy Power System Inc.                        │
│          6 │ Altenergy Power System Inc.                        │
│          7 │ Altenergy Power System Inc.                        │
│          8 │ Altenergy Power System Inc.                        │
│          9 │ Altenergy Power System Inc.                        │
│         10 │ Altenergy Power System Inc.                        │
│          · │              ·                   

### ProdCode
If build the ProdCode, we first need to match the CEC manufacturer names to those in the list of EntityCodes.

In [5]:
duckdb.sql("""
select
    *
from ob_prod_mfr
where ProdMfr_Value not in (select ProdMfr_Value from EntityCode)
""")

┌────────────┬───────────────┐
│ csv_row_id │ ProdMfr_Value │
│   int64    │    varchar    │
├────────────┴───────────────┤
│           0 rows           │
└────────────────────────────┘

In [6]:
ob_prod_code_with_possible_dupes = duckdb.sql(r"""
select
    ob_prod_mfr.csv_row_id,
    format(
        '{}-{}',
        EntityCode,
        upper(regexp_replace("Model Number", '[^A-Za-z^0-9 ]|\s', '_', 'g'))
    ) as ProdCode_Value,
    EntityCode,
    "Model Number",
from ob_prod_mfr
join EntityCode on ob_prod_mfr.ProdMfr_Value == EntityCode.ProdMfr_Value__cec
join Dirty on ob_prod_mfr.csv_row_id = Dirty.csv_row_id
""")#.show(max_width=120)
ob_prod_code_with_possible_dupes

┌────────────┬──────────────────────────────────────┬────────────┬─────────────────────────────────┐
│ csv_row_id │            ProdCode_Value            │ EntityCode │          Model Number           │
│   int64    │               varchar                │  varchar   │             varchar             │
├────────────┼──────────────────────────────────────┼────────────┼─────────────────────────────────┤
│          1 │ AESS-SMILE_BAT_8_2PHA                │ AESS       │ SMILE-BAT-8.2PHA                │
│          2 │ ALTE-APBATTERY_48V_5_76KWH           │ ALTE       │ APbattery-48V/5.76kWh           │
│          3 │ ALTE-APBATTERY_51_2V_10_24KWH_U_N_1_ │ ALTE       │ APbattery-51.2V/10.24kWh-U(N=1) │
│          4 │ ALTE-APBATTERY_51_2V_20_48KWH_U_N_2_ │ ALTE       │ APbattery-51.2V/20.48kWh-U(N=2) │
│          5 │ ALTE-APBATTERY_51_2V_30_72KWH_U_N_3_ │ ALTE       │ APbattery-51.2V/30.72kWh-U(N=3) │
│          6 │ ALTE-APBATTERY_51_2V_40_96KWH_U_N_4_ │ ALTE       │ APbattery-51.2V/40.96kWh

There are no duplicates so we do not need to add an incremented number suffix.

In [7]:
duckdb.sql("""
select
    ProdCode_Value, count(ProdCode_Value) as cnt
from ob_prod_code_with_possible_dupes
group by ProdCode_Value
having cnt > 1
order by cnt desc
""")

┌────────────────┬───────┐
│ ProdCode_Value │  cnt  │
│    varchar     │ int64 │
├────────────────┴───────┤
│         0 rows         │
└────────────────────────┘

In [8]:
ob_prod_code = duckdb.sql("""
select csv_row_id, ProdCode_Value from ob_prod_code_with_possible_dupes
""")
ob_prod_code

┌────────────┬──────────────────────────────────────┐
│ csv_row_id │            ProdCode_Value            │
│   int64    │               varchar                │
├────────────┼──────────────────────────────────────┤
│          1 │ AESS-SMILE_BAT_8_2PHA                │
│          2 │ ALTE-APBATTERY_48V_5_76KWH           │
│          3 │ ALTE-APBATTERY_51_2V_10_24KWH_U_N_1_ │
│          4 │ ALTE-APBATTERY_51_2V_20_48KWH_U_N_2_ │
│          5 │ ALTE-APBATTERY_51_2V_30_72KWH_U_N_3_ │
│          6 │ ALTE-APBATTERY_51_2V_40_96KWH_U_N_4_ │
│          7 │ ALTE-APBATTERY_51_2V_51_20KWH_U_N_5_ │
│          8 │ ALTE-APBATTERY_51_2V_61_44KWH_U_N_6_ │
│          9 │ ALTE-APBATTERY_51_2V_71_68KWH_U_N_7_ │
│         10 │ ALTE-APBATTERY_51_2V_81_92KWH_U_N_8_ │
│          · │         ·                            │
│          · │         ·                            │
│          · │         ·                            │
│        854 │ XHTHE-SL00344U001L                   │
│        855 │ XHTHE-LC08350

### Description

In [9]:
ob_description = duckdb.sql("""
select csv_row_id, Description as Description_Value from Dirty;
""")
ob_description.show(max_width=120)

┌────────────┬─────────────────────────────────────────────────────┐
│ csv_row_id │                  Description_Value                  │
│   int64    │                       varchar                       │
├────────────┼─────────────────────────────────────────────────────┤
│          1 │ 9.8 kW, 8.2kWh lithium iron battery                 │
│          2 │ 5 kW, 5.76 kWh lithium iron phosphate battery       │
│          3 │ 6.14 kW, 10.24 kWh, lithium iron phosphate battery  │
│          4 │ 12.28 kW, 20.48 kWh, lithium iron phosphate battery │
│          5 │ 18.42 kW, 30.72 kWh, lithium iron phosphate battery │
│          6 │ 24.56 kW, 40.96 kWh, lithium iron phosphate battery │
│          7 │ 30.7 kW, 51.2 kWh, lithium iron phosphate battery   │
│          8 │ 36.84 kW, 61.44 kWh, lithium iron phosphate battery │
│          9 │ 42.98 kW, 71.68 kWh, lithium iron phosphate battery │
│         10 │ 49.12 kW, 81.92 kWh, lithium iron phosphate battery │
│          · │                    

### BatteryChemistryType
Below are the BatteryChemistryTypes reported by the CEC.

In [10]:
duckdb.sql("""
select distinct Technology as BatteryChemistryType_Value from Dirty;
""")

┌────────────────────────────┐
│ BatteryChemistryType_Value │
│          varchar           │
├────────────────────────────┤
│ Lithium Iron               │
│ Lithium Ion                │
│ Lithium Ion Phosphate      │
│ Lithium Iron Phosphate     │
│ Lead Acid                  │
└────────────────────────────┘

For "Lead Acid", the product description does not provide additional information, so we do not change it to something more specific.

In [11]:
duckdb.sql("""
select distinct
    "Manufacturer Name",
    Technology as BatteryChemistryType_Value,
    Description_Value,
from Dirty join ob_description using (csv_row_id)
where BatteryChemistryType_Value = 'Lead Acid'
""").show(max_width=120)

┌─────────────────────────────┬────────────────────────────┬───────────────────────────────────────┐
│      Manufacturer Name      │ BatteryChemistryType_Value │           Description_Value           │
│           varchar           │          varchar           │                varchar                │
├─────────────────────────────┼────────────────────────────┼───────────────────────────────────────┤
│ East Penn Manufacturing Co. │ Lead Acid                  │ 0.297 kW, 2.4 kWh, lead acid battery  │
│ BAE Batteries USA           │ Lead Acid                  │ 1.49 kW, 1.19 kWh lead acid battery   │
│ BAE Batteries USA           │ Lead Acid                  │ 12 kW, 40 kWh lead acid battery       │
│ East Penn Manufacturing Co. │ Lead Acid                  │ 0.244 kW, 1.97 kWh, lead acid battery │
│ East Penn Manufacturing Co. │ Lead Acid                  │ 0.146 kW, 1.15 kWh, lead acid battery │
│ BAE Batteries USA           │ Lead Acid                  │ 1.91 kW, 1.67 kWh lead acid ba

Note that the CEC data contains "Lithium Ion Phosphate" and "Lithium Iron", which appear to be typos.
We can verify that this is a typo by checking the descriptions of the associated batteries.

In [12]:
duckdb.sql("""
select distinct
    Technology as BatteryChemistryType_Value,
    'ion' in lower(Description_Value) <> 'iron' in lower(Description_Value) as should_be_iron,
    Description_Value,
from Dirty join ob_description using (csv_row_id)
where BatteryChemistryType_Value = 'Lithium Ion Phosphate'
""").show(max_width=120)

┌──────────────────────┬────────────────┬──────────────────────────────────────────────────────────────────────────────┐
│ BatteryChemistryTy…  │ should_be_iron │                              Description_Value                               │
│       varchar        │    boolean     │                                   varchar                                    │
├──────────────────────┼────────────────┼──────────────────────────────────────────────────────────────────────────────┤
│ Lithium Ion Phosph…  │ true           │ 12.8 kW, 25.6 kWh lithium iron phosphate battery                             │
│ Lithium Ion Phosph…  │ true           │ 33.28 kW, 66.56 kWh lithium iron phosphate battery                           │
│ Lithium Ion Phosph…  │ true           │ 28.16 kW, 56.32 kWh lithium iron phosphate battery                           │
│ Lithium Ion Phosph…  │ true           │ 20.48 kW, 40.96 kWh lithium iron phosphate battery                           │
│ Lithium Ion Phosph…  │ true   

For "Lithium Iron", a google search suggests that a lithium iron battery without phospate does not exist, so we conclude that "Lithium Iron" should be "Lithium Iron Phosphate".

In [13]:
duckdb.sql("""
select distinct
    "Manufacturer Name",
    Technology as BatteryChemistryType_Value,
    Description_Value,
from Dirty join ob_description using (csv_row_id)
where BatteryChemistryType_Value = 'Lithium Iron'
""").show(max_width=120)

┌──────────────────────────────────────────┬────────────────────────────┬──────────────────────────────────────────────┐
│            Manufacturer Name             │ BatteryChemistryType_Value │              Description_Value               │
│                 varchar                  │          varchar           │                   varchar                    │
├──────────────────────────────────────────┼────────────────────────────┼──────────────────────────────────────────────┤
│ Qcells North America                     │ Lithium Iron               │ 8.3kW, 15kWh, lithium iron battery system    │
│ SolaX Power Network Technology (Zhe ji…  │ Lithium Iron               │ 11.1 kW, 20 kWh, lithium iron battery system │
│ SolaX Power Network Technology (Zhe ji…  │ Lithium Iron               │ 5.5 kW, 10 kWh, lithium iron battery system  │
│ Qcells North America                     │ Lithium Iron               │ 11.1kW, 20kWh, lithium iron battery system   │
│ SolaX Power Network Technology

Now, we fix the inconsistencies and convert to Orange Button enumerations.

In [14]:
ob_battery_chemistry_type = duckdb.sql("""
select
    csv_row_id,
    case
        when Technology = 'Lead Acid' then 'LeadAcid'
        when Technology in ('Lithium Iron Phosphate', 'Lithium Ion Phosphate', 'Lithium Iron') then 'LiFePO4'
        when Technology = 'Lithium Ion' then 'LiIon'
    end as BatteryChemistryType_Value
from Dirty;
""")
duckdb.sql("""
select distinct BatteryChemistryType_Value from ob_battery_chemistry_type
""")

┌────────────────────────────┐
│ BatteryChemistryType_Value │
│          varchar           │
├────────────────────────────┤
│ LiIon                      │
│ LiFePO4                    │
│ LeadAcid                   │
└────────────────────────────┘

### ProdCertification.CertificationAgencyName
Below we see some multiple labels for the same certification agency:
- CSA rebranded to CSA Group in 2012.
- "TUV Rhainland of North America" is a mispelling of "TUV Rheinland of North America"

In [15]:
duckdb.sql("""
select distinct "UL 1973 Certification Certifying Entity" as CertificationAgencyName_Value from Dirty;
""")

┌────────────────────────────────┐
│ CertificationAgencyName_Value  │
│            varchar             │
├────────────────────────────────┤
│ CSA                            │
│ SGS                            │
│ TUV Rhainland of North America │
│ CSA Group                      │
│ Intertek                       │
│ UL                             │
│ TUV SUD                        │
│ TUV                            │
│ TUV Rheinland of North America │
│ TUV Rheinland                  │
│ TUV SUD America                │
├────────────────────────────────┤
│            11 rows             │
└────────────────────────────────┘

Now we normalize the labels.

In [16]:
ob_certification_agency_name = duckdb.sql("""
select
    csv_row_id,
    case "UL 1973 Certification Certifying Entity"
        when 'CSA' then 'CSA Group'
        when 'TUV Rhainland of North America' then 'TUV Rheinland of North America'
        else "UL 1973 Certification Certifying Entity"
    end as CertificationAgency__CertificationAgencyName_Value
from Dirty;
""")
duckdb.sql("""
select distinct CertificationAgency__CertificationAgencyName_Value from ob_certification_agency_name;
""")

┌────────────────────────────────────────────────────┐
│ CertificationAgency__CertificationAgencyName_Value │
│                      varchar                       │
├────────────────────────────────────────────────────┤
│ SGS                                                │
│ UL                                                 │
│ TUV SUD                                            │
│ TUV                                                │
│ TUV SUD America                                    │
│ TUV Rheinland                                      │
│ TUV Rheinland of North America                     │
│ CSA Group                                          │
│ Intertek                                           │
└────────────────────────────────────────────────────┘

### ProdCertification.CertificationDate

In [17]:
ob_certification_date = duckdb.sql("""
select csv_row_id, "UL 1973 Certification Certificate Date" as CertificationDate_Value from Dirty;
""")
ob_certification_date

┌────────────┬─────────────────────────┐
│ csv_row_id │ CertificationDate_Value │
│   int64    │          date           │
├────────────┼─────────────────────────┤
│          1 │ 2021-06-02              │
│          2 │ 2022-02-02              │
│          3 │ 2025-12-29              │
│          4 │ 2025-12-29              │
│          5 │ 2025-12-29              │
│          6 │ 2025-12-29              │
│          7 │ 2025-12-29              │
│          8 │ 2025-12-29              │
│          9 │ 2025-12-29              │
│         10 │ 2025-12-29              │
│          · │     ·                   │
│          · │     ·                   │
│          · │     ·                   │
│        854 │ 2023-03-10              │
│        855 │ 2024-06-20              │
│        856 │ 2022-11-07              │
│        857 │ 2024-09-09              │
│        858 │ 2024-09-06              │
│        859 │ 2024-09-09              │
│        860 │ 2023-01-05              │
│        861 │ 2

### ProdCertification.CertificationStandard
Here we see inconsistent formatting and typos:
- "Ed 3: 2022" vs "Ed. 3 : 2022"
- "Ed. 3 : 2023" should be "Ed. 3 : 2022"

In [18]:
duckdb.sql("""
select distinct "UL 1973 Certification Edition of UL 1973" as CertificationStandard_Value from Dirty;
""")

┌─────────────────────────────┐
│ CertificationStandard_Value │
│           varchar           │
├─────────────────────────────┤
│ Ed 3: 2022                  │
│ Ed. 2 : 2018                │
│ Ed. 3 : 2023                │
│ Ed. 3 : 2022                │
└─────────────────────────────┘

Now we fix these inconsistencies and convert to the Orange Button enumerations.

In [19]:
ob_certification_standard = duckdb.sql("""
select
    csv_row_id,
    case "UL 1973 Certification Edition of UL 1973"
        when 'Ed. 2 : 2018' then 'UL1973_2_2018'
        when 'Ed 3: 2022' then 'UL1973_3_2022'
        when 'Ed. 3 : 2022' then 'UL1973_3_2022'
        when 'Ed. 3 : 2023' then 'UL1973_3_2022'
    end as CertificationStandard_Value from Dirty;
""")
duckdb.sql("""
select distinct CertificationStandard_Value from ob_certification_standard
""")

┌─────────────────────────────┐
│ CertificationStandard_Value │
│           varchar           │
├─────────────────────────────┤
│ UL1973_2_2018               │
│ UL1973_3_2022               │
└─────────────────────────────┘

### EnergyCapacityNominal

In [20]:
ob_energy_capacity_nominal = duckdb.sql("""
select
    csv_row_id,
    'kWh' as EnergyCapacityNominal_Unit,
    "Nameplate Energy Capacity (kWh)" as EnergyCapacityNominal_Value,
from Dirty;
""")
ob_energy_capacity_nominal

┌────────────┬────────────────────────────┬─────────────────────────────┐
│ csv_row_id │ EnergyCapacityNominal_Unit │ EnergyCapacityNominal_Value │
│   int64    │          varchar           │           double            │
├────────────┼────────────────────────────┼─────────────────────────────┤
│          1 │ kWh                        │                         8.2 │
│          2 │ kWh                        │                        5.76 │
│          3 │ kWh                        │                       10.24 │
│          4 │ kWh                        │                       20.48 │
│          5 │ kWh                        │                       30.72 │
│          6 │ kWh                        │                       40.96 │
│          7 │ kWh                        │                        51.2 │
│          8 │ kWh                        │                       61.44 │
│          9 │ kWh                        │                       71.68 │
│         10 │ kWh                    

### DCOutput.PowerDCContinuousMax

In [21]:
ob_power_dc_continuous_max = duckdb.sql("""
select
    csv_row_id,
    'kW' as DCOutput__PowerDCContinuousMax_Unit,
    "Nameplate Energy Capacity (kWh)" as DCOutput__PowerDCContinuousMax_Value,
from Dirty;
""")
ob_power_dc_continuous_max

┌────────────┬─────────────────────────────────────┬──────────────────────────────────────┐
│ csv_row_id │ DCOutput__PowerDCContinuousMax_Unit │ DCOutput__PowerDCContinuousMax_Value │
│   int64    │               varchar               │                double                │
├────────────┼─────────────────────────────────────┼──────────────────────────────────────┤
│          1 │ kW                                  │                                  8.2 │
│          2 │ kW                                  │                                 5.76 │
│          3 │ kW                                  │                                10.24 │
│          4 │ kW                                  │                                20.48 │
│          5 │ kW                                  │                                30.72 │
│          6 │ kW                                  │                                40.96 │
│          7 │ kW                                  │                            

# Columns that are too hard.
Clean these later.

### EfficiencyBatteryRoundTrip

In [22]:
duckdb.sql("""
select distinct
    'decimal_percent' as EfficiencyBatteryRoundTrip_Unit,
    "Manufacturer Declared Roundtrip Efficiency (%, Ac-AC)1" as EfficiencyBatteryRoundTrip_Value,
from Dirty;
""").show(max_width=120)

┌──────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────┐
│ EfficiencyBatteryR…  │                               EfficiencyBatteryRoundTrip_Value                                │
│       varchar        │                                            varchar                                            │
├──────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────┤
│ decimal_percent      │ 90 [DPI 208]\n90 [DPI 480]                                                                    │
│ decimal_percent      │ No Information Submitted                                                                      │
│ decimal_percent      │ 89 [Lion Hybrid \nInverter 12kW]\n89 [Lion Hybrid \nInverter 15kW]                            │
│ decimal_percent      │ 89 [JKS-R-7P12-US]\n89 [JKS-R-11P12-US]                                                       │
│ decimal_percent      │ Inverte

In [23]:
duckdb.sql("""
select distinct
    "Certified JA12 Control  Strategies1"
from Dirty;
""").show(max_width=120)

┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                         Certified JA12 Control  Strategies1                                          │
│                                                       varchar                                                        │
├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ JKS-R-7P12-US - Basic, TOU, Advance DR\nJKS-R-11P12-US - Basic, TOU, Advance DR                                      │
│ Inverter 18kW- Basic, TOU, Advance DR                                                                                │
│ Sol-Ark 5K - Basic, TOU, Advanced\nSol-Ark 8K - Basic, TOU, Advanced\nSol-Ark 12K - Basic, TOU, Advanced\nSol-Ark …  │
│ Sanctuary 3 \nInverter 12kW - Basic, TOU, Advance DR\nSanctuary 3 \nInverter 18kW- Basic, TOU, Advance DR\nSanctua…  │
│ Force 3.6DC - TOU\nForce 5.7DC

In [24]:
duckdb.sql("""
select distinct "Declaration for JA12 Submitted1" from Dirty;
""").show(max_width=120)

┌─────────────────────────────────┐
│ Declaration for JA12 Submitted1 │
│             varchar             │
├─────────────────────────────────┤
│ Y (2022)                        │
│ N                               │
└─────────────────────────────────┘



# Ignored columns

In [25]:
duckdb.sql("""
select distinct Notes from Dirty;
""").show(max_width=120)

┌───────────────────────────────────────────────┐
│                     Notes                     │
│                    varchar                    │
├───────────────────────────────────────────────┤
│ NULL                                          │
│ Formerly Stored Energy Inc, PF5-LFP19200-2A01 │
│ Formerly Stored Energy Inc, PF5-LFP14400-2A01 │
│ Formerly Stored Energy Inc, PF5-LFP24000-2A01 │
│ Formerly Stored Energy Inc, PF5-LFP38400-2A01 │
│                                               │
│ Formerly Stored Energy Inc, PF5-LFP33600-2A01 │
│ Formerly Stored Energy Inc, PF5-LFP09600-2A01 │
│ Formerly Stored Energy Inc, PF5-LFP28800-2A01 │
└───────────────────────────────────────────────┘



In [26]:
duckdb.sql("""
select distinct "CEC Listing Date" from Dirty;
""").show(max_width=120)

┌──────────────────┐
│ CEC Listing Date │
│       date       │
├──────────────────┤
│ 2021-07-21       │
│ 2023-06-21       │
│ 2026-02-02       │
│ 2022-10-03       │
│ 2021-10-11       │
│ 2023-06-01       │
│ 2022-01-21       │
│ 2025-03-21       │
│ 2024-07-01       │
│ 2025-09-11       │
│     ·            │
│     ·            │
│     ·            │
│ 2024-03-01       │
│ 2023-08-21       │
│ 2024-04-22       │
│ 2023-01-11       │
│ 2021-04-12       │
│ 2025-04-01       │
│ 2024-05-01       │
│ 2023-07-03       │
│ 2025-08-21       │
│ 2025-04-11       │
├──────────────────┤
│     121 rows     │
│    (20 shown)    │
└──────────────────┘



In [27]:
duckdb.sql("""
select distinct "Last Update" from Dirty;
""").show(max_width=120)

┌─────────────┐
│ Last Update │
│    date     │
├─────────────┤
│ 2023-05-11  │
│ 2024-09-11  │
│ 2026-02-02  │
│ 2025-12-11  │
│ 2023-12-01  │
│ 2024-11-01  │
│ 2024-01-02  │
│ NULL        │
│ 2024-06-03  │
│ 2024-03-01  │
│ 2023-05-22  │
│ 2019-12-23  │
│ 2023-09-21  │
│ 2023-07-03  │
│ 2025-08-21  │
│ 2024-04-02  │
├─────────────┤
│   16 rows   │
└─────────────┘



# Building clean data
Tip: You can use the Jupyter Notebook Table of Contents feature to see what columns we need to join together.

We will separate the cleaned values into multiple CSVs.
This is not necessary if the arrays only have one entry, but we do so here for a consistent data processing pipeline.

### ProdBattery

In [28]:
ob_prod_batteries = duckdb.sql("""
select
    ob_prod_mfr.csv_row_id,
    * exclude(csv_row_id)
from ob_prod_mfr
natural join ob_prod_code
natural join ob_description
natural join ob_battery_chemistry_type
natural join ob_energy_capacity_nominal
natural join ob_power_dc_continuous_max
""")
ob_prod_batteries.show(max_width=120)

┌────────────┬──────────────────────┬───┬──────────────────────┬──────────────────────┬──────────────────────┐
│ csv_row_id │    ProdMfr_Value     │ … │ EnergyCapacityNomi…  │ DCOutput__PowerDCC…  │ DCOutput__PowerDCC…  │
│   int64    │       varchar        │   │        double        │       varchar        │        double        │
├────────────┼──────────────────────┼───┼──────────────────────┼──────────────────────┼──────────────────────┤
│          1 │ Alpha ESS Co., Ltd.  │ … │                  8.2 │ kW                   │                  8.2 │
│          2 │ Altenergy Power Sy…  │ … │                 5.76 │ kW                   │                 5.76 │
│          3 │ Altenergy Power Sy…  │ … │                10.24 │ kW                   │                10.24 │
│          4 │ Altenergy Power Sy…  │ … │                20.48 │ kW                   │                20.48 │
│          5 │ Altenergy Power Sy…  │ … │                30.72 │ kW                   │                30.72 │
│

### ProdCertification

In [29]:
ob_prod_certifications = duckdb.sql("""
select
    ob_certification_agency_name.csv_row_id as prodbattery_id,
    * exclude(csv_row_id),
    row_number() over () as csv_row_id,
from ob_certification_agency_name
natural join ob_certification_date
natural join ob_certification_standard
""")
ob_prod_certifications.show(max_width=120)

┌────────────────┬────────────────────────────────┬─────────────────────────┬─────────────────────────────┬────────────┐
│ prodbattery_id │ CertificationAgency__Certifi…  │ CertificationDate_Value │ CertificationStandard_Value │ csv_row_id │
│     int64      │            varchar             │          date           │           varchar           │   int64    │
├────────────────┼────────────────────────────────┼─────────────────────────┼─────────────────────────────┼────────────┤
│              1 │ TUV Rheinland of North America │ 2021-06-02              │ UL1973_2_2018               │          1 │
│              2 │ UL                             │ 2022-02-02              │ UL1973_2_2018               │          2 │
│              3 │ CSA Group                      │ 2025-12-29              │ UL1973_3_2022               │          3 │
│              4 │ CSA Group                      │ 2025-12-29              │ UL1973_3_2022               │          4 │
│              5 │ CSA Group    

### Write cleaned data

In [30]:
ob_prod_batteries.pl().write_csv('ProdBattery.csv')
ob_prod_certifications.pl().write_csv('ProdCertification.csv')