In [1]:
import duckdb
import polars as pl

In [2]:
# Load the raw CEC PV module CSV into the DuckDB table `Dirty` and prepend a
# `csv_row_id` column; every derived relation below is keyed on that id.
# The trailing `describe` is the statement whose result is displayed.
# NOTE(review): `row_number() over ()` has no ORDER BY — presumably the ids
# follow the CSV file order; confirm before relying on specific id values.
duckdb.sql("""
create or replace table Dirty as select row_number() over () as csv_row_id, * from read_csv('PV_Module_List_Full_Data_ADA.2026-02-11.csv');
describe Dirty;
""").show(max_width=120)

┌────────────────────────────────────────────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│                          column_name                           │ column_type │  null   │   key   │ default │  extra  │
│                            varchar                             │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────────────────────────────────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ csv_row_id                                                     │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ Manufacturer                                                   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Model Number                                                   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Description                                                    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Safety Certification          

In [3]:
# Load the manufacturer -> EntityCode lookup into the DuckDB table `EntityCode`
# and display its schema. `ProdMfr_Value__cec` is the column joined against the
# CEC "Manufacturer" names when ProdCodes are built further down.
duckdb.sql("""
create or replace table EntityCode as select * from read_csv('../EntityCodes.csv');
describe EntityCode;
""")

┌────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│      column_name       │ column_type │  null   │   key   │ default │  extra  │
│        varchar         │   varchar   │ varchar │ varchar │ varchar │ varchar │
├────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ EntityCode             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ ProdMfr_Value__cleaned │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ ProdMfr_Value__cec     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└────────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

### ProdMfr

In [4]:
# ProdMfr: the CSV "Manufacturer" column, renamed to ProdMfr_Value and keyed by
# csv_row_id. Later cells reference this relation by its Python variable name
# inside SQL.
ob_prod_mfr = duckdb.sql("""
select csv_row_id, "Manufacturer" as ProdMfr_Value from Dirty;
""")
ob_prod_mfr.show(max_width=120)

┌────────────┬──────────────────────────┐
│ csv_row_id │      ProdMfr_Value       │
│   int64    │         varchar          │
├────────────┼──────────────────────────┤
│          1 │ Ablytek                  │
│          2 │ Ablytek                  │
│          3 │ Ablytek                  │
│          4 │ Ablytek                  │
│          5 │ Ablytek                  │
│          6 │ Advance Power            │
│          7 │ Advance Power            │
│          8 │ Advance Power            │
│          9 │ Advance Power            │
│         10 │ Advance Power            │
│          · │       ·                  │
│          · │       ·                  │
│          · │       ·                  │
│       9991 │ Mission Solar Energy LLC │
│       9992 │ Mission Solar Energy LLC │
│       9993 │ Mission Solar Energy LLC │
│       9994 │ Mission Solar Energy LLC │
│       9995 │ Mission Solar Energy LLC │
│       9996 │ Mission Solar Energy LLC │
│       9997 │ Mission Solar Energ

### ProdCode
To build the ProdCode, we first need to match the CEC manufacturer names to those in the list of EntityCodes.

In [5]:
# Sanity check: list every row whose CEC manufacturer name has no entry in
# EntityCode. An empty result means the join that builds the ProdCodes will not
# silently drop any module.
#
# The lookup column is `ProdMfr_Value__cec`. EntityCode has no `ProdMfr_Value`
# column, so an unqualified `ProdMfr_Value` inside the subquery binds to the
# outer relation and the filter can never be true. `not exists` is used instead
# of `not in` so that NULLs on either side cannot hide unmatched rows.
duckdb.sql("""
select
    *
from ob_prod_mfr
where not exists (
    select 1
    from EntityCode
    where EntityCode.ProdMfr_Value__cec = ob_prod_mfr.ProdMfr_Value
)
""")

┌────────────┬───────────────┐
│ csv_row_id │ ProdMfr_Value │
│   int64    │    varchar    │
├────────────┴───────────────┤
│           0 rows           │
└────────────────────────────┘

In [6]:
# Build a candidate ProdCode per module as '<EntityCode>-<SANITISED MODEL NUMBER>'.
# The model number is upper-cased and every character that is not an ASCII
# letter or digit (including whitespace) is replaced by '_'.
#
# The character class is '[^A-Za-z0-9]': a '^' is only a negation when it is the
# first character of the class, so repeating it before '0-9' would exempt
# literal carets from the replacement.
#
# The inner join on ProdMfr_Value__cec drops any module whose manufacturer has
# no EntityCode. Candidate codes are not unique yet; duplicates are resolved in
# a later step.
ob_prod_code_with_possible_dupes = duckdb.sql("""
select
    ob_prod_mfr.csv_row_id,
    format(
        '{}-{}',
        EntityCode,
        upper(regexp_replace("Model Number", '[^A-Za-z0-9]', '_', 'g'))
    ) as ProdCode_Value,
    EntityCode,
    "Model Number",
from ob_prod_mfr
join EntityCode on ob_prod_mfr.ProdMfr_Value = EntityCode.ProdMfr_Value__cec
join Dirty on ob_prod_mfr.csv_row_id = Dirty.csv_row_id
""")
ob_prod_code_with_possible_dupes

┌────────────┬──────────────────────┬────────────┬───────────────┐
│ csv_row_id │    ProdCode_Value    │ EntityCode │ Model Number  │
│   int64    │       varchar        │  varchar   │    varchar    │
├────────────┼──────────────────────┼────────────┼───────────────┤
│          1 │ ABLYT-6MN6A270       │ ABLYT      │ 6MN6A270      │
│          2 │ ABLYT-6MN6A275       │ ABLYT      │ 6MN6A275      │
│          3 │ ABLYT-6MN6A280       │ ABLYT      │ 6MN6A280      │
│          4 │ ABLYT-6MN6A285       │ ABLYT      │ 6MN6A285      │
│          5 │ ABLYT-6MN6A290       │ ABLYT      │ 6MN6A290      │
│          6 │ ADVP-API_P210        │ ADVP       │ API-P210      │
│          7 │ ADVP-API_P215        │ ADVP       │ API-P215      │
│          8 │ ADVP-API_P220        │ ADVP       │ API-P220      │
│          9 │ ADVP-API_M225        │ ADVP       │ API-M225      │
│         10 │ ADVP-API_P225        │ ADVP       │ API-P225      │
│          · │       ·              │  ·         │    ·       

There are duplicates so we need to add an incremented number suffix.

In [7]:
# List every candidate ProdCode that occurs more than once, most frequent
# first. A non-empty result is what motivates the numeric suffix added below.
duckdb.sql("""
select
    ProdCode_Value, count(ProdCode_Value) as cnt
from ob_prod_code_with_possible_dupes
group by ProdCode_Value
having cnt > 1
order by cnt desc
""")

┌──────────────────────────┬───────┐
│      ProdCode_Value      │  cnt  │
│         varchar          │ int64 │
├──────────────────────────┼───────┤
│ ENRGT-EG_108L410W        │     2 │
│ ENRGT-EG_120L440W        │     2 │
│ ENRGT-EG_144L535W        │     2 │
│ ENRGT-EG_144L545W        │     2 │
│ JSRNE-HY_DH108P8B_405    │     2 │
│ JSRNE-HY_DH108P8B_410    │     2 │
│ JSRNE-HY_DH108N8B_425    │     2 │
│ JSRNE-HY_DH144N8B_560    │     2 │
│ JSRNE-HY_DH108N8B_410    │     2 │
│ JSRNE-HY_DH144N8B_570    │     2 │
│           ·              │     · │
│           ·              │     · │
│           ·              │     · │
│ TSHNE-HTM595MH8_60_1500_ │     2 │
│ TSHNE-HTM605MH8_60_1500_ │     2 │
│ TSHNE-HTM665MH8_66_1500_ │     2 │
│ JSRNE-HY_DH108P8B_390    │     2 │
│ JSRNE-HY_DH108P8B_400    │     2 │
│ JSRNE-HY_DH108N8B_405    │     2 │
│ JSRNE-HY_DH108N8B_415    │     2 │
│ JSRNE-HY_DH144P8B_550    │     2 │
│ TSHNE-HTM540MH5_72_1500_ │     2 │
│ HANWH-HSL72P6_PC_3_305   │     2 │
├

Notice that an incremental number suffix ('-1', '-2', …) is only added to a ProdCode if that ProdCode is duplicated.

In [8]:
# Make the ProdCodes unique: a code that occurs once is kept as is, while each
# member of a duplicated group gets a '-<n>' suffix (n = 1, 2, ...).
#
# `cnt`       = number of rows sharing the candidate code.
# `increment` = position of the row inside its group. The window is ordered by
#               csv_row_id so that the same module receives the same suffix on
#               every run; without an ORDER BY the numbering within a group is
#               arbitrary.
# NOTE(review): a suffixed code such as 'X-1' could in principle collide with
# an unrelated candidate code that already equals 'X-1' — confirm this cannot
# happen in the data.
ob_prod_code_increment_compute = duckdb.sql("""
select
    csv_row_id,
    count(ProdCode_Value) over(partition by ProdCode_Value) as cnt,
    row_number() over(partition by ProdCode_Value order by csv_row_id) as increment,
    if(cnt > 1, ProdCode_Value || '-' || increment, ProdCode_Value) as ProdCode_Value,
from ob_prod_code_with_possible_dupes
order by ProdCode_Value desc
""")
ob_prod_code_increment_compute

┌────────────┬───────┬───────────┬──────────────────────────┐
│ csv_row_id │  cnt  │ increment │      ProdCode_Value      │
│   int64    │ int64 │   int64   │         varchar          │
├────────────┼───────┼───────────┼──────────────────────────┤
│      20884 │     1 │         1 │ ZSRNE-RK_95              │
│      20883 │     1 │         1 │ ZSRNE-RK_75_2            │
│      20888 │     1 │         1 │ ZSRNE-RK_110             │
│      20887 │     1 │         1 │ ZSRNE-RK_105_2           │
│      20886 │     1 │         1 │ ZSRNE-RK_105             │
│      20885 │     1 │         1 │ ZSRNE-RK_100             │
│      20912 │     1 │         1 │ ZNSHIN-ZXP6_D60_255_P    │
│      20908 │     1 │         1 │ ZNSHIN-ZXP6_D60_250_P    │
│      20904 │     1 │         1 │ ZNSHIN-ZXP6_D60_245_P    │
│      20900 │     1 │         1 │ ZNSHIN-ZXP6_D60_240_P    │
│        ·   │     · │         · │           ·              │
│        ·   │     · │         · │           ·              │
│       

In [9]:
# Final ProdCode relation: keep only the key and the de-duplicated code,
# dropping the helper columns `cnt` and `increment`.
ob_prod_code = duckdb.sql("""
select csv_row_id, ProdCode_Value from ob_prod_code_increment_compute
""")
ob_prod_code

┌────────────┬──────────────────────────┐
│ csv_row_id │      ProdCode_Value      │
│   int64    │         varchar          │
├────────────┼──────────────────────────┤
│      20884 │ ZSRNE-RK_95              │
│      20883 │ ZSRNE-RK_75_2            │
│      20888 │ ZSRNE-RK_110             │
│      20887 │ ZSRNE-RK_105_2           │
│      20886 │ ZSRNE-RK_105             │
│      20885 │ ZSRNE-RK_100             │
│      20912 │ ZNSHIN-ZXP6_D60_255_P    │
│      20908 │ ZNSHIN-ZXP6_D60_250_P    │
│      20904 │ ZNSHIN-ZXP6_D60_245_P    │
│      20900 │ ZNSHIN-ZXP6_D60_240_P    │
│        ·   │           ·              │
│        ·   │           ·              │
│        ·   │           ·              │
│      11099 │ PHONO-PS440M4H_24_THB    │
│      11098 │ PHONO-PS440M4H_24_TH     │
│      11095 │ PHONO-PS435M8GFH_18_VNHB │
│      11094 │ PHONO-PS435M8GFH_18_VNH  │
│      11093 │ PHONO-PS435M5H_24_THB    │
│      11091 │ PHONO-PS435M5GF_24_TH    │
│      11092 │ PHONO-PS435M5GFH_24

### Description

In [10]:
# Description: the CSV "Description" column passed through unchanged, renamed
# to Description_Value and keyed by csv_row_id.
ob_description = duckdb.sql("""
select csv_row_id, Description as Description_Value from Dirty;
""")
ob_description.show(max_width=120)

┌────────────┬────────────────────────────────────────────────────────────────────────────────────────┐
│ csv_row_id │                                   Description_Value                                    │
│   int64    │                                        varchar                                         │
├────────────┼────────────────────────────────────────────────────────────────────────────────────────┤
│          1 │ 270 W Monocrystalline Module                                                           │
│          2 │ 275 W Monocrystalline Module                                                           │
│          3 │ 280 W Monocrystalline Module                                                           │
│          4 │ 285 W Monocrystalline Module                                                           │
│          5 │ 290 W Monocrystalline Module                                                           │
│          6 │ 210 W Polycrystalline Module                     

### ProdCertifications
This is a list.

In [11]:
# Accumulator for the per-standard ProdCertification relations; the cells below
# append to it, so re-run this cell first when re-running any of them.
ob_prod_certifications = list()

### ProdCertification for "Safety Certification"
Some values have extra whitespace or inconsistent capitalization, and some are simply improper.

In [12]:
# Explore: list the distinct raw "Safety Certification" values so that every
# variant can be mapped explicitly in the cell that builds the certification.
duckdb.sql("""
select distinct "Safety Certification" from Dirty
""")

┌─────────────────────────────────────────────────────────────────────────────────────────────┐
│                                    Safety Certification                                     │
│                                           varchar                                           │
├─────────────────────────────────────────────────────────────────────────────────────────────┤
│ UL 61730                                                                                    │
│ NULL                                                                                        │
│ UL 61730, UL 1703                                                                           │
│ UL 61731                                                                                    │
│ UL 1703                                                                                     │
│ UL 1703                                                                                     │
│ Ul 61730                              

The improper value starts with '501 W'.
There is only one module with this improper value.
The [installation manual of Thornova Solar](https://www.thornova-solar.com/attached/file/na/manuals/Installation_Manual.pdf) says that all their modules have UL 61730 certifications, so we will use this value.

In [13]:
# Explore: show the row(s) whose "Safety Certification" holds a description
# string instead of a standard, to identify the affected manufacturer.
# NOTE(review): max_width=1 presumably just forces the middle columns to be
# elided in the rendered table — confirm this is intentional.
duckdb.sql("""
select * from Dirty where "Safety Certification" = '501 W, 120 half-cut cell double glass bifacial monocrystalline module, 1500V max system Vdc';
""").show(max_width=1)

┌────────────┬────────────────┬───┬──────────────────┬─────────────┐
│ csv_row_id │  Manufacturer  │ … │ CEC Listing Date │ Last Update │
│   int64    │    varchar     │   │       date       │    date     │
├────────────┼────────────────┼───┼──────────────────┼─────────────┤
│      16596 │ Thornova Solar │ … │ 2025-06-02       │ NULL        │
├────────────┴────────────────┴───┴──────────────────┴─────────────┤
│ 1 rows                                      38 columns (4 shown) │
└──────────────────────────────────────────────────────────────────┘



In [14]:
# ProdCertification for "Safety Certification": map each raw value (matched
# exactly, including the trailing-space and 'Ul'/'61731' variants) to a
# CertificationStandard_Value.
#   - 'UL 1741' is mapped to NULL.
#   - 'UL 61730, UL 1703' yields both standards via unnest.
#   - the '501 W, ...' description string is mapped to UL61730_2017.
#   - anything else, including NULL, falls through unchanged via `else`.
# NOTE(review): presumably `select distinct` is needed to collapse duplicate
# rows produced by the `unnest(...)` branch — confirm before removing it.
# NOTE(review): unlike the two IEC relations appended to the same list, this
# one has no CertificationDate_Value column — verify the consumer copes.
ob_prod_certification = duckdb.sql("""
select distinct
    csv_row_id,
    case "Safety Certification"
        when 'UL 1703' then 'UL1703_2002' 
        when 'UL 1703 ' then 'UL1703_2002' 
        when 'UL 1741' then null 
        when 'Ul 61730' then 'UL61730_2017'
        when 'UL 61730' then 'UL61730_2017'
        when 'UL 61731' then 'UL61730_2017'
        when 'UL 61730, UL 1703' then unnest(['UL1703_2002', 'UL61730_2017'])
        when '501 W, 120 half-cut cell double glass bifacial monocrystalline module, 1500V max system Vdc' then 'UL61730_2017'
        else "Safety Certification"
    end as CertificationStandard_Value
from Dirty;
""")
ob_prod_certifications.append(ob_prod_certification)
ob_prod_certification

┌────────────┬─────────────────────────────┐
│ csv_row_id │ CertificationStandard_Value │
│   int64    │           varchar           │
├────────────┼─────────────────────────────┤
│         16 │ UL1703_2002                 │
│         32 │ UL1703_2002                 │
│         41 │ UL1703_2002                 │
│         42 │ UL1703_2002                 │
│         55 │ UL1703_2002                 │
│        106 │ UL61730_2017                │
│        121 │ UL61730_2017                │
│        143 │ UL61730_2017                │
│        150 │ UL61730_2017                │
│        200 │ UL61730_2017                │
│         ·  │      ·                      │
│         ·  │      ·                      │
│         ·  │      ·                      │
│       7975 │ UL61730_2017                │
│       7977 │ UL61730_2017                │
│       7981 │ UL61730_2017                │
│       7991 │ UL61730_2017                │
│       7992 │ UL61730_2017                │
│       80

### ProdCertification for "Design Qualification Certification (Optional Submission) IEC 61215:2016"
Since we know the certification, we only need to extract the date.

In [15]:
# Explore: list the distinct raw values of the IEC 61215 column to see which
# date formats and non-date strings occur.
duckdb.sql("""
select distinct "Design Qualification Certification (Optional Submission) IEC 61215:2016" from Dirty
""")

┌─────────────────────────────────────────────────────────────────────────┐
│ Design Qualification Certification (Optional Submission) IEC 61215:2016 │
│                                 varchar                                 │
├─────────────────────────────────────────────────────────────────────────┤
│ 03.08.2021                                                              │
│ 16.03.2020                                                              │
│ 18.02.2020                                                              │
│ 11/21/2022 [IEC 61215:2021]                                             │
│ 29.11.2022                                                              │
│ 23.03.2022                                                              │
│ 23.07.2020                                                              │
│ 28.05.2021                                                              │
│ 31.08.2022                                                              │
│ 27.01.2022

All the date information is before the first white space.

In [16]:
# Explore: split each value on spaces and show the distinct SECOND tokens
# (index [2]). None of them is a date, which supports taking only the first
# token as the date string.
duckdb.sql("""
select distinct
    split("Design Qualification Certification (Optional Submission) IEC 61215:2016", ' ')[2]
from Dirty;
""")

┌──────────────────────────────────────────────────────────────────────────────────────────┐
│ split("Design Qualification Certification (Optional Submission) IEC 61215:2016", ' ')[2] │
│                                         varchar                                          │
├──────────────────────────────────────────────────────────────────────────────────────────┤
│ [IEC                                                                                     │
│ Information                                                                              │
│ NULL                                                                                     │
└──────────────────────────────────────────────────────────────────────────────────────────┘

Therefore, we can take the first part of the string (everything before the first whitespace) and try to cast it to a date.

In [17]:
# Dry run of the date parsing (result is displayed, not stored): take the first
# space-separated token and try the 'dd.mm.yyyy' and 'mm/dd/yyyy' formats.
# Tokens that match neither format are kept as-is by the `else` branch, so any
# unparsable value stays visible in the sorted distinct output.
duckdb.sql("""
with has_date_str as (
    select
        csv_row_id,
        split("Design Qualification Certification (Optional Submission) IEC 61215:2016", ' ')[1] as date_str,
    from Dirty
)
select distinct
    'IEC61215_2016' as CertificationStandard_Value,
    case
        when try_strptime(date_str, '%d.%m.%Y') is not null then cast(strptime(date_str, '%d.%m.%Y') as string)
        when try_strptime(date_str, '%m/%d/%Y') is not null then cast(strptime(date_str, '%m/%d/%Y') as string)
        else date_str
    end as CertificationDate_Value,
from has_date_str order by CertificationDate_Value
""")

┌─────────────────────────────┬─────────────────────────┐
│ CertificationStandard_Value │ CertificationDate_Value │
│           varchar           │         varchar         │
├─────────────────────────────┼─────────────────────────┤
│ IEC61215_2016               │ 2018-02-28 00:00:00     │
│ IEC61215_2016               │ 2018-03-29 00:00:00     │
│ IEC61215_2016               │ 2018-09-04 00:00:00     │
│ IEC61215_2016               │ 2018-10-26 00:00:00     │
│ IEC61215_2016               │ 2018-11-08 00:00:00     │
│ IEC61215_2016               │ 2018-12-20 00:00:00     │
│ IEC61215_2016               │ 2019-02-18 00:00:00     │
│ IEC61215_2016               │ 2019-03-08 00:00:00     │
│ IEC61215_2016               │ 2019-04-09 00:00:00     │
│ IEC61215_2016               │ 2019-05-05 00:00:00     │
│       ·                     │          ·              │
│       ·                     │          ·              │
│       ·                     │          ·              │
│ IEC61215_201

In [18]:
# ProdCertification for IEC 61215: one row per module with the fixed standard
# 'IEC61215_2016' and the certification date parsed from the first
# space-separated token of the raw value.
#
# Two date formats occur ('dd.mm.yyyy' and 'mm/dd/yyyy'). Each is attempted once
# with try_strptime, which yields NULL instead of raising, and coalesce picks
# the first that parses. Values in neither format (e.g. 'No Information
# Submitted' or NULL) give a NULL date.
# NOTE(review): modules without a date still get a row for this standard, and
# the raw value '11/21/2022 [IEC 61215:2021]' is labelled IEC61215_2016 —
# confirm both are intended.
ob_prod_certification = duckdb.sql("""
with has_date_str as (
    select
        csv_row_id,
        split("Design Qualification Certification (Optional Submission) IEC 61215:2016", ' ')[1] as date_str,
    from Dirty
)
select
    csv_row_id,
    'IEC61215_2016' as CertificationStandard_Value,
    cast(
        coalesce(
            try_strptime(date_str, '%d.%m.%Y'),
            try_strptime(date_str, '%m/%d/%Y')
        ) as date
    ) as CertificationDate_Value,
from has_date_str
""")
ob_prod_certifications.append(ob_prod_certification)
ob_prod_certification

┌────────────┬─────────────────────────────┬─────────────────────────┐
│ csv_row_id │ CertificationStandard_Value │ CertificationDate_Value │
│   int64    │           varchar           │          date           │
├────────────┼─────────────────────────────┼─────────────────────────┤
│          1 │ IEC61215_2016               │ NULL                    │
│          2 │ IEC61215_2016               │ NULL                    │
│          3 │ IEC61215_2016               │ NULL                    │
│          4 │ IEC61215_2016               │ NULL                    │
│          5 │ IEC61215_2016               │ NULL                    │
│          6 │ IEC61215_2016               │ NULL                    │
│          7 │ IEC61215_2016               │ NULL                    │
│          8 │ IEC61215_2016               │ NULL                    │
│          9 │ IEC61215_2016               │ NULL                    │
│         10 │ IEC61215_2016               │ NULL                    │
│     

### ProdCertification for "Performance Evaluation (Optional Submission) IEC 61853-1:2011"
Since we know the certification, we only need to extract the date.

In [19]:
# Explore: list the distinct raw values of the IEC 61853-1 column to see which
# date formats and non-date strings occur.
duckdb.sql("""
select distinct "Performance Evaluation (Optional Submission) IEC 61853-1:2011" from Dirty
""")

┌───────────────────────────────────────────────────────────────┐
│ Performance Evaluation (Optional Submission) IEC 61853-1:2011 │
│                            varchar                            │
├───────────────────────────────────────────────────────────────┤
│ No Information Submitted                                      │
│ 28.05.2020                                                    │
│ 31.05.2022                                                    │
│ NULL                                                          │
│ 18.01.2022                                                    │
└───────────────────────────────────────────────────────────────┘

In [20]:
# ProdCertification for IEC 61853-1: one row per module with the fixed standard
# 'IEC61853_1_2011' and the certification date parsed from the first
# space-separated token of the raw value.
#
# Only the 'dd.mm.yyyy' format occurs in this column. try_strptime yields NULL
# instead of raising, so 'No Information Submitted' and NULL inputs produce a
# NULL date without a separate guard.
# NOTE(review): modules without a date still get a row for this standard —
# confirm the consumer filters on CertificationDate_Value.
ob_prod_certification = duckdb.sql("""
with has_date_str as (
    select
        csv_row_id,
        split("Performance Evaluation (Optional Submission) IEC 61853-1:2011", ' ')[1] as date_str,
    from Dirty
)
select
    csv_row_id,
    'IEC61853_1_2011' as CertificationStandard_Value,
    cast(try_strptime(date_str, '%d.%m.%Y') as date) as CertificationDate_Value,
from has_date_str
""")
ob_prod_certifications.append(ob_prod_certification)
ob_prod_certification

┌────────────┬─────────────────────────────┬─────────────────────────┐
│ csv_row_id │ CertificationStandard_Value │ CertificationDate_Value │
│   int64    │           varchar           │          date           │
├────────────┼─────────────────────────────┼─────────────────────────┤
│          1 │ IEC61853_1_2011             │ NULL                    │
│          2 │ IEC61853_1_2011             │ NULL                    │
│          3 │ IEC61853_1_2011             │ NULL                    │
│          4 │ IEC61853_1_2011             │ NULL                    │
│          5 │ IEC61853_1_2011             │ NULL                    │
│          6 │ IEC61853_1_2011             │ NULL                    │
│          7 │ IEC61853_1_2011             │ NULL                    │
│          8 │ IEC61853_1_2011             │ NULL                    │
│          9 │ IEC61853_1_2011             │ NULL                    │
│         10 │ IEC61853_1_2011             │ NULL                    │
│     

### ModuleElectRatings
This is a list.

In [21]:
# Accumulator for the per-condition ModuleElectRating relations; the cells
# below append to it, so re-run this cell first when re-running any of them.
ob_module_elect_ratings = list()

### ModuleElectRating for "Nameplate (...)"

In [22]:
# PowerSTC: the nameplate maximum power ("Nameplate Pmax (W)") with a constant
# unit of 'W', keyed by csv_row_id.
ob_power_stc = duckdb.sql("""
select
    csv_row_id,
    'W' as PowerSTC_Unit,
    "Nameplate Pmax (W)" as PowerSTC_Value,
from Dirty;
""")
ob_power_stc

┌────────────┬───────────────┬────────────────┐
│ csv_row_id │ PowerSTC_Unit │ PowerSTC_Value │
│   int64    │    varchar    │     double     │
├────────────┼───────────────┼────────────────┤
│          1 │ W             │          270.0 │
│          2 │ W             │          275.0 │
│          3 │ W             │          280.0 │
│          4 │ W             │          285.0 │
│          5 │ W             │          290.0 │
│          6 │ W             │          210.0 │
│          7 │ W             │          215.0 │
│          8 │ W             │          220.0 │
│          9 │ W             │          225.0 │
│         10 │ W             │          225.0 │
│          · │ ·             │            ·   │
│          · │ ·             │            ·   │
│          · │ ·             │            ·   │
│       9991 │ W             │          360.0 │
│       9992 │ W             │          365.0 │
│       9993 │ W             │          365.0 │
│       9994 │ W             │          

In [23]:
# ModuleElectRating tagged 'STC': the five "Nameplate ..." columns (Pmax, Isc,
# Voc, Ipmax, Vpmax), each paired with a constant unit column taken from the
# unit in the CSV header (W, A, V).
ob_module_elect_rating = duckdb.sql("""
select
    csv_row_id,
    'STC' as ModuleRatingCondition_Value,
    'W' as PowerDC_Unit,
    "Nameplate Pmax (W)" as PowerDC_Value,
    'A' as CurrentShortCircuit_Unit,
    "Nameplate Isc (A)" as CurrentShortCircuit_Value,
    'V' as VoltageOpenCircuit_Unit,
    "Nameplate Voc (V)" as VoltageOpenCircuit_Value,
    'A' as CurrentAtMaximumPower_Unit,
    "Nameplate Ipmax (A)" as CurrentAtMaximumPower_Value,
    'V' as VoltageAtMaximumPower_Unit,
    "Nameplate Vpmax (V)" as VoltageAtMaximumPower_Value,
from Dirty;
""")
ob_module_elect_ratings.append(ob_module_elect_rating)
ob_module_elect_rating.show(max_width=120)

┌────────────┬──────────────────────┬──────────────┬───┬──────────────────────┬──────────────────────┐
│ csv_row_id │ ModuleRatingCondit…  │ PowerDC_Unit │ … │ VoltageAtMaximumPo…  │ VoltageAtMaximumPo…  │
│   int64    │       varchar        │   varchar    │   │       varchar        │        double        │
├────────────┼──────────────────────┼──────────────┼───┼──────────────────────┼──────────────────────┤
│          1 │ STC                  │ W            │ … │ V                    │                30.72 │
│          2 │ STC                  │ W            │ … │ V                    │                30.99 │
│          3 │ STC                  │ W            │ … │ V                    │                31.26 │
│          4 │ STC                  │ W            │ … │ V                    │                31.53 │
│          5 │ STC                  │ W            │ … │ V                    │                 31.8 │
│          6 │ STC                  │ W            │ … │ V               

### ModuleElectRating for "PTC"

In [24]:
# ModuleElectRating tagged 'PTC': only a DC power value is available, taken
# from the "PTC" column.
# NOTE(review): the unit 'W' is assumed here; the column header itself carries
# no unit — confirm against the CEC data dictionary.
ob_module_elect_rating = duckdb.sql("""
select
    csv_row_id,
    'PTC' as ModuleRatingCondition_Value,
    'W' as PowerDC_Unit,
    "PTC" as PowerDC_Value,
from Dirty;
""")
ob_module_elect_ratings.append(ob_module_elect_rating)
ob_module_elect_rating

┌────────────┬─────────────────────────────┬──────────────┬───────────────┐
│ csv_row_id │ ModuleRatingCondition_Value │ PowerDC_Unit │ PowerDC_Value │
│   int64    │           varchar           │   varchar    │    double     │
├────────────┼─────────────────────────────┼──────────────┼───────────────┤
│          1 │ PTC                         │ W            │         242.1 │
│          2 │ PTC                         │ W            │         246.7 │
│          3 │ PTC                         │ W            │         251.3 │
│          4 │ PTC                         │ W            │         256.0 │
│          5 │ PTC                         │ W            │         260.6 │
│          6 │ PTC                         │ W            │         186.8 │
│          7 │ PTC                         │ W            │         191.3 │
│          8 │ PTC                         │ W            │         195.9 │
│          9 │ PTC                         │ W            │         200.7 │
│         10

### ModuleElectRating for "(...), low"

In [25]:
# ModuleElectRating tagged 'LIC': current and voltage at maximum power from the
# "..., low" columns, with units taken from the CSV headers.
# NOTE(review): 'LIC' presumably stands for low irradiance conditions — confirm
# it is the right ModuleRatingCondition for the CEC "low" columns.
ob_module_elect_rating = duckdb.sql("""
select
    csv_row_id,
    'LIC' as ModuleRatingCondition_Value,
    'A' as CurrentAtMaximumPower_Unit,
    "IPmax, low (A)" as CurrentAtMaximumPower_Value,
    'V' as VoltageAtMaximumPower_Unit,
    "VPmax, low (V)" as VoltageAtMaximumPower_Value,
from Dirty;
""")
ob_module_elect_ratings.append(ob_module_elect_rating)
ob_module_elect_rating.show(max_width=120)

┌────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬──────────────────────┐
│ csv_row_id │ ModuleRatingCondit…  │ CurrentAtMaximumPo…  │ … │ VoltageAtMaximumPo…  │ VoltageAtMaximumPo…  │
│   int64    │       varchar        │       varchar        │   │       varchar        │        double        │
├────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼──────────────────────┤
│          1 │ LIC                  │ A                    │ … │ V                    │                29.84 │
│          2 │ LIC                  │ A                    │ … │ V                    │                 30.1 │
│          3 │ LIC                  │ A                    │ … │ V                    │                30.37 │
│          4 │ LIC                  │ A                    │ … │ V                    │                30.63 │
│          5 │ LIC                  │ A                    │ … │ V                    │                30.89 │
│

### ModuleElectRating for "(...), NOCT"

In [26]:
# ModuleElectRating tagged 'NOCT': current and voltage at maximum power from
# the "..., NOCT" columns, with units taken from the CSV headers.
ob_module_elect_rating = duckdb.sql("""
select
    csv_row_id,
    'NOCT' as ModuleRatingCondition_Value,
    'A' as CurrentAtMaximumPower_Unit,
    "IPmax, NOCT (A)" as CurrentAtMaximumPower_Value,
    'V' as VoltageAtMaximumPower_Unit,
    "VPmax, NOCT (V)" as VoltageAtMaximumPower_Value,
from Dirty;
""")
ob_module_elect_ratings.append(ob_module_elect_rating)
ob_module_elect_rating.show(max_width=120)

┌────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬──────────────────────┐
│ csv_row_id │ ModuleRatingCondit…  │ CurrentAtMaximumPo…  │ … │ VoltageAtMaximumPo…  │ VoltageAtMaximumPo…  │
│   int64    │       varchar        │       varchar        │   │       varchar        │        double        │
├────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼──────────────────────┤
│          1 │ NOCT                 │ A                    │ … │ V                    │                26.34 │
│          2 │ NOCT                 │ A                    │ … │ V                    │                26.57 │
│          3 │ NOCT                 │ A                    │ … │ V                    │                26.81 │
│          4 │ NOCT                 │ A                    │ … │ V                    │                27.04 │
│          5 │ NOCT                 │ A                    │ … │ V                    │                27.27 │
│

### ProdCell.CellTechnologyType

In [27]:
# Explore: list the distinct raw "Technology" values so that every spelling
# variant can be mapped explicitly below.
duckdb.sql("""
select distinct Technology from Dirty
""")

┌────────────┐
│ Technology │
│  varchar   │
├────────────┤
│ Thin Film  │
│ Mono-c-Si  │
│ Multi-c-Si │
│ a-Si/nc    │
│ NULL       │
│ CIGS       │
│ CdTe       │
│ Mono-C-si  │
└────────────┘

In [28]:
# ProdCell.CellTechnologyType: map the raw "Technology" spellings (matched
# exactly, so both 'Mono-c-Si' and 'Mono-C-si' are listed) to the target
# values. Values without a mapping — CIGS, CdTe and NULL in the current data —
# pass through unchanged via `else`.
ob_cell_technology_type = duckdb.sql("""
select
    csv_row_id,
    case Technology
        when 'a-Si/nc' then 'ASi' -- nc stands for nanocrystalline
        when 'Mono-c-Si' then 'MonoSi'
        when 'Mono-C-si' then 'MonoSi'
        when 'Multi-c-Si' then 'PolySi'
        when 'Thin Film' then 'ThinFilm'
        else Technology
    end as ProdCell__CellTechnologyType_Value,
from Dirty;
""")
# Verify the mapping: only the expected target values should remain.
duckdb.sql("""
select distinct ProdCell__CellTechnologyType_Value from ob_cell_technology_type
""")

┌────────────────────────────────────┐
│ ProdCell__CellTechnologyType_Value │
│              varchar               │
├────────────────────────────────────┤
│ NULL                               │
│ MonoSi                             │
│ ASi                                │
│ PolySi                             │
│ CdTe                               │
│ ThinFilm                           │
│ CIGS                               │
└────────────────────────────────────┘

### ModuleArea

In [29]:
# ModuleArea: the "A_c (m2)" column with a constant unit of 'sqm', keyed by
# csv_row_id.
ob_module_area = duckdb.sql("""
select
    csv_row_id,
    'sqm' as ModuleArea_Unit,
    "A_c (m2)" as ModuleArea_Value,
from Dirty
""")
ob_module_area

┌────────────┬─────────────────┬──────────────────┐
│ csv_row_id │ ModuleArea_Unit │ ModuleArea_Value │
│   int64    │     varchar     │      double      │
├────────────┼─────────────────┼──────────────────┤
│          1 │ sqm             │            1.627 │
│          2 │ sqm             │            1.627 │
│          3 │ sqm             │            1.627 │
│          4 │ sqm             │            1.627 │
│          5 │ sqm             │            1.627 │
│          6 │ sqm             │            1.638 │
│          7 │ sqm             │            1.638 │
│          8 │ sqm             │            1.638 │
│          9 │ sqm             │            1.638 │
│         10 │ sqm             │            1.638 │
│          · │  ·              │               ·  │
│          · │  ·              │               ·  │
│          · │  ·              │               ·  │
│       9991 │ sqm             │              1.8 │
│       9992 │ sqm             │            1.914 │
│       9993

### CellsInSeries

In [30]:
# CellsInSeries: the "N_s" column passed through unchanged, keyed by
# csv_row_id.
ob_cells_in_series = duckdb.sql("""
select
    csv_row_id,
    N_s as CellsInSeries_Value,
from Dirty
""")
ob_cells_in_series

┌────────────┬─────────────────────┐
│ csv_row_id │ CellsInSeries_Value │
│   int64    │        int64        │
├────────────┼─────────────────────┤
│          1 │                  60 │
│          2 │                  60 │
│          3 │                  60 │
│          4 │                  60 │
│          5 │                  60 │
│          6 │                  60 │
│          7 │                  60 │
│          8 │                  60 │
│          9 │                  60 │
│         10 │                  60 │
│          · │                   · │
│          · │                   · │
│          · │                   · │
│       9991 │                  60 │
│       9992 │                  72 │
│       9993 │                  72 │
│       9994 │                  72 │
│       9995 │                  72 │
│       9996 │                  60 │
│       9997 │                  60 │
│       9998 │                  60 │
│       9999 │                  72 │
│      10000 │                  72 │
├

### CellStringsParallelQuantity

In [31]:
# CellStringsParallelQuantity: rename the source column N_p; no cleaning is applied.
# NOTE(review): the rendered output types this column as double (1.0, 2.0),
# whereas the sibling count CellsInSeries_Value is int64. Confirm whether the
# source ever holds fractional values before deciding to cast to an integer.
ob_cell_strings_parallel_quantity = duckdb.sql("""
select
    csv_row_id,
    N_p as CellStringsParallelQuantity_Value,
from Dirty
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_cell_strings_parallel_quantity

┌────────────┬───────────────────────────────────┐
│ csv_row_id │ CellStringsParallelQuantity_Value │
│   int64    │              double               │
├────────────┼───────────────────────────────────┤
│          1 │                               1.0 │
│          2 │                               1.0 │
│          3 │                               1.0 │
│          4 │                               1.0 │
│          5 │                               1.0 │
│          6 │                               1.0 │
│          7 │                               1.0 │
│          8 │                               1.0 │
│          9 │                               1.0 │
│         10 │                               1.0 │
│          · │                                ·  │
│          · │                                ·  │
│          · │                                ·  │
│       9991 │                               2.0 │
│       9992 │                               1.0 │
│       9993 │                 

### IsBIPV

In [32]:
# Inspect which raw BIPV flags occur so the boolean mapping in the next cell
# can cover every case.
duckdb.sql("""
select distinct BIPV from Dirty
""")

┌─────────┐
│  BIPV   │
│ varchar │
├─────────┤
│ NULL    │
│ N       │
│ Y       │
└─────────┘

In [33]:
# IsBIPV: map the raw flag to a boolean ('Y' -> true, 'N' -> false).
# Every other input, including NULL, falls through to the explicit `else null`.
ob_is_bipv = duckdb.sql("""
select
    csv_row_id,
    case BIPV
        when 'Y' then true
        when 'N' then false
        else null
    end as IsBIPV_Value
from Dirty;
""")
# Verify the mapping: only true, false and NULL should appear.
duckdb.sql("""
select distinct IsBIPV_Value from ob_is_bipv
""")

┌──────────────┐
│ IsBIPV_Value │
│   boolean    │
├──────────────┤
│ false        │
│ true         │
│ NULL         │
└──────────────┘

### TemperatureNOCT

In [34]:
# TemperatureNOCT: pass "Average NOCT (deg_C)" through unchanged as the value
# and tag every row with the constant unit 'Cel'.
ob_temperature_noct = duckdb.sql("""
select
    csv_row_id,
    'Cel' as TemperatureNOCT_Unit,
    "Average NOCT (deg_C)" as TemperatureNOCT_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_temperature_noct

┌────────────┬──────────────────────┬───────────────────────┐
│ csv_row_id │ TemperatureNOCT_Unit │ TemperatureNOCT_Value │
│   int64    │       varchar        │        double         │
├────────────┼──────────────────────┼───────────────────────┤
│          1 │ Cel                  │                  47.4 │
│          2 │ Cel                  │                  47.4 │
│          3 │ Cel                  │                  47.4 │
│          4 │ Cel                  │                  47.4 │
│          5 │ Cel                  │                  47.4 │
│          6 │ Cel                  │                  47.6 │
│          7 │ Cel                  │                  47.6 │
│          8 │ Cel                  │                  47.6 │
│          9 │ Cel                  │                  46.0 │
│         10 │ Cel                  │                  47.6 │
│          · │  ·                   │                    ·  │
│          · │  ·                   │                    ·  │
│       

### TemperatureCoefficientMaximumPower

In [35]:
# TemperatureCoefficientMaximumPower: pass "γPmax (percent/deg_C)" through
# unchanged and tag every row with the constant unit 'percent_per_Cel'.
# Unlike the other coefficient cells, no cast is applied here.
ob_temperature_coefficient_maximum_power = duckdb.sql("""
select
    csv_row_id,
    'percent_per_Cel' as TemperatureCoefficientMaximumPower_Unit,
    "γPmax (percent/deg_C)" as TemperatureCoefficientMaximumPower_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_temperature_coefficient_maximum_power

┌────────────┬─────────────────────────────────────────┬──────────────────────────────────────────┐
│ csv_row_id │ TemperatureCoefficientMaximumPower_Unit │ TemperatureCoefficientMaximumPower_Value │
│   int64    │                 varchar                 │                  double                  │
├────────────┼─────────────────────────────────────────┼──────────────────────────────────────────┤
│          1 │ percent_per_Cel                         │                                  -0.4509 │
│          2 │ percent_per_Cel                         │                                  -0.4509 │
│          3 │ percent_per_Cel                         │                                  -0.4509 │
│          4 │ percent_per_Cel                         │                                  -0.4509 │
│          5 │ percent_per_Cel                         │                                  -0.4509 │
│          6 │ percent_per_Cel                         │                                  -0.4501 │


### TemperatureCoefficientShortCircuitCurrent

In [36]:
# List the distinct raw αIsc values that do not parse as a double, to decide
# how the column must be cleaned before casting.
# The column name contains TWO spaces between "αIsc" and "(".
duckdb.sql("""
select distinct
    "αIsc  (percent/deg_C)" from Dirty
where try_cast("αIsc  (percent/deg_C)" as double) is null;
""")

┌───────────────────────┐
│ αIsc  (percent/deg_C) │
│        varchar        │
├───────────────────────┤
│ NULL                  │
└───────────────────────┘

In [37]:
# TemperatureCoefficientShortCircuitCurrent: convert the text column to double
# and tag every row with the constant unit 'percent_per_Cel'.
# A strict `cast` is used because the preceding check found NULL as the only
# value that fails to parse; a malformed value in a future data file would
# raise here instead of silently becoming NULL.
ob_temperature_coefficient_short_circuit_current = duckdb.sql("""
select
    csv_row_id,
    'percent_per_Cel' as TemperatureCoefficientShortCircuitCurrent_Unit,
    cast("αIsc  (percent/deg_C)" as double) as TemperatureCoefficientShortCircuitCurrent_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_temperature_coefficient_short_circuit_current

┌────────────┬────────────────────────────────────────────────┬─────────────────────────────────────────────────┐
│ csv_row_id │ TemperatureCoefficientShortCircuitCurrent_Unit │ TemperatureCoefficientShortCircuitCurrent_Value │
│   int64    │                    varchar                     │                     double                      │
├────────────┼────────────────────────────────────────────────┼─────────────────────────────────────────────────┤
│          1 │ percent_per_Cel                                │                                          0.0521 │
│          2 │ percent_per_Cel                                │                                          0.0521 │
│          3 │ percent_per_Cel                                │                                          0.0521 │
│          4 │ percent_per_Cel                                │                                          0.0521 │
│          5 │ percent_per_Cel                                │                         

### TemperatureCoefficientOpenCircuitVoltage

In [38]:
# List the distinct raw βVoc values that do not parse as a double, to decide
# how the column must be cleaned before casting.
duckdb.sql("""
select distinct
    "βVoc (percent/deg_C)" from Dirty
where try_cast("βVoc (percent/deg_C)" as double) is null;
""")

┌──────────────────────┐
│ βVoc (percent/deg_C) │
│       varchar        │
├──────────────────────┤
│  -0.2794             │
│ NULL                 │
└──────────────────────┘

In [39]:
# TemperatureCoefficientOpenCircuitVoltage: convert the text column to double
# and tag every row with the constant unit 'percent_per_Cel'.
# `trim` is applied first because the preceding check surfaced a value with
# surrounding whitespace (' -0.2794') that otherwise fails to parse.
ob_temperature_coefficient_open_circuit_voltage = duckdb.sql("""
select
    csv_row_id,
    'percent_per_Cel' as TemperatureCoefficientOpenCircuitVoltage_Unit,
    cast(trim("βVoc (percent/deg_C)") as double) as TemperatureCoefficientOpenCircuitVoltage_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_temperature_coefficient_open_circuit_voltage

┌────────────┬───────────────────────────────────────────────┬────────────────────────────────────────────────┐
│ csv_row_id │ TemperatureCoefficientOpenCircuitVoltage_Unit │ TemperatureCoefficientOpenCircuitVoltage_Value │
│   int64    │                    varchar                    │                     double                     │
├────────────┼───────────────────────────────────────────────┼────────────────────────────────────────────────┤
│          1 │ percent_per_Cel                               │                                        -0.3137 │
│          2 │ percent_per_Cel                               │                                        -0.3137 │
│          3 │ percent_per_Cel                               │                                        -0.3137 │
│          4 │ percent_per_Cel                               │                                        -0.3137 │
│          5 │ percent_per_Cel                               │                                        -0

### TemperatureCoefficientMaxPowerCurrent

In [40]:
# List the distinct raw αIpmax values that do not parse as a double, to decide
# how the column must be cleaned before casting.
duckdb.sql("""
select distinct
    "αIpmax (percent/deg_C)" from Dirty
where try_cast("αIpmax (percent/deg_C)" as double) is null;
""")

┌────────────────────────┐
│ αIpmax (percent/deg_C) │
│        varchar         │
├────────────────────────┤
│                        │
│ NULL                   │
└────────────────────────┘

In [41]:
# TemperatureCoefficientMaxPowerCurrent: convert the text column to double and
# tag every row with the constant unit 'percent_per_Cel'.
# `try_cast` is used so the blank entries found by the preceding check become
# NULL instead of raising; any other unparseable value would also become NULL.
ob_temperature_coefficient_max_power_current = duckdb.sql("""
select
    csv_row_id,
    'percent_per_Cel' as TemperatureCoefficientMaxPowerCurrent_Unit,
    try_cast("αIpmax (percent/deg_C)" as double) as TemperatureCoefficientMaxPowerCurrent_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_temperature_coefficient_max_power_current

┌────────────┬────────────────────────────────────────────┬─────────────────────────────────────────────┐
│ csv_row_id │ TemperatureCoefficientMaxPowerCurrent_Unit │ TemperatureCoefficientMaxPowerCurrent_Value │
│   int64    │                  varchar                   │                   double                    │
├────────────┼────────────────────────────────────────────┼─────────────────────────────────────────────┤
│          1 │ percent_per_Cel                            │                                      0.0113 │
│          2 │ percent_per_Cel                            │                                      0.0113 │
│          3 │ percent_per_Cel                            │                                      0.0113 │
│          4 │ percent_per_Cel                            │                                      0.0113 │
│          5 │ percent_per_Cel                            │                                      0.0113 │
│          6 │ percent_per_Cel                

### TemperatureCoefficientMaxPowerVoltage

In [42]:
# List the distinct raw βVpmax values that do not parse as a double, to decide
# how the column must be cleaned before casting.
duckdb.sql("""
select distinct
    "βVpmax (percent/deg_C)" from Dirty
where try_cast("βVpmax (percent/deg_C)" as double) is null;
""")

┌────────────────────────┐
│ βVpmax (percent/deg_C) │
│        varchar         │
├────────────────────────┤
│ NULL                   │
│                        │
└────────────────────────┘

In [43]:
# TemperatureCoefficientMaxPowerVoltage: convert the text column to double and
# tag every row with the constant unit 'percent_per_Cel'.
# `try_cast` is used so the blank entries found by the preceding check become
# NULL instead of raising; any other unparseable value would also become NULL.
ob_temperature_coefficient_max_power_voltage = duckdb.sql("""
select
    csv_row_id,
    'percent_per_Cel' as TemperatureCoefficientMaxPowerVoltage_Unit,
    try_cast("βVpmax (percent/deg_C)" as double) as TemperatureCoefficientMaxPowerVoltage_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_temperature_coefficient_max_power_voltage

┌────────────┬────────────────────────────────────────────┬─────────────────────────────────────────────┐
│ csv_row_id │ TemperatureCoefficientMaxPowerVoltage_Unit │ TemperatureCoefficientMaxPowerVoltage_Value │
│   int64    │                  varchar                   │                   double                    │
├────────────┼────────────────────────────────────────────┼─────────────────────────────────────────────┤
│          1 │ percent_per_Cel                            │                                     -0.4632 │
│          2 │ percent_per_Cel                            │                                     -0.4632 │
│          3 │ percent_per_Cel                            │                                     -0.4632 │
│          4 │ percent_per_Cel                            │                                     -0.4632 │
│          5 │ percent_per_Cel                            │                                     -0.4632 │
│          6 │ percent_per_Cel                

### Dimension.(Width|Height)

In [44]:
# Dimension.(Width|Height): "Short Side (m)" becomes the width and
# "Long Side (m)" becomes the height, each tagged with the constant unit 'm'.
# Both values are passed through unchanged.
ob_width_height = duckdb.sql("""
select
    csv_row_id,
    'm' as Dimension__Width_Unit,
    "Short Side (m)" as Dimension__Width_Value,
    'm' as Dimension__Height_Unit,
    "Long Side (m)" as Dimension__Height_Value,
from Dirty;
""")
# Bare last expression so the notebook renders a preview of the relation.
ob_width_height

┌────────────┬───────────────────────┬────────────────────────┬────────────────────────┬─────────────────────────┐
│ csv_row_id │ Dimension__Width_Unit │ Dimension__Width_Value │ Dimension__Height_Unit │ Dimension__Height_Value │
│   int64    │        varchar        │         double         │        varchar         │         double          │
├────────────┼───────────────────────┼────────────────────────┼────────────────────────┼─────────────────────────┤
│          1 │ m                     │                  0.992 │ m                      │                    1.64 │
│          2 │ m                     │                  0.992 │ m                      │                    1.64 │
│          3 │ m                     │                  0.992 │ m                      │                    1.64 │
│          4 │ m                     │                  0.992 │ m                      │                    1.64 │
│          5 │ m                     │                  0.992 │ m               

# Ignored columns

In [45]:
# Inspect the distinct free-text Notes values; this column is listed under
# "Ignored columns" and no cleaned relation is built from it in this section.
duckdb.sql("""
select distinct Notes from Dirty;
""")

┌──────────────────────────────────────────────────────────────────────────────────────────────┐
│                                            Notes                                             │
│                                           varchar                                            │
├──────────────────────────────────────────────────────────────────────────────────────────────┤
│ This model is also available with a white backsheet. Please refer to DNA-144-MF23-400W {Wht} │
│ Formerly listed under Hanwha Q CELLS (Qidong) Co., Ltd.                                      │
│ Formerly GEP NEW ENERGY VIETNAM COMPANY LIMITED GEP-BfMc450MHT                               │
│ Also sold as Renesola Jiangsu JC260M-24/Bzs                                                  │
│ Also sold as Renesola Jiangsu JC265M-24/Bzs                                                  │
│ Also sold as Renesola Jiangsu JC270M-24/Bzs-b                                                │
│ Also sold as Renesola Jiangs

In [46]:
# Inspect the distinct Mounting values (listed under "Ignored columns").
# The output contains both NULL and a blank entry.
duckdb.sql("""
select distinct Mounting from Dirty;
""")

┌──────────┐
│ Mounting │
│ varchar  │
├──────────┤
│ Rack     │
│ NULL     │
│ BIPV     │
│          │
└──────────┘

In [47]:
# Inspect the distinct Type values (listed under "Ignored columns").
# Only 'Flat Plate' and NULL occur in this file.
duckdb.sql("""
select distinct Type from Dirty;
""")

┌────────────┐
│    Type    │
│  varchar   │
├────────────┤
│ Flat Plate │
│ NULL       │
└────────────┘

In [48]:
# Inspect the distinct "Geometric Multiplier" values (listed under
# "Ignored columns"). Only 1.0 and NULL occur in this file.
duckdb.sql("""
select distinct "Geometric Multiplier" from Dirty;
""")

┌──────────────────────┐
│ Geometric Multiplier │
│        double        │
├──────────────────────┤
│                 NULL │
│                  1.0 │
└──────────────────────┘

In [49]:
# Inspect the distinct "P2/Pref" values (listed under "Ignored columns").
duckdb.sql("""
select distinct "P2/Pref" from Dirty;
""")

┌─────────────┐
│   P2/Pref   │
│   double    │
├─────────────┤
│   0.2080728 │
│ 0.198167224 │
│  0.19138296 │
│ 0.191367286 │
│ 0.179858421 │
│ 0.192553426 │
│ 0.179555646 │
│ 0.186447063 │
│ 0.216205165 │
│ 0.215305484 │
│       ·     │
│       ·     │
│       ·     │
│  0.20279267 │
│ 0.177843288 │
│ 0.185312646 │
│ 0.185769656 │
│ 0.195145424 │
│ 0.201001982 │
│ 0.198839521 │
│ 0.200660581 │
│ 0.200624762 │
│ 0.190596364 │
├─────────────┤
│  3454 rows  │
│ (20 shown)  │
└─────────────┘

In [50]:
# Inspect the distinct "CEC Listing Date" values (listed under
# "Ignored columns"); the column was read as a date type.
duckdb.sql("""
select distinct "CEC Listing Date" from Dirty;
""")

┌──────────────────┐
│ CEC Listing Date │
│       date       │
├──────────────────┤
│ 2024-08-01       │
│ 2024-07-22       │
│ 2022-04-11       │
│ 2024-06-21       │
│ 2023-10-23       │
│ 2025-05-12       │
│ 2017-06-01       │
│ 2018-07-02       │
│ 2020-05-21       │
│ 2023-05-11       │
│     ·            │
│     ·            │
│     ·            │
│ 2021-12-13       │
│ 2020-11-23       │
│ 2022-07-01       │
│ 2022-03-11       │
│ 2020-02-21       │
│ 2019-04-15       │
│ 2023-06-12       │
│ 2025-07-21       │
│ 2020-08-03       │
│ 2020-04-13       │
├──────────────────┤
│     278 rows     │
│    (20 shown)    │
└──────────────────┘

In [51]:
# Inspect the distinct "Last Update" values (listed under "Ignored columns");
# the column was read as a date type.
duckdb.sql("""
select distinct "Last Update" from Dirty;
""")

┌─────────────┐
│ Last Update │
│    date     │
├─────────────┤
│ 2018-07-02  │
│ 2025-07-01  │
│ 2024-07-22  │
│ 2024-06-21  │
│ 2018-09-17  │
│ 2019-03-01  │
│ 2020-05-21  │
│ 2022-05-23  │
│ 2023-05-11  │
│ 2021-01-11  │
│     ·       │
│     ·       │
│     ·       │
│ 2025-05-21  │
│ 2022-02-01  │
│ 2021-10-01  │
│ 2026-02-11  │
│ 2020-11-02  │
│ 2019-12-02  │
│ 2021-09-13  │
│ 2025-08-11  │
│ 2025-12-22  │
│ 2017-11-15  │
├─────────────┤
│  158 rows   │
│ (20 shown)  │
└─────────────┘

# Building clean data
Tip: Use the Jupyter Notebook Table of Contents feature to see which columns we need to join together.

We will separate the cleaned values into multiple CSVs.
This is not necessary if the arrays only have one entry, but we do so here for a consistent data processing pipeline.

### ProdModule

In [52]:
# ProdModule: stitch all per-field relations back into one wide table, one row
# per source CSV row. csv_row_id is listed first; every other column follows
# in join order.
# NOTE(review): `natural join` matches on EVERY column name the two sides
# share. That is presumably only csv_row_id; verify that ob_prod_mfr,
# ob_prod_code and ob_description (defined earlier) expose no other
# overlapping names. These are inner joins, so a relation missing a
# csv_row_id would silently drop that module from the output.
ob_prod_modules = duckdb.sql("""
select
    csv_row_id,
    * exclude(csv_row_id),
from ob_prod_mfr
natural join ob_prod_code
natural join ob_description
natural join ob_cell_technology_type
natural join ob_module_area
natural join ob_cells_in_series
natural join ob_cell_strings_parallel_quantity
natural join ob_is_bipv
natural join ob_temperature_noct
natural join ob_temperature_coefficient_maximum_power
natural join ob_temperature_coefficient_short_circuit_current
natural join ob_temperature_coefficient_open_circuit_voltage
natural join ob_temperature_coefficient_max_power_current
natural join ob_temperature_coefficient_max_power_voltage
natural join ob_width_height
order by ob_prod_mfr.csv_row_id
""")
# max_width keeps the preview readable; the middle columns are elided.
ob_prod_modules.show(max_width=120)

┌────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬──────────────────────┐
│ csv_row_id │    ProdMfr_Value     │    ProdCode_Value    │ … │ Dimension__Height_…  │ Dimension__Height_…  │
│   int64    │       varchar        │       varchar        │   │       varchar        │        double        │
├────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼──────────────────────┤
│          1 │ Ablytek              │ ABLYT-6MN6A270       │ … │ m                    │                 1.64 │
│          2 │ Ablytek              │ ABLYT-6MN6A275       │ … │ m                    │                 1.64 │
│          3 │ Ablytek              │ ABLYT-6MN6A280       │ … │ m                    │                 1.64 │
│          4 │ Ablytek              │ ABLYT-6MN6A285       │ … │ m                    │                 1.64 │
│          5 │ Ablytek              │ ABLYT-6MN6A290       │ … │ m                    │                 1.64 │
│

### ProdCertification

In [53]:
# ProdCertification: stack the per-standard certification relations into one
# long table. how='diagonal' unions the columns and fills gaps with nulls.
ob_prod_certifications_pl = pl.concat([
    ob_prod_certification.pl() for ob_prod_certification in ob_prod_certifications
], how='diagonal')
# Assign the new row id in Polars, where the concat order is well defined.
# The previous `row_number() over ()` had no ORDER BY, so DuckDB numbered rows
# in whatever order the scan produced them and the ids were not reproducible
# between runs. The index is 1-based to match row_number().
ob_prod_certifications_numbered_pl = ob_prod_certifications_pl.with_row_index(
    'concat_row_id', offset=1
)
# The incoming csv_row_id identifies the parent module, so it is renamed to
# prodmodule_id (a foreign key into ProdModule.csv) and the stable index
# becomes this table's own csv_row_id. The cast keeps the id BIGINT as before;
# concat_row_id also breaks ties so the row order is deterministic.
ob_prod_certifications_duckdb = duckdb.sql("""
select
    csv_row_id as prodmodule_id,
    * exclude(csv_row_id, concat_row_id),
    cast(concat_row_id as bigint) as csv_row_id,
from ob_prod_certifications_numbered_pl
order by prodmodule_id, concat_row_id
""")
ob_prod_certifications_duckdb

┌───────────────┬─────────────────────────────┬─────────────────────────┬────────────┐
│ prodmodule_id │ CertificationStandard_Value │ CertificationDate_Value │ csv_row_id │
│     int64     │           varchar           │          date           │   int64    │
├───────────────┼─────────────────────────────┼─────────────────────────┼────────────┤
│             1 │ UL1703_2002                 │ NULL                    │       3972 │
│             1 │ IEC61215_2016               │ NULL                    │      21543 │
│             1 │ IEC61853_1_2011             │ NULL                    │      42811 │
│             2 │ UL1703_2002                 │ NULL                    │      17579 │
│             2 │ IEC61215_2016               │ NULL                    │      21544 │
│             2 │ IEC61853_1_2011             │ NULL                    │      42812 │
│             3 │ UL1703_2002                 │ NULL                    │       3973 │
│             3 │ IEC61215_2016            

### ModuleElectRating

In [54]:
# ModuleElectRating: stack the per-condition rating relations into one long
# table. how='diagonal' unions the columns and fills gaps with nulls.
ob_module_elect_ratings_pl = pl.concat([
    ob_module_elect_rating.pl() for ob_module_elect_rating in ob_module_elect_ratings
], how='diagonal')
# Assign the new row id in Polars, where the concat order is well defined.
# The previous `row_number() over ()` had no ORDER BY, so the numbering
# depended on the order in which DuckDB happened to scan the frame and was not
# guaranteed to be reproducible. The index is 1-based to match row_number().
ob_module_elect_ratings_numbered_pl = ob_module_elect_ratings_pl.with_row_index(
    'concat_row_id', offset=1
)
# The incoming csv_row_id identifies the parent module, so it is renamed to
# prodmodule_id (a foreign key into ProdModule.csv) and the stable index
# becomes this table's own csv_row_id. The cast keeps the id BIGINT as before;
# concat_row_id also breaks ties so the row order is deterministic.
ob_module_elect_ratings_duckdb = duckdb.sql("""
select
    csv_row_id as prodmodule_id,
    * exclude(csv_row_id, concat_row_id),
    cast(concat_row_id as bigint) as csv_row_id,
from ob_module_elect_ratings_numbered_pl
order by prodmodule_id, concat_row_id
""")
ob_module_elect_ratings_duckdb.show(max_width=120)

┌───────────────┬──────────────────────┬──────────────┬───┬──────────────────────┬──────────────────────┬────────────┐
│ prodmodule_id │ ModuleRatingCondit…  │ PowerDC_Unit │ … │ VoltageAtMaximumPo…  │ VoltageAtMaximumPo…  │ csv_row_id │
│     int64     │       varchar        │   varchar    │   │       varchar        │        double        │   int64    │
├───────────────┼──────────────────────┼──────────────┼───┼──────────────────────┼──────────────────────┼────────────┤
│             1 │ STC                  │ W            │ … │ V                    │                30.72 │          1 │
│             1 │ PTC                  │ W            │ … │ NULL                 │                 NULL │      21269 │
│             1 │ LIC                  │ NULL         │ … │ V                    │                29.84 │      42537 │
│             1 │ NOCT                 │ NULL         │ … │ V                    │                26.34 │      63805 │
│             2 │ STC                  │ W      

### Write cleaned data

In [55]:
# Persist each cleaned relation as its own CSV in the working directory.
# Every relation is materialised through Polars before being written; the
# files are produced in the order listed here.
cleaned_outputs = {
    'ProdModule.csv': ob_prod_modules,
    'ProdCertification.csv': ob_prod_certifications_duckdb,
    'ModuleElectRating.csv': ob_module_elect_ratings_duckdb,
}
for csv_path, relation in cleaned_outputs.items():
    relation.pl().write_csv(csv_path)