In [1]:
import polars as pl
from dotenv import load_dotenv
import os
from datetime import datetime
from rapidfuzz import process, fuzz
# load settings from a .env file (python-dotenv) before any os.getenv() lookups;
# the cell output below is this call's return value
load_dotenv()

True

In [2]:
load_dotenv()
# path of the EPC extract comes from the environment
output_file = os.getenv('EPC_OUTPUT_FILE')
all_epc = pl.read_csv(output_file, try_parse_dates=True)

# narrow the EPC records down to the ones that still need a uprn
filtered_epc = (
    all_epc
    # drop small properties: 40m2 min size
    .filter(pl.col('floor-area') >= 40)
    # only concerned with records without a uprn
    .filter(pl.col('uprn').is_null())
    # order by lodgement time so that keep='last' retains the newest record
    # for each building and discards prior records for the same property
    .sort('lodgement-datetime')
    .unique(subset=['building-reference-number'], keep='last')
)
# row count after all filters; an earlier note recorded
# 1213676 (1281342 before filter by floor area)
filtered_epc.height

441492

In [3]:
# config to load OS
# folder holding the OS csv files, taken from the environment
os_files_path = os.getenv('OS_FILES_PATH')
# list the folder once and order the names alphabetically
csv_file_names = os.listdir(os_files_path)
csv_file_names.sort()

In [4]:
# os record types to load; as a min would need 21,24,28 for this
# full list: [21,23,24,28,31,32]
record_types = [28]
# loaded frames, keyed by os record type (filled in by the loading cell)
record_dfs = dict()

In [5]:
# read each file and assign to correct os record type
# file names are expected to look like 'ID28_DPA_Records.csv', where the two
# characters after the 'ID' prefix are the numeric os record type
for file_name in csv_file_names:
    try:
        record_type = int(file_name[2:4])
    except ValueError:
        # os.listdir returns everything in the folder; skip hidden/stray files
        # whose name does not carry a numeric record type instead of crashing
        continue
    if record_type not in record_types:
        continue
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    # infer_schema_length=0 reads every column as a string;
    # setting schema will speed this up
    temp_df = pl.read_csv(os.path.join(os_files_path, file_name), infer_schema_length=0)
    # note: a later file with the same record type replaces the earlier one
    record_dfs[record_type] = temp_df

print('done')

file_name: [ID28_DPA_Records.csv], time: [10:48:13]
done


In [6]:
# check everything is ok
# print the number of rows loaded for each os record type
for record_type, loaded_df in record_dfs.items():
    print(f'type: [{record_type}], count: [{len(loaded_df)}]')

## counts noted from a run with more record types loaded:
## type: [21], count: [40825714]
## type: [23], count: [197964510]
## type: [28], count: [30598226]
## type: [31], count: [1422087]
## type: [32], count: [44684909]

type: [28], count: [30598226]


In [7]:
# format addresses
def format_abp_address(abp_row):
    """Build a comparable address string from one positional abp (type 28) row.

    Parameters
    ----------
    abp_row : sequence
        One row of the abp frame, indexed by position. Only these positions
        are read:
        3 = uprn, 7 = sub_building_name, 8 = building_name,
        9 = building_number, 10 = dependant_thoroughfare, 11 = thoroughfare.

    Returns
    -------
    tuple
        ``(formatted_address, uprn)``. ``formatted_address`` is the populated
        parts joined with ``', '`` in the order listed above, or ``''`` when
        none of them is populated. ``uprn`` is returned unchanged.
    """
    # todo: if welsh vs english - req join with 21, blpu.COUNTRY == E or W
    # tailor this for each dataset, epc has an address field that is equiv to
    # name,number,street so currently only using those.
    # other positions available if needed later:
    #   5 = organisation_name, 6 = department_name,
    #   13 = dependant_locality, 14 = post_town, 15 = postcode
    address_parts = (
        abp_row[7],   # sub_building_name
        abp_row[8],   # building_name
        abp_row[9],   # building_number
        abp_row[10],  # dependant_thoroughfare
        abp_row[11],  # thoroughfare
    )
    uprn = abp_row[3]

    # skip missing values and blank strings so no empty ', , ' segments appear
    populated_parts = [
        str(part) for part in address_parts
        if part is not None and str(part).strip() != ''
    ]
    return ', '.join(populated_parts), uprn
    

In [18]:
# attempt the match
# for every epc record, fuzzy match its address against the abp addresses that
# share its postcode and keep the 5 best candidates.
# uprn_matched receives either None (record could not be matched at all) or one
# (lmk-key, uprn, score) tuple per candidate.
uprn_matched = []
count = 0
count_no_postcode = 0
count_postcode_not_found = 0
count_no_address = 0
# postcode -> (formatted addresses, uprns), or None when the postcode is not in
# abp. many epc records share a postcode, so caching avoids re-scanning the full
# abp frame and re-formatting the same addresses for each of them.
abp_by_postcode = {}
for row in filtered_epc.iter_rows(named=True):
    count += 1
    if count % 10000 == 0:
        print(f'count: [{count}]')

    postcode = row['postcode']
    # a missing postcode may arrive as null (None) rather than '', so test for both
    if not postcode:
        uprn_matched.append(None)
        count_no_postcode += 1
        # at this point you could decide to choose the subset based on town/county/district instead
        continue

    if postcode not in abp_by_postcode:
        local_abp = record_dfs[28].filter(pl.col('POSTCODE') == postcode)
        if local_abp.is_empty():
            abp_by_postcode[postcode] = None
        else:
            # column_0 = formatted address, column_1 = uprn (see format_abp_address)
            local_addresses = local_abp.map_rows(format_abp_address)
            abp_by_postcode[postcode] = (
                local_addresses.get_column('column_0').to_list(),
                local_addresses.get_column('column_1').to_list(),
            )

    local_abp_lookup = abp_by_postcode[postcode]
    if local_abp_lookup is None:
        uprn_matched.append(None)
        count_postcode_not_found += 1
        # same as above, obvs the source postcode is wrong if not found in abp
        continue

    if row['address'] is None:
        uprn_matched.append(None)
        count_no_address += 1
        # same as above
        continue

    # formatted abp addresses are compared against the upper-cased epc address
    formatted_epc_address = row['address'].upper()
    local_abp_addresses_0, local_abp_addresses_1 = local_abp_lookup

    # ok, this needs some optimising to figure out the best performance vs accuracy.
    # for first 1000 records in ccod: ratio: 5s; partial_ratio: 5s
    # cdist of both lists, using all available processors, may be better perf.
    matches = process.extract(
        formatted_epc_address,
        local_abp_addresses_0,
        scorer=fuzz.partial_ratio,
        limit=5)
    # each match is (matched address, score, index into the choices list)
    for match in matches:
        uprn_matched.append((
            row['lmk-key'],
            local_abp_addresses_1[match[2]], # gets uprn at same index as the match, relative to current postcode
            match[1]))


print('done')
print('postcode missing:', count_no_postcode)
print('postcode not found:', count_postcode_not_found)
print('address missing:', count_no_address)
# took 38m before the per-postcode cache was added

count: [10000]
count: [20000]
count: [30000]
count: [40000]
count: [50000]
count: [60000]
count: [70000]
count: [80000]
count: [90000]
count: [100000]
count: [110000]
count: [120000]
count: [130000]
count: [140000]
count: [150000]
count: [160000]
count: [170000]
count: [180000]
count: [190000]
count: [200000]
count: [210000]
count: [220000]
count: [230000]
count: [240000]
count: [250000]
count: [260000]
count: [270000]
count: [280000]
count: [290000]
count: [300000]
count: [310000]
count: [320000]
count: [330000]
count: [340000]
count: [350000]
count: [360000]
count: [370000]
count: [380000]
count: [390000]
count: [400000]
count: [410000]
count: [420000]
count: [430000]
count: [440000]
done
postcode missing: 0
postcode not found: 33661
address missing: 7


In [28]:
# preview the first 10 rows of the filtered epc records
filtered_epc.head(10)

lmk-key,address1,address2,address3,postcode,building-reference-number,asset-rating,asset-rating-band,property-type,inspection-date,local-authority,constituency,county,lodgement-date,transaction-type,new-build-benchmark,existing-stock-benchmark,building-level,main-heating-fuel,other-fuel-desc,special-energy-uses,renewable-sources,floor-area,standard-emissions,target-emissions,typical-emissions,building-emissions,aircon-present,aircon-kw-rating,estimated-aircon-kw-rating,ac-inspection-commissioned,building-environment,address,local-authority-label,constituency-label,posttown,lodgement-datetime,primary-energy-value,uprn,uprn-source
str,str,str,str,str,i64,i64,str,str,date,str,str,str,date,str,i64,i64,i64,str,str,str,str,i64,f64,f64,f64,f64,str,f64,f64,i64,str,str,str,str,str,datetime[μs],i64,i64,str
"""802aaf5c523849468ab92e91e429a1…","""BLOCK A""","""The Canalside""","""50 Lower Loveday Street""","""B19 3SJ""",10003553446,78,"""D""","""C2 Residential Institutions - …",2017-05-19,"""E08000025""","""E14000564""",,2017-06-27,"""Voluntary (No legal requiremen…",46,135,4,"""Grid Supplied Electricity""",,,,10218,93.07,85.46,250.43,145.76,"""No""",,,4.0,"""Heating and Natural Ventilatio…","""BLOCK A, The Canalside, 50 Low…","""Birmingham""","""Birmingham, Ladywood""","""BIRMINGHAM""",2017-06-27 00:00:00,862.0,,
"""123508840242017062817321698800…",,"""Yo!, Units 34 & 35""","""Boxpark Croydon""","""CR0 1LD""",629054890000,39,"""B""","""A3/A4/A5 Restaurant and Cafes/…",2017-06-28,"""E09000008""","""E14000654""","""Greater London Authority""",2017-06-28,"""Mandatory issue (Property on c…",41,119,4,"""Grid Supplied Electricity""",,,,56,132.06,106.99,313.53,101.83,"""No""",,,4.0,"""Air Conditioning""","""Yo!, Units 34 & 35, Boxpark Cr…","""Croydon""","""Croydon Central""","""CROYDON""",2017-06-28 17:32:16,602.0,,
"""197643632120150331152937020091…",,"""1 Old Field Road""","""Pencoed""","""CF35 5LJ""",109832110000,69,"""C""","""B1 Offices and Workshop busine…",2015-03-26,"""W06000013""","""W07000074""","""Pen-y-bont ar Ogwr - Bridgend""",2015-03-31,"""Mandatory issue (Marketed sale…",23,67,3,"""Grid Supplied Electricity""",,,,3015,47.48,21.6,63.29,65.42,"""Yes""",,2.0,5.0,"""Air Conditioning""","""1 Old Field Road, Pencoed""","""Bridgend""","""Ogmore""","""BRIDGEND""",2015-03-31 15:29:37,,,
"""380552702020100709131818785100…","""1B -""","""Barsand House""","""Pym Street""","""PL19 0AW""",226467570001,70,"""C""","""Retail""",2010-07-08,"""E07000047""","""E14001000""","""Devon County""",2010-07-09,"""Mandatory issue (Property to l…",36,46,3,"""Grid Supplied Electricity""",,,,92,92.42,66.62,84.66,129.91,"""No""",,,4.0,"""Heating and Natural Ventilatio…","""1B -, Barsand House, Pym Stree…","""West Devon""","""Torridge and West Devon""","""TAVISTOCK""",2010-07-09 13:18:18,,,
"""460172570120110107120423070094…",,"""Retail Unit""","""19 Northgate""","""WF1 3BJ""",429177670000,59,"""C""","""Retail""",2011-01-05,"""E08000036""","""E14001009""","""Wakefield District (B)""",2011-01-07,"""Mandatory issue (Marketed sale…",37,46,3,"""Grid Supplied Electricity""",,,,62,94.42,69.15,87.45,111.03,"""No""",,,4.0,"""Heating and Natural Ventilatio…","""Retail Unit, 19 Northgate""","""Wakefield""","""Wakefield""","""WAKEFIELD""",2011-01-07 12:04:23,,,
"""01a6a99953c96c3462e549a974944f…","""Unit 55A""","""9 Newland Street""","""Eden""","""HP11 2BY""",10005208975,43,"""B""","""Retail/Financial and Professio…",2023-10-10,"""E06000060""","""E14001056""",,2023-10-12,"""Mandatory issue (Marketed sale…",9,36,3,"""Grid Supplied Electricity""",,,,487,9.83,1.78,7.13,8.47,"""Yes""",,1.0,1.0,"""Air Conditioning""","""Unit 55A, 9 Newland Street, Ed…","""Buckinghamshire""","""Wycombe""","""High Wycombe""",2023-10-12 10:04:30,91.0,,
"""bef60fd9f7e5784d2e01cf86d8903c…",,"""171, Fortess Road""",,"""NW5 2HR""",10003523280,101,"""E""","""A3/A4/A5 Restaurant and Cafes/…",2017-09-13,"""E09000007""","""E14000750""",,2017-09-19,"""Voluntary (No legal requiremen…",29,85,3,"""Grid Supplied Electricity""",,,,95,87.54,50.84,148.99,177.12,"""No""",,,4.0,"""Heating and Natural Ventilatio…","""171, Fortess Road""","""Camden""","""Holborn and St Pancras""","""LONDON""",2017-09-19 00:00:00,1046.0,,
"""100019310320090122145638930006…",,"""151-155 Radford Road""",,"""CV6 3BT""",373368410000,69,"""C""","""Retail""",2009-01-21,"""E08000026""","""E14000650""",,2009-01-22,,38,55,3,"""Natural Gas""",,,,250,,,,,,,,,"""Heating and Natural Ventilatio…","""151-155 Radford Road""","""Coventry""","""Coventry North West""","""COVENTRY""",2009-01-22 14:56:38,,,
"""2bbf94592c0751ba8ae7ab5ae3ecd1…",,"""Frampton Cotterell Parish Coun…","""The Brockeridge Centre, Wooden…","""BS36 2LQ""",10003534305,81,"""D""","""B1 Offices and Workshop busine…",2014-02-20,"""E06000025""","""E14000994""",,2014-02-25,"""Voluntary (No legal requiremen…",31,84,3,"""Natural Gas""",,,,232,40.84,25.64,68.38,66.02,"""No""",,,4.0,"""Heating and Natural Ventilatio…","""Frampton Cotterell Parish Coun…","""South Gloucestershire""","""Thornbury and Yate""","""BRISTOL""",2014-02-25 00:00:00,,,
"""28d157aa1ce61ead8489adb7024e41…",,"""New Road Day Centre""","""5 Balliol Road""","""PO2 7PP""",130357270000,44,"""B""","""C2 Residential Institutions - …",2016-03-25,"""E06000044""","""E14000884""",,2016-03-30,"""Voluntary (No legal requiremen…",24,72,4,"""Natural Gas""",,,,634,76.36,37.39,109.58,66.78,"""No""",,,4.0,"""Heating and Natural Ventilatio…","""New Road Day Centre, 5 Balliol…","""Portsmouth""","""Portsmouth South""","""PORTSMOUTH""",2016-03-30 00:00:00,,,


In [21]:
print(type(uprn_matched))
# peek at the first 10 results; only the last expression of a cell is displayed,
# so the slice has to come after the print
uprn_matched[:10]

<class 'list'>


In [23]:
# flatten the match results into columns, ignoring the None placeholders
# that were appended for records without any candidate
matched_rows = [match for match in uprn_matched if match is not None]
lmk_key = [match[0] for match in matched_rows]
uprn = [match[1] for match in matched_rows]
score = [match[2] for match in matched_rows]

data = {'lmk-key': lmk_key, 'uprn': uprn, 'score': score}
df_matched = pl.DataFrame(
    data,
    schema={'lmk-key': pl.String, 'uprn': pl.String, 'score': pl.Float32},
)

# persist the candidates; create the output folder on first run
os.makedirs('output', exist_ok=True)
df_matched.write_parquet('output/epc-to-uprn.parquet')

In [36]:
# display the matched (lmk-key, uprn, score) candidates
df_matched

lmk-key,uprn,score
str,str,f64
"""802aaf5c523849468ab92e91e429a1…","""10024328586""",82.758621
"""802aaf5c523849468ab92e91e429a1…","""10024328587""",82.758621
"""802aaf5c523849468ab92e91e429a1…","""10024328588""",82.758621
"""802aaf5c523849468ab92e91e429a1…","""10024328589""",82.758621
"""802aaf5c523849468ab92e91e429a1…","""10024328620""",82.758621
…,…,…
"""28d157aa1ce61ead8489adb7024e41…","""1775096899""",96.551724
"""28d157aa1ce61ead8489adb7024e41…","""1775004442""",93.333333
"""28d157aa1ce61ead8489adb7024e41…","""1775004432""",92.857143
"""28d157aa1ce61ead8489adb7024e41…","""1775004433""",92.857143
