In [1]:
import polars as pl
from dotenv import load_dotenv
import os
from datetime import datetime
from rapidfuzz import process, fuzz
load_dotenv()

True

In [3]:
# load CCOD
ccod_zip_file_name = os.getenv('HMLR_CCOD_OUTPUT_FILE')
ccod_file_root = ccod_zip_file_name.replace('.zip', '')
df_ccod =  pl.read_csv(f'../downloads/{ccod_file_root}/{ccod_file_root}.csv')

In [3]:
len(df_ccod) # 4181223, ~4M

4181223

In [6]:
# load OS
os_files_path = os.getenv('OS_FILES_PATH')
csv_file_names = sorted(os.listdir(os_files_path))

In [7]:
record_dfs = {}
# record_types = [21,23,24,28,31,32] # as a min would need 21,24,28 for this
record_types = [28]

In [8]:
# read each file and assign to correct os record type
for file_name in csv_file_names:
    record_type = int(file_name[2:4])
    if record_type not in record_types: continue
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    temp_df = pl.read_csv(f'{os_files_path}/{file_name}', infer_schema_length=0)
    record_dfs[record_type] = temp_df

print('done')

file_name: [ID28_DPA_Records.csv], time: [18:41:36]
done


In [9]:
# check everything is ok
for record_type in record_dfs.keys():
    print(f'type: [{record_type}], count: [{len(record_dfs[record_type])}]')
    
## type: [21], count: [40825714]
## type: [23], count: [197964510]
## type: [28], count: [30598226]
## type: [31], count: [1422087]
## type: [32], count: [44684909]

type: [28], count: [30598226]


In [10]:
# format addresses
def format_abp_address(abp_row):
    # todo: if welsh vs english - req join with 21, blpu.COUNTRY == E or W
    organisation_name = abp_row[5]
    department_name = abp_row[6]
    sub_building_name = abp_row[7]
    building_name = abp_row[8]
    building_number = abp_row[9]
    dependant_thoroughfare = abp_row[10]
    thoroughfare = abp_row[11]
    dependant_locality = abp_row[13]
    post_town = abp_row[14]
    postcode = abp_row[15]
    uprn = abp_row[3]

    formatted_address = ''
    # taylor this for each dataset, epc has an address field that is equiv to name,number,street so currently only using those
    for part in [#organisation_name, department_name, 
        sub_building_name, building_name, building_number,
        dependant_thoroughfare, thoroughfare,
        post_town, f'({postcode})'
    ]:
        if part is not None:
            formatted_address += f'{part}, '

    return formatted_address[:-2], uprn


In [12]:
# attempt the match
uprn_matched = []
count = 0
count_no_postcode = 0
count_postcode_not_found = 0
count_no_address = 0
for row in df_ccod.iter_rows(named=True):
    count += 1
    if count % 10000 == 0:
        print(f'count: [{count}]')
    if row['Postcode'] == '':
        uprn_matched.append(None)
        count_no_postcode += 1
        # at this point you could decide to choose the subset based on town/county/district instead
        continue

    local_abp = record_dfs[28].filter(pl.col('POSTCODE') == row['Postcode'])
    if local_abp.is_empty():
        uprn_matched.append(None)
        count_postcode_not_found += 1
        # same as above, obvs the source postcode is wrong if not found in abp
        continue

    if row['Property Address'] is None:
        uprn_matched.append(None)
        count_no_address += 1
        # sames as above
        continue

    formatted_epc_address = row['Property Address'].upper()

    local_addresses = local_abp.map_rows(format_abp_address)
    local_abp_addresses_0 = local_addresses.get_column('column_0')
    local_abp_addresses_1 = local_addresses.get_column('column_1')

    # ok, this needs some optimising to figure out the best performance vs accuracy. 
    # for first 1000 records in ccod: ratio: 5s; partial_ratio: 5s
    # cdist of both lists, using all available processors, may be better perf.
    matches = process.extract(
        formatted_epc_address,
        local_abp_addresses_0,
        scorer=fuzz.partial_ratio,
        limit=1)
    # print(matches)
    if len(matches) > 0:
        for match in matches:
            uprn_matched.append((
                row['Title Number'],
                local_abp_addresses_1[match[2]], # gets uprn at same index as the match, relative to current postcode
                match[1]))


print('done')
print('postcode missing:', count_no_postcode)
print('postcode not found:', count_postcode_not_found)
print('address missing:', count_no_address)
# took 38m

count: [10000]
count: [20000]
count: [30000]
count: [40000]
count: [50000]
count: [60000]
count: [70000]
count: [80000]
count: [90000]
count: [100000]
count: [110000]
count: [120000]
count: [130000]
count: [140000]
count: [150000]
count: [160000]
count: [170000]
count: [180000]
count: [190000]
count: [200000]
count: [210000]
count: [220000]
count: [230000]
count: [240000]
count: [250000]
count: [260000]
count: [270000]
count: [280000]
count: [290000]
count: [300000]
count: [310000]
count: [320000]
count: [330000]
count: [340000]
count: [350000]
count: [360000]
count: [370000]
count: [380000]
count: [390000]
count: [400000]
count: [410000]
count: [420000]
count: [430000]
count: [440000]
count: [450000]
count: [460000]
count: [470000]
count: [480000]
count: [490000]
count: [500000]
count: [510000]
count: [520000]
count: [530000]
count: [540000]
count: [550000]
count: [560000]
count: [570000]
count: [580000]
count: [590000]
count: [600000]
count: [610000]
count: [620000]
count: [630000]
c

  local_abp = record_dfs[28].filter(pl.col('POSTCODE') == row['Postcode'])


In [13]:
# checking outputs
print(len(uprn_matched))
print(len(uprn_matched.filter(pl.col('UPRN').is_null()))) # 40% failed matching to a postcode.. 
print(len(df_ccod))

4181223


AttributeError: 'list' object has no attribute 'filter'

In [14]:
title_num = []
uprn = []
score = []
for row in uprn_matched:
    if row is None:
        continue
    title_num.append(row[0])
    uprn.append(row[1])
    score.append(row[2])

data = {
    'title_number': title_num,
    'uprn': uprn,
    'score': score
}
df_matched = pl.DataFrame(data, schema={'title_number': pl.String, 'uprn': pl.String, 'score': pl.Float32})
output_folder = os.getenv('OUTPUTS_FOLDER')

os.makedirs(output_folder, exist_ok=True)
df_matched.write_parquet(f'../{output_folder}/hmlr-poly-title-to-uprn.parquet')