In [1]:
import polars as pl
from dotenv import load_dotenv
import os
from datetime import datetime
from rapidfuzz import process, fuzz
load_dotenv()

True

In [2]:
# load CCOD
ccod_zip_file_name = os.getenv('HMLR_CCOD_OUTPUT_FILE')
ccod_file_root = ccod_zip_file_name.replace('.zip', '')
df_ccod =  pl.read_csv(f'downloads/{ccod_file_root}/{ccod_file_root}.csv')

In [3]:
len(df_ccod) # 4181223, ~4M

4181223

In [4]:
# load OS
os_files_path = os.getenv('OS_FILES_PATH')
csv_file_names = sorted(os.listdir(os_files_path))

In [5]:
record_dfs = {}
# record_types = [21,23,24,28,31,32] # as a min would need 21,24,28 for this
record_types = [28]

In [6]:
# read each file and assign to correct os record type
for file_name in csv_file_names:
    record_type = int(file_name[2:4])
    if record_type not in record_types: continue
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    temp_df = pl.read_csv(f'{os_files_path}/{file_name}', infer_schema_length=0)
    record_dfs[record_type] = temp_df

print('done')

file_name: [ID28_DPA_Records.csv], time: [18:24:00]
done


In [7]:
# check everything is ok
for record_type in record_dfs.keys():
    print(f'type: [{record_type}], count: [{len(record_dfs[record_type])}]')
    
## type: [21], count: [40825714]
## type: [23], count: [197964510]
## type: [28], count: [30598226]
## type: [31], count: [1422087]
## type: [32], count: [44684909]

type: [28], count: [30598226]


In [30]:
# format addresses
def format_abp_address(abp_row):
    # todo: if welsh vs english - req join with 21, blpu.COUNTRY == E or W
    organisation_name = abp_row[5]
    department_name = abp_row[6]
    sub_building_name = abp_row[7]
    building_name = abp_row[8]
    building_number = abp_row[9]
    dependant_thoroughfare = abp_row[10]
    thoroughfare = abp_row[11]
    dependant_locality = abp_row[13]
    post_town = abp_row[14]
    postcode = abp_row[15]
    uprn = abp_row[3]
    
    formatted_address = ''
    for part in [organisation_name, department_name, sub_building_name, building_name, building_number, 
                 dependant_thoroughfare, thoroughfare, dependant_locality, post_town, postcode]:
        if part is not None:
            formatted_address += f'{part}, '
            
    return formatted_address[:-2], uprn
    

In [100]:
# attempt the match
uprn_matched = []
count_no_hmlr_postcode = 0
count_hmlr_postcode_not_found = 0
for row_ccod in df_ccod[:4000].iter_rows(named=True):
    if row_ccod['Postcode'] == '': 
        uprn_matched.append(None)
        count_no_hmlr_postcode += 1
        # at this point you could decide to choose the subset based on town/county/district instead
        continue
    
    local_abp = record_dfs[28].filter(pl.col('POSTCODE') == row_ccod['Postcode'])
    if local_abp.is_empty():
        uprn_matched.append(None)
        count_hmlr_postcode_not_found += 1
        # same as above, obvs the postcode is wrong
        continue

    formatted_ccod = row_ccod['Property Address'].upper().replace(' (', ', ').replace(')', '')

    local_addresses = local_abp.map_rows(format_abp_address)
    local_abp_addresses_0 = local_addresses.get_column('column_0')
    local_abp_addresses_1 = local_addresses.get_column('column_1')
    
    # ok, this needs some optimising to figure out the best performance vs accuracy. 
    # using ratio for now as its the quickest (partial_ratio was used in disco). 
    # for first 10 records in ccod: ratio: 20s; partial_ratio: 2min10s
    # cdist of both lists, using all available processors, may be better perf.
    match = process.extract(formatted_ccod, local_abp_addresses_0, scorer=fuzz.ratio, limit=1)
    print(match)
    if len(match) > 0:
        uprn_matched.append(local_abp_addresses_1[match[0][2]])
        

print('done')
print('hmlr postcode missing:', count_no_hmlr_postcode)
print('hmlr postcode not found:', count_hmlr_postcode_not_found)

done
hmlr postcode missing: 489
hmlr postcode not found: 12


In [54]:
# join uprn and id cols
df_ccod.with_columns(uprn_matched)

['10033571957',
 '100023474659',
 '100023473825',
 '100023346266',
 '100023473814',
 '5031705',
 '5088285',
 '100023340238',
 '100023431519',
 '100022771868']

    organisation_name = abp_row['ORGANISATION_NAME']
    department_name = abp_row['DEPARTMENT_NAME']
    sub_building_name = abp_row['SUB_BUILDING_NAME']
    building_name = abp_row['BUILDING_NAME']
    building_number = abp_row['BUILDING_NUMBER']
    dependant_thoroughfare = abp_row['DEPENDANT_THOROUGHFARE']
    thoroughfare = abp_row['THOROUGHFARE']
    dependant_locality = abp_row['DEPENDANT_LOCALITY']
    post_town = abp_row['POST_TOWN']
    postcode = abp_row['POSTCODE']
    uprn = abp_row['UPRN']