In [1]:
import polars as pl
from dotenv import load_dotenv
import os
from datetime import datetime
from rapidfuzz import process, fuzz
load_dotenv()

True

In [2]:
# Load the CCOD CSV.
# HMLR_CCOD_OUTPUT_FILE holds the name of the zip archive; the CSV is read from
# downloads/<root>/<root>.csv, where <root> is that name with '.zip' removed.
ccod_zip_file_name = os.getenv('HMLR_CCOD_OUTPUT_FILE')
if not ccod_zip_file_name:
    # Without this guard a missing variable surfaces as an opaque
    # AttributeError ('NoneType' has no attribute 'replace') on the next line.
    raise RuntimeError('HMLR_CCOD_OUTPUT_FILE is not set (check the environment or the .env file)')
ccod_file_root = ccod_zip_file_name.replace('.zip', '')
df_ccod = pl.read_csv(f'downloads/{ccod_file_root}/{ccod_file_root}.csv')

In [3]:
df_ccod.height  # number of CCOD rows: 4181223, ~4M

4181223

In [4]:
# load OS
# Collect the OS file names in sorted order.
# OS_FILES_PATH must be set: os.listdir(None) silently lists the current
# working directory, which would feed unrelated files into the loader.
os_files_path = os.getenv('OS_FILES_PATH')
if not os_files_path:
    raise RuntimeError('OS_FILES_PATH is not set (check the environment or the .env file)')
csv_file_names = sorted(os.listdir(os_files_path))

In [5]:
# OS record types to load. Only type 28 is enabled for now; as a minimum a
# fuller match would need 21, 24 and 28 (candidate list: [21,23,24,28,31,32]).
record_types = [28]
# Loaded frames, keyed by integer record type; filled in by the loading loop.
record_dfs = {}

In [6]:
# Read each OS file whose record type is wanted and store it under that type.
# The record type is parsed from characters 2-3 of the file name
# (e.g. 'ID28_DPA_Records.csv' -> 28).
for file_name in csv_file_names:
    try:
        record_type = int(file_name[2:4])
    except ValueError:
        # Name does not carry a numeric record type at that position (e.g. a
        # hidden or stray file): skip it instead of aborting the whole load.
        continue
    if record_type not in record_types:
        continue
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    # infer_schema_length=0 turns off dtype inference, so every column is read as a string.
    record_dfs[record_type] = pl.read_csv(f'{os_files_path}/{file_name}', infer_schema_length=0)

print('done')

file_name: [ID28_DPA_Records.csv], time: [18:24:00]
done


In [7]:
# Sanity check: print the number of rows loaded for each record type.
for record_type, record_df in record_dfs.items():
    print(f'type: [{record_type}], count: [{record_df.height}]')

## Counts noted previously (presumably from a run with more record types enabled):
## type: [21], count: [40825714]
## type: [23], count: [197964510]
## type: [28], count: [30598226]
## type: [31], count: [1422087]
## type: [32], count: [44684909]

type: [28], count: [30598226]


In [101]:
# Attempt the 'match' as a join/select to speed it up: keep one type-28 record
# per postcode and left-join it onto CCOD by postcode.
#
# Sorting by POSTCODE then UPRN and keeping the first row in order makes the
# chosen UPRN reproducible; unique() does not promise which duplicate survives
# unless keep/maintain_order are given. UPRN is a string column here, so the
# "first" UPRN is the lexicographically smallest one in each postcode.
dpa_unique_postcodes = (
    record_dfs[28]
    .sort(['POSTCODE', 'UPRN'])
    .unique(subset=['POSTCODE'], keep='first', maintain_order=True)
)
# Left join keeps every CCOD row; rows whose postcode has no match get a null UPRN.
# NOTE(review): postcodes are compared verbatim - spacing/case differences
# between the two sources would show up as non-matches; confirm the formats agree.
left_joined = df_ccod.join(dpa_unique_postcodes, how='left', left_on='Postcode', right_on='POSTCODE')
mock_poly = left_joined.select(['Title Number', 'UPRN'])
mock_poly.head()

Title Number,UPRN
str,str
"""BB152""",
"""BB154""","""100023474659"""
"""BB181""","""100023473825"""
"""BB232""","""100023346266"""
"""BB263""",


In [104]:
# Join sanity check: joined row count, rows left without a UPRN, CCOD row count.
joined_rows = mock_poly.height
unmatched_rows = mock_poly['UPRN'].null_count()  # 40% failed matching to a postcode..
ccod_rows = df_ccod.height
print(joined_rows)
print(unmatched_rows)
print(ccod_rows)

4181223
1671180
4181223


In [102]:
# Write the Title Number / UPRN table to parquet, creating the target folder if needed.
parquet_path = 'output/mock-poly.parquet'
os.makedirs(os.path.dirname(parquet_path), exist_ok=True)
mock_poly.write_parquet(parquet_path)