In [1]:
import polars as pl
from dotenv import load_dotenv
import os
from datetime import datetime

In [2]:
# Pull settings from the .env file (if any) into the process environment.
load_dotenv()

# VOA_FILENAME names the downloaded archive; fail early with a clear message
# instead of an AttributeError on None when it has not been configured.
zip_file_name = os.getenv('VOA_FILENAME')
if not zip_file_name:
    raise RuntimeError("Environment variable 'VOA_FILENAME' is not set (check your .env file)")

# The CSVs are read from a directory named like the archive minus '.zip'.
zip_dir = zip_file_name.replace('.zip', '')

# Sorted so the files are processed in a deterministic order.
csv_file_names = sorted(os.listdir(zip_dir))

In [3]:
# Column name -> polars dtype for the header-less VOA CSV files.
# Insertion order is the column order. Only the two columns listed in the
# membership test are read as Int64; every other column is read as String.
schema = {
    column: (pl.Int64 if column in ('entry_number', 'rateable_value') else pl.String)
    for column in (
        'entry_number',
        'bill_auth_code',
        'ndr_community_code',
        'ba_ref_number',
        'pri_sec_desc_code',
        'pri_desc',
        'uarn',  # is uint64, but aodc can't deal with that
        'full_property_identifier',
        'firm_name',
        'number_or_name',
        'street',
        'town',
        'postal_district',
        'county',
        'postcode',
        'effective_date',
        'composite_indicator',
        'rateable_value',
        'appeal_stmt_code',
        'assessment_ref',  # is uint64, but aodc can't deal with that
        'list_alteration_date',
        'scat_code_and_suffix',
        'sub_street_lvl3',
        'sub_street_lvl2',
        'sub_street_lvl1',
        'case_number',  # is uint64, but aodc can't deal with that
        'current_from_date',
        'current_to_date',
        '29',  # spare trailing column: some lines have an extra comma
    )
}

# Read every non-historic CSV and stack the results into a single frame.
# Each file is collected into a list and concatenated once at the end, so
# that no file's rows are lost when more than one file matches.
voa_frames = []
for file_name in csv_file_names:
    # Historic list entries are deliberately excluded from this extract.
    if '-historic-' in file_name:
        continue
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    # Files have no header row and use '*' as the field separator.
    voa_frames.append(
        pl.read_csv(f'{zip_dir}/{file_name}', has_header=False, schema=schema, separator='*')
    )

# pl.concat rejects an empty list, so keep an empty frame when nothing matched.
df_voa = pl.concat(voa_frames) if voa_frames else pl.DataFrame()

print('done')

file_name: [uk-englandwales-ndr-2023-listentries-compiled-epoch-0008-baseline-csv.csv], time: [14:11:16]
done


In [4]:
# Write the loaded VOA data to a parquet file inside OUTPUTS_FOLDER.
output_folder = os.getenv('OUTPUTS_FOLDER')
if not output_folder:
    # Without this check os.makedirs(None) fails with an unhelpful TypeError.
    raise RuntimeError("Environment variable 'OUTPUTS_FOLDER' is not set (check your .env file)")

# Create the destination on first run; a no-op when it already exists.
os.makedirs(output_folder, exist_ok=True)
df_voa.write_parquet(f'{output_folder}/voa-business-rates.parquet')