In [1]:
import polars as pl
import numpy as np
from dotenv import load_dotenv
import os
from datetime import datetime
import os_record_types as os_records

In [13]:
# Locate the extracted archive directory and list the CSV parts to load.
load_dotenv()
zip_file_name = os.getenv('OS_FILENAME')
if not zip_file_name:
    # Fail early with a clear message instead of an AttributeError on None below.
    raise RuntimeError('OS_FILENAME is not set; define it in the environment or in .env')

# The data directory is the zip file name without its extension. Only a
# trailing '.zip' is stripped, so a '.zip' elsewhere in the path is preserved.
zip_dir = zip_file_name[:-len('.zip')] if zip_file_name.endswith('.zip') else zip_file_name

# Sorted so the parts are processed in a deterministic order; anything that is
# not a CSV (hidden files, readmes, ...) is skipped rather than parsed as data.
csv_file_names = sorted(
    name for name in os.listdir(zip_dir) if name.lower().endswith('.csv')
)

# Maps record type -> DataFrame holding every row of that type.
record_dfs = {}

In [14]:
# Load the CSV parts and split their rows into one DataFrame per record type.
MAX_FILES = 250   # upper bound on the number of CSV parts read in one run
N_COLUMNS = 30    # every file is read with this many positional columns

# The files have no header row, so columns are named '0'..'29' and all read
# as strings. Column '0' is the record type the rows are split on below.
schema = {str(i): pl.String for i in range(N_COLUMNS)}

# Collect the per-file slices first and concatenate once per record type at
# the end. Starting from an empty dict here keeps the cell idempotent:
# re-running it rebuilds record_dfs instead of appending the same rows again.
record_parts = {}

for file_name in csv_file_names[:MAX_FILES]:
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    temp_df = pl.read_csv(f'{zip_dir}/{file_name}', has_header=False, schema=schema)
    # build individual dfs
    for record_type in os_records.record_types.keys():
        records_to_add = temp_df.filter(pl.col('0') == str(record_type))
        count = records_to_add.height
        print(f'record_type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}], count: {count}')
        if count == 0:
            continue
        record_parts.setdefault(record_type, []).append(records_to_add)

# Keys appear in the order each record type was first seen with data.
record_dfs = {
    record_type: pl.concat(parts)
    for record_type, parts in record_parts.items()
}

print('done')

file_name: [AddressBasePremium_FULL_2024-05-29_001.csv], time: [18:42:49]
record_type: [11], time: [18:42:50], count: 1000000
record_type: [15], time: [18:42:50], count: 0
record_type: [21], time: [18:42:50], count: 0
record_type: [23], time: [18:42:50], count: 0
record_type: [24], time: [18:42:50], count: 0
record_type: [28], time: [18:42:50], count: 0
record_type: [30], time: [18:42:50], count: 0
record_type: [31], time: [18:42:50], count: 0
record_type: [32], time: [18:42:50], count: 0
file_name: [AddressBasePremium_FULL_2024-05-29_002.csv], time: [18:42:50]
record_type: [11], time: [18:42:50], count: 507301
record_type: [15], time: [18:42:50], count: 492699
record_type: [21], time: [18:42:50], count: 0
record_type: [23], time: [18:42:50], count: 0
record_type: [24], time: [18:42:50], count: 0
record_type: [28], time: [18:42:50], count: 0
record_type: [30], time: [18:42:50], count: 0
record_type: [31], time: [18:42:50], count: 0
record_type: [32], time: [18:42:50], count: 0
file_nam

In [45]:
# remove excess cols
# Only all-null columns positioned beyond the number of field names declared
# for the record type are dropped. A declared field that happens to be
# entirely null is kept, so the column count still matches ColNames when the
# names are assigned. Selecting by position also makes the cell safe to
# re-run after the columns have been renamed.
for record_type in record_dfs.keys():
    frame = record_dfs[record_type]
    if frame.is_empty(): continue
    expected_width = len(os_records.record_types[record_type].ColNames)
    # null_count() yields a single row holding the null count of each column.
    null_counts = frame.null_count().row(0)
    all_null_cols = {
        name for name, n_null in zip(frame.columns, null_counts)
        if n_null == frame.height
    }
    columns_to_drop = [
        name for name in frame.columns[expected_width:]
        if name in all_null_cols
    ]
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}], cols_to_drop: [{columns_to_drop}]')
    record_dfs[record_type] = frame.drop(columns_to_drop)

type: [11], time: [11:52:12], cols_to_drop: [['24', '25', '26', '27', '28', '29']]
type: [15], time: [11:52:12], cols_to_drop: [['13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']]
type: [21], time: [11:52:12], cols_to_drop: [['22', '23', '24', '25', '26', '27', '28', '29']]
type: [23], time: [11:52:12], cols_to_drop: [['12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']]
type: [24], time: [11:52:12], cols_to_drop: [['26', '27', '28', '29']]


In [46]:
# add col names
# Swap each frame's column labels for the names listed in the record type's
# ColNames; the frame is relabelled in place.
for record_type, frame in record_dfs.items():
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f'type: [{record_type}], time: [{stamp}]')
    frame.columns = os_records.record_types[record_type].ColNames

type: [11], time: [11:53:03]
type: [15], time: [11:53:03]
type: [21], time: [11:53:03]
type: [23], time: [11:53:03]
type: [24], time: [11:53:03]


In [28]:
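# Not required in polars: DataFrames have no index, so there is no key to set.
# The code below is kept commented out for reference only.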
# set key dtypes
# for record_type in record_dfs.keys():
#     record = os_records.record_types[record_type]
#     print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
#     record_dfs[record_type] = record_dfs[record_type].cast({record.PrimaryKey: record.PkDtype})
#     
# # set keys
# for record_type in record_dfs.keys():
#     record = os_records.record_types[record_type]
#     print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
#     record_dfs[record_type] = record_dfs[record_type].set_index(record.PrimaryKey)
    

type: [11], time: [16:17:48]
type: [15], time: [16:17:48]
type: [21], time: [16:17:48]
type: [23], time: [16:17:50]
type: [11], time: [16:17:50]
type: [15], time: [16:17:50]
type: [21], time: [16:17:50]
type: [23], time: [16:17:50]


In [48]:
# joins
# Keep only the type-23 rows whose 'source' is '7666VN' (the variable names
# suggest these cross-references carry the UARN; confirm against the data
# specification), then left-join them onto the type-21 rows by 'uprn' so that
# every type-21 row is retained whether or not it has a match.
xrefs_with_uarn = record_dfs[23].filter(pl.col('source').eq('7666VN'))
blpu = record_dfs[21]
blpu_with_uarn = blpu.join(other=xrefs_with_uarn, how='left', on='uprn')


In [47]:
# Report how many rows were collected for each record type.
for record_type, frame in record_dfs.items():
    print(f'type: [{record_type}], count: [{frame.height}]')

type: [11], count: [1507301]
type: [15], count: [1630431]
type: [21], count: [40825714]
type: [23], count: [197964510]
type: [24], count: [7209776]


In [50]:
# Preview the first 10 joined rows (unfiltered). The right-hand columns are
# populated only on rows where the left join found a matching cross-reference.
blpu_with_uarn.head(10)

record_identifier,change_type,pro_order,uprn,logical_status,blpu_state,blpu_state_date,parent_uprn,x_coordinate,y_coordinate,latitude,longitude,rpc,local_custodian_code,country,start_date,end_date,last_update_date,entry_date,addressbase_postal,postcode_locator,multi_occ_count,record_identifier_right,change_type_right,pro_order_right,xref_key,cross_reference,version,source,start_date_right,end_date_right,last_update_date_right,entry_date_right
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""21""","""I""","""1""","""10010385373""","""1""","""2""","""2007-12-05""",,"""408836.00""","""547677.00""","""54.8238677""","""-1.8640101""","""1""","""1355""","""E""","""2008-02-04""",,"""2016-02-10""","""2007-08-25""","""D""","""DH8 9AL""","""0""",,,,,,,,,,,
"""21""","""I""","""2""","""1775112186""","""1""","""2""","""2007-11-14""",,"""466331.94""","""100670.21""","""50.8017925""","""-1.0600678""","""1""","""1775""","""E""","""2007-11-28""",,"""2016-02-10""","""2007-11-14""","""D""","""PO3 6FR""","""0""",,,,,,,,,,,
"""21""","""I""","""3""","""192000731""","""1""","""2""","""2008-06-05""","""192000726""","""090352.00""","""010573.00""","""49.9149860""","""-6.3150173""","""2""","""835""","""E""","""2008-03-21""",,"""2018-11-12""","""2005-11-03""","""D""","""TR21 0LN""","""0""","""23""","""I""","""826501""","""0835X044640970""","""12789040000""",,"""7666VN""","""2020-07-15""",,"""2020-07-15""","""2020-07-06"""
"""21""","""I""","""4""","""10001092589""","""8""","""4""","""2012-12-14""","""10001092587""","""400747.00""","""091760.00""","""50.7254516""","""-1.9907750""","""1""","""1260""","""E""","""2008-01-31""","""2012-12-17""","""2019-04-24""","""2004-11-25""","""L""","""BH15 2BD""","""0""",,,,,,,,,,,
"""21""","""I""","""5""","""90082466""","""1""","""2""","""2007-10-09""",,"""391059.00""","""293558.00""","""52.5398198""","""-2.1332520""","""1""","""4615""","""E""","""2008-01-03""",,"""2018-02-25""","""2001-02-12""","""D""","""DY3 3LY""","""0""",,,,,,,,,,,
"""21""","""I""","""6""","""90121615""","""8""","""4""","""2010-01-14""",,"""391322.00""","""294743.00""","""52.5504771""","""-2.1294055""","""1""","""4615""","""E""","""2008-01-03""","""2010-01-19""","""2016-02-10""","""2001-02-12""","""N""","""DY3 3RF""","""0""",,,,,,,,,,,
"""21""","""I""","""7""","""37026889""","""1""","""2""","""2007-08-08""",,"""459603.00""","""098831.00""","""50.7859855""","""-1.1558435""","""2""","""1725""","""E""","""2007-09-07""",,"""2019-11-11""","""2001-07-06""","""D""","""PO12 2NQ""","""0""",,,,,,,,,,,
"""21""","""I""","""8""","""90002293""","""1""","""2""","""2007-10-09""",,"""389800.00""","""291986.00""","""52.5256652""","""-2.1517662""","""1""","""4615""","""E""","""2008-01-03""",,"""2018-08-11""","""2001-02-12""","""D""","""DY3 3AS""","""0""",,,,,,,,,,,
"""21""","""I""","""9""","""49065083""","""1""","""2""","""2008-09-11""","""49065045""","""241711.39""","""258452.63""","""52.2015256""","""-4.3176541""","""1""","""6820""","""W""","""2008-09-11""",,"""2016-02-06""","""2008-09-11""","""C""","""SA45 9ST""","""0""",,,,,,,,,,,
"""21""","""I""","""10""","""32042286""","""1""","""2""","""2001-02-16""",,"""396312.00""","""293150.00""","""52.5362141""","""-2.0557971""","""1""","""4620""","""E""","""2008-01-07""",,"""2016-02-10""","""2001-02-16""","""D""","""DY4 0PE""","""0""",,,,,,,,,,,


In [53]:
# Number of cross-reference rows kept by the 'source' filter.
xrefs_with_uarn.height  # 2,048,417

2048417

In [16]:
# Preview the first 10 rows collected for record type 11.
record_dfs[11].head(10)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""11""","""I""","""1""","""82001558""","""1""","""2114""","""2""","""1998-11-09""","""1""","""8""","""0""","""2008-02-07""",,"""2018-02-25""","""1998-11-09""","""461680.00""","""092102.00""","""50.7252637""","""-1.1275089""","""462058.00""","""091919.00""","""50.7235780""","""-1.1221851""","""10""",,,,,,
"""11""","""I""","""2""","""24001239""","""1""","""230""","""2""","""1998-07-08""","""1""","""8""","""0""","""2007-12-17""",,"""2016-02-10""","""1998-07-08""","""505402.00""","""224191.00""","""51.9063042""","""-.4692386""","""505328.00""","""224212.00""","""51.9065069""","""-.4703075""","""5""",,,,,,
"""11""","""I""","""3""","""38504382""","""2""","""1625""","""2""","""2014-07-17""","""2""","""4""","""0""","""2014-07-21""",,"""2016-02-10""","""2014-07-17""","""388683.00""","""202256.00""","""51.7189176""","""-2.1652157""","""388829.00""","""202351.00""","""51.7197748""","""-2.1631053""","""10""",,,,,,
"""11""","""I""","""4""","""21403416""","""3""","""5510""",,,"""2""","""4""","""0""","""2014-09-09""",,"""2016-03-21""","""2014-09-08""","""507920.00""","""189102.00""","""51.5904457""","""-.4435334""","""507784.00""","""189195.00""","""51.5913076""","""-.4454672""","""1""",,,,,,
"""11""","""I""","""5""","""28103634""","""1""","""2620""","""2""","""2009-06-02""",,"""8""","""0""","""2009-06-18""",,"""2016-02-10""","""2009-06-02""","""606344.00""","""332213.00""","""52.8478932""","""1.0628166""","""606365.00""","""332316.00""","""52.8488097""","""1.0631931""","""10""",,,,,,
"""11""","""I""","""6""","""41402002""","""1""","""1570""",,,,,"""0""","""2007-10-22""",,"""2016-02-10""","""2007-09-06""","""554465.00""","""238234.00""","""52.0210365""",""".2497630""","""554472.00""","""238395.00""","""52.0224809""",""".2499375""","""10""",,,,,,
"""11""","""I""","""7""","""37500332""","""1""","""4315""","""2""","""2007-12-05""","""1""","""8""","""0""","""2008-02-04""",,"""2016-02-10""","""2003-12-12""","""348347.00""","""391721.00""","""53.4197621""","""-2.7786800""","""348179.00""","""391539.00""","""53.4181098""","""-2.7811776""","""0""",,,,,,
"""11""","""I""","""8""","""40901928""","""1""","""1145""","""2""","""2005-07-22""","""1""","""8""","""0""","""2008-01-02""",,"""2016-02-10""","""2005-07-22""","""242119.00""","""120765.00""","""50.9646097""","""-4.2497336""","""242033.00""","""120696.00""","""50.9639661""","""-4.2509273""","""10""",,,,,,
"""11""","""I""","""9""","""14407009""","""1""","""1720""",,,,,"""0""","""2007-10-17""",,"""2016-02-10""","""2004-12-06""","""455203.00""","""103326.00""","""50.8268387""","""-1.2175801""","""455177.00""","""103259.00""","""50.8262387""","""-1.2179593""","""10""",,,,,,
"""11""","""I""","""10""","""38780737""","""2""","""4525""","""2""","""2002-04-10""","""1""","""8""","""0""","""2008-01-03""",,"""2016-02-10""","""2002-04-10""","""434976.00""","""548452.00""","""54.8296966""","""-1.4570865""","""434875.00""","""548441.00""","""54.8296047""","""-1.4586600""","""10""",,,,,,


In [52]:

# Persist the joined table as parquet. The target folder is created first so
# the write does not fail on a fresh checkout where it does not exist yet.
output_path = 'downloads/blpu.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
blpu_with_uarn.write_parquet(output_path)