In [23]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from datetime import datetime
import os_record_types as os_records

In [None]:
# load all files to single df
load_dotenv()
zip_file_name = os.getenv('OS_FILENAME')
zip_dir = zip_file_name.replace('.zip', '')
csv_file_names = sorted(os.listdir(zip_dir))

record_dfs = {}

In [42]:

for file_name in csv_file_names[:250]:
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    temp_df = pd.read_csv(f'{zip_dir}/{file_name}', header=None, names=range(30), dtype=object)
    # build individual dfs
    for record_type in os_records.record_types.keys():
        records_to_add = temp_df[temp_df[0].astype(int) == record_type]
        count = len(records_to_add)
        print(f'record_type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}], count: {count}')
        if count == 0: continue
        if record_type in record_dfs:
            record_dfs[record_type] = pd.concat([record_dfs[record_type], records_to_add]) 
        else:
            record_dfs[record_type] = records_to_add

print('done')

file_name: [AddressBasePremium_FULL_2024-05-29_101.csv], time: [14:48:04]
record_type: [11], time: [14:48:06], count: 0
record_type: [15], time: [14:48:06], count: 0
record_type: [21], time: [14:48:06], count: 0
record_type: [23], time: [14:48:06], count: 1000000
record_type: [24], time: [14:49:00], count: 0
record_type: [28], time: [14:49:00], count: 0
record_type: [30], time: [14:49:00], count: 0
record_type: [31], time: [14:49:01], count: 0
record_type: [32], time: [14:49:01], count: 0
file_name: [AddressBasePremium_FULL_2024-05-29_102.csv], time: [14:49:01]
record_type: [11], time: [14:49:03], count: 0
record_type: [15], time: [14:49:03], count: 0
record_type: [21], time: [14:49:03], count: 0
record_type: [23], time: [14:49:03], count: 1000000
record_type: [24], time: [14:57:26], count: 0
record_type: [28], time: [14:57:26], count: 0
record_type: [30], time: [14:57:26], count: 0
record_type: [31], time: [14:57:26], count: 0
record_type: [32], time: [14:57:26], count: 0
file_name: [

KeyboardInterrupt: 

In [41]:
# remove excess cols
for record_type in record_dfs.keys():
    if record_dfs[record_type].empty: continue
    columns_to_drop = list(record_dfs[record_type].columns[record_dfs[record_type].isnull().all()])
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}], cols_to_drop: [{columns_to_drop}]')
    record_dfs[record_type].drop(columns=columns_to_drop, inplace=True)

type: [11], time: [14:31:02], cols_to_drop: [[]]
type: [15], time: [14:31:03], cols_to_drop: [[]]
type: [21], time: [14:31:24], cols_to_drop: [[]]
type: [23], time: [14:32:31], cols_to_drop: [[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]


In [10]:
# add col names
for record_type in record_dfs.keys():
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    record_dfs[record_type].columns = os_records.record_types[record_type].ColNames

type: [11], time: [11:04:11]
type: [15], time: [11:04:11]
type: [21], time: [11:04:11]
type: [23], time: [11:04:11]


In [28]:
# set key dtypes
for record_type in record_dfs.keys():
    record = os_records.record_types[record_type]
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    record_dfs[record_type].astype({record.PrimaryKey: record.PkDtype}, copy=False)
    
# set keys
for record_type in record_dfs.keys():
    record = os_records.record_types[record_type]
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    record_dfs[record_type].set_index(record.PrimaryKey, inplace=True)
    

type: [11], time: [16:17:48]
type: [15], time: [16:17:48]
type: [21], time: [16:17:48]
type: [23], time: [16:17:50]
type: [11], time: [16:17:50]
type: [15], time: [16:17:50]
type: [21], time: [16:17:50]
type: [23], time: [16:17:50]


In [37]:
# joins
xrefs_with_uarn = record_dfs[23][record_dfs[23]['source'] == '7666VN']
blpu = record_dfs[21]
blpu_with_uarn = blpu.join(xrefs_with_uarn, lsuffix='_blpu', rsuffix='_xref', on='uprn', how='left')
# blpu_with_uarn = blpu.join(xrefs_with_uarn, lsuffix='_blpu', rsuffix='_xref', on='uprn', how='outer')


In [44]:
for record_type in record_dfs.keys():
    print(f'type: [{record_type}], count: [{len(record_dfs[record_type])}]')

type: [11], count: [1507301]
type: [15], count: [1630431]
type: [21], count: [40825714]
type: [23], count: [57174286]


In [38]:
blpu_with_uarn[0:10] # where uarn has value

Unnamed: 0_level_0,record_identifier_blpu,change_type_blpu,pro_order_blpu,logical_status,blpu_state,blpu_state_date,parent_uprn,x_coordinate,y_coordinate,latitude,...,change_type_xref,pro_order_xref,uprn,cross_reference,version,source,start_date_xref,end_date_xref,last_update_date_xref,entry_date_xref
uprn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10010385373,21,I,1,1,2,2007-12-05,,408836.0,547677.0,54.8238677,...,,,,,,,,,,
1775112186,21,I,2,1,2,2007-11-14,,466331.94,100670.21,50.8017925,...,,,,,,,,,,
192000731,21,I,3,1,2,2008-06-05,192000726.0,90352.0,10573.0,49.914986,...,,,,,,,,,,
10001092589,21,I,4,8,4,2012-12-14,10001092587.0,400747.0,91760.0,50.7254516,...,,,,,,,,,,
90082466,21,I,5,1,2,2007-10-09,,391059.0,293558.0,52.5398198,...,,,,,,,,,,
90121615,21,I,6,8,4,2010-01-14,,391322.0,294743.0,52.5504771,...,,,,,,,,,,
37026889,21,I,7,1,2,2007-08-08,,459603.0,98831.0,50.7859855,...,,,,,,,,,,
90002293,21,I,8,1,2,2007-10-09,,389800.0,291986.0,52.5256652,...,,,,,,,,,,
49065083,21,I,9,1,2,2008-09-11,49065045.0,241711.39,258452.63,52.2015256,...,,,,,,,,,,
32042286,21,I,10,1,2,2001-02-16,,396312.0,293150.0,52.5362141,...,,,,,,,,,,


In [35]:
len(xrefs_with_uarn) # 54378 - quite disappointing, I thought there'd be a lot more

54378