In [23]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from datetime import datetime
import os_record_types as os_records

In [2]:
# Load the extracted CSV files and split their rows into one DataFrame per
# record type (record_dfs: record_type -> DataFrame).
#
# The directory to read is derived from the OS_FILENAME environment variable
# by removing '.zip'; this assumes the archive has already been extracted there.
load_dotenv()
zip_file_name = os.getenv('OS_FILENAME')
if not zip_file_name:
    # os.getenv returns None for a missing variable; fail with a clear message
    # instead of an AttributeError on the .replace() call below.
    raise RuntimeError('Environment variable OS_FILENAME is not set')
zip_dir = zip_file_name.replace('.zip', '')
csv_file_names = sorted(os.listdir(zip_dir))

# NOTE(review): only the first MAX_FILES files (in sorted name order) are read;
# confirm this cap is intentional and not a leftover from experimentation.
MAX_FILES = 50

# record_type -> list of per-file row subsets. The pieces are concatenated once
# at the end, because calling pd.concat inside the loop re-copies the whole
# accumulated frame for every file.
record_chunks = {}

for file_name in csv_file_names[:MAX_FILES]:
    print(f'file_name: [{file_name}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    # Files have no header row; every file is read with 30 positional columns,
    # all as object dtype.
    temp_df = pd.read_csv(f'{zip_dir}/{file_name}', header=None, names=range(30), dtype=object)
    # Column 0 holds the record type; convert it once per file instead of once
    # per record type.
    record_ids = temp_df[0].astype(int)
    # build individual dfs
    for record_type in os_records.record_types.keys():
        records_to_add = temp_df[record_ids == record_type]
        count = len(records_to_add)
        print(f'record_type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}], count: {count}')
        if count == 0:
            continue
        record_chunks.setdefault(record_type, []).append(records_to_add)

# Build the final frames, releasing each list of pieces as soon as it has been
# concatenated so peak memory stays close to the size of the largest type.
record_dfs = {}
for record_type in list(record_chunks):
    record_dfs[record_type] = pd.concat(record_chunks.pop(record_type))

print('done')

file_name: [AddressBasePremium_FULL_2024-05-29_001.csv], time: [18:37:01]
record_type: [11], time: [18:37:04], count: 1000000
record_type: [15], time: [18:37:04], count: 0
record_type: [21], time: [18:37:04], count: 0
record_type: [23], time: [18:37:04], count: 0
record_type: [24], time: [18:37:04], count: 0
record_type: [28], time: [18:37:04], count: 0
record_type: [30], time: [18:37:04], count: 0
record_type: [31], time: [18:37:04], count: 0
record_type: [32], time: [18:37:04], count: 0
file_name: [AddressBasePremium_FULL_2024-05-29_002.csv], time: [18:37:04]
record_type: [11], time: [18:37:06], count: 507301
record_type: [15], time: [18:37:06], count: 492699
record_type: [21], time: [18:37:06], count: 0
record_type: [23], time: [18:37:06], count: 0
record_type: [24], time: [18:37:06], count: 0
record_type: [28], time: [18:37:06], count: 0
record_type: [30], time: [18:37:06], count: 0
record_type: [31], time: [18:37:06], count: 0
record_type: [32], time: [18:37:06], count: 0
file_nam

In [9]:
# remove excess cols
# Every file was read with a fixed number of positional columns, so each
# record-type frame carries columns that are null in every row. Drop those
# columns from each frame in place.
for record_type, frame in record_dfs.items():
    if frame.empty:
        continue
    all_null_mask = frame.isna().all()
    columns_to_drop = list(frame.columns[all_null_mask])
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}], cols_to_drop: [{columns_to_drop}]')
    frame.drop(columns=columns_to_drop, inplace=True)

type: [11], time: [11:01:54]
type: [11], time: [11:01:54], cols_to_drop: [[24, 25, 26, 27, 28, 29]]
type: [15], time: [11:01:55]
type: [15], time: [11:01:55], cols_to_drop: [[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]
type: [21], time: [11:01:55]
type: [21], time: [11:02:17], cols_to_drop: [[22, 23, 24, 25, 26, 27, 28, 29]]
type: [23], time: [11:02:38]
type: [23], time: [11:02:40], cols_to_drop: [[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]


In [10]:
# add col names
# Replace each frame's column labels with the ColNames declared for its record
# type in os_records. pandas raises a length-mismatch ValueError if the number
# of names differs from the number of columns currently in the frame.
for record_type, frame in record_dfs.items():
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    column_names = os_records.record_types[record_type].ColNames
    frame.columns = column_names

type: [11], time: [11:04:11]
type: [15], time: [11:04:11]
type: [21], time: [11:04:11]
type: [23], time: [11:04:11]


In [28]:
# set key dtypes
# Cast each frame's primary-key column to the dtype declared for its record
# type. DataFrame.astype returns a new frame rather than converting in place,
# so the result must be stored back into record_dfs; without the assignment
# the cast is silently discarded.
# NOTE(review): once the cast takes effect, later joins on these keys compare
# the converted values - confirm both sides of each join end up with matching
# dtypes.
for record_type in list(record_dfs.keys()):
    record = os_records.record_types[record_type]
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    record_dfs[record_type] = record_dfs[record_type].astype({record.PrimaryKey: record.PkDtype}, copy=False)

# set keys
# Move each frame's primary-key column into the index (in place).
for record_type in record_dfs.keys():
    record = os_records.record_types[record_type]
    print(f'type: [{record_type}], time: [{datetime.now().strftime("%H:%M:%S")}]')
    record_dfs[record_type].set_index(record.PrimaryKey, inplace=True)
    

type: [11], time: [16:17:48]
type: [15], time: [16:17:48]
type: [21], time: [16:17:48]
type: [23], time: [16:17:50]
type: [11], time: [16:17:50]
type: [15], time: [16:17:50]
type: [21], time: [16:17:50]
type: [23], time: [16:17:50]


In [30]:
# joins
# Keep only the type-23 rows whose 'source' equals '7666VN', then outer-join
# them onto the type-21 frame. Overlapping column names get the '_blpu' /
# '_xref' suffixes.
xrefs = record_dfs[23]
xrefs_with_uarn = xrefs[xrefs['source'] == '7666VN']
blpu = record_dfs[21]
# NOTE(review): DataFrame.join matches blpu's 'uprn' (a column or index level)
# against the *index* of xrefs_with_uarn, not against a 'uprn' column in it.
# Confirm that index actually holds uprn values; if it does not, no rows will
# match and the outer join just stacks both frames.
blpu_with_uarn = blpu.join(
    xrefs_with_uarn,
    on='uprn',
    how='outer',
    lsuffix='_blpu',
    rsuffix='_xref',
)


In [26]:
# Report how many rows each record-type frame currently holds.
for record_type, frame in record_dfs.items():
    row_count = len(frame)
    print(f'type: [{record_type}], count: [{row_count}]')

type: [11], count: [1507301]
type: [15], count: [1630431]
type: [21], count: [40825714]
type: [23], count: [5174286]


In [32]:
# Row count of the joined frame, for comparison with the per-type counts.
blpu_with_uarn.shape[0]

40880092