In [10]:
import pandas as pd

chunksize = 50000
chunks = pd.read_csv('ETL_Final.csv', chunksize=chunksize)

dfs = []
total_rows = 0

for chunk in chunks:
    total_rows += len(chunk)
    dfs.append(chunk)

df = pd.concat(dfs, ignore_index=True) 

print(total_rows)


6754200


In [11]:
df.head(5)

Unnamed: 0,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,...,Postcode Area,Postcode District,Area Code Average Price (Thousands),Town/City Average Price (Thousands),District Average Price (Thousands),County Average Price (Thousands),LAT,LONG,OSEAST1M,OSNRTH1M
0,2018-09-24,M6 8GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,...,M,M6,286.373466,274.417259,259.866315,264.161265,53.49295,-2.311376,379439.0,399626.0
1,2018-09-28,WA3 2UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,...,WA,WA3,301.647233,295.472854,194.319863,264.161265,53.46955,-2.583977,361332.0,397136.0
2,2018-08-29,OL6 6RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,...,OL,OL6,197.639388,184.326053,196.955406,264.161265,53.48984,-2.074338,395165.0,399237.0
3,2018-06-15,M46 0TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,...,M,M46,286.373466,288.415706,194.319863,264.161265,53.52589,-2.495181,367270.0,403360.0
4,2018-09-28,M28 3XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,...,M,M28,286.373466,288.415706,259.866315,264.161265,53.51677,-2.411523,372810.0,402310.0


In [21]:
df['Postcode'] = df['Postcode'].str.replace(' ', '').str.upper()
df.head()

Unnamed: 0,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,...,Postcode Area,Postcode District,Area Code Average Price (Thousands),Town/City Average Price (Thousands),District Average Price (Thousands),County Average Price (Thousands),LAT,LONG,OSEAST1M,OSNRTH1M
0,2018-09-24,M68GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,...,M,M6,286.373466,274.417259,259.866315,264.161265,53.49295,-2.311376,379439.0,399626.0
1,2018-09-28,WA32UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,...,WA,WA3,301.647233,295.472854,194.319863,264.161265,53.46955,-2.583977,361332.0,397136.0
2,2018-08-29,OL66RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,...,OL,OL6,197.639388,184.326053,196.955406,264.161265,53.48984,-2.074338,395165.0,399237.0
3,2018-06-15,M460TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,...,M,M46,286.373466,288.415706,194.319863,264.161265,53.52589,-2.495181,367270.0,403360.0
4,2018-09-28,M283XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,...,M,M28,286.373466,288.415706,259.866315,264.161265,53.51677,-2.411523,372810.0,402310.0


In [28]:
import os
import pandas as pd
import re
from tqdm import tqdm

def normalize_address_components(*components):
    combined = ' '.join([str(c) for c in components if pd.notna(c)])
    normalized = (
        combined.upper()
        .replace(',', '')
        .replace('.', '')
        .replace('-', ' ')
    )
    normalized = re.sub(r'\s+', ' ', normalized)
    return normalized.strip()

df['postcode_clean'] = df['Postcode'].astype(str).str.replace(' ', '').str.upper()
df['match_key'] = df.apply(
    lambda row: normalize_address_components(
        row.get('primary_addressable_object_name', ''),
        row.get('street', ''),
        row['postcode_clean']
    ),
    axis=1
)

folder = 'epc_certificates_only'
subfolders = [f for f in os.listdir(folder) if os.path.isdir(os.path.join(folder, f))]

output_dir = "matched_epc_rows"
os.makedirs(output_dir, exist_ok=True)

for subfolder in tqdm(subfolders, desc="Processing EPC folders", unit="file"):
    cert_path = os.path.join(folder, subfolder, 'certificates.csv')

    if not os.path.exists(cert_path):
        continue

    try:
        epc_df = pd.read_csv(cert_path, low_memory=False)

        epc_df['postcode_clean'] = epc_df['POSTCODE'].astype(str).str.replace(' ', '').str.upper()
        epc_df['match_key'] = epc_df.apply(
            lambda row: normalize_address_components(
                row.get('ADDRESS1', ''),
                row.get('ADDRESS2', ''),
                row['postcode_clean']
            ),
            axis=1
        )

        matched = epc_df[epc_df['match_key'].isin(df['match_key'])]
        print('Done')

        if not matched.empty:
            out_path = os.path.join(output_dir, f"{subfolder}.parquet")
            matched.to_parquet(out_path, index=False)

    except Exception as e:
        print(f"❌ Error processing {cert_path}: {e}")

Processing EPC folders:   0%|          | 0/347 [00:00<?, ?file/s]

Done


Processing EPC folders:   0%|          | 1/347 [00:05<29:41,  5.15s/file]

Done


Processing EPC folders:   1%|          | 2/347 [00:12<36:22,  6.33s/file]

Done


Processing EPC folders:   1%|          | 3/347 [00:17<33:34,  5.85s/file]

Done


Processing EPC folders:   1%|          | 4/347 [00:24<36:23,  6.37s/file]

Done


Processing EPC folders:   1%|▏         | 5/347 [00:29<33:21,  5.85s/file]

Done


Processing EPC folders:   2%|▏         | 6/347 [00:34<31:07,  5.48s/file]

Done


Processing EPC folders:   2%|▏         | 7/347 [00:42<36:11,  6.39s/file]

Done


Processing EPC folders:   2%|▏         | 8/347 [00:48<34:19,  6.08s/file]

Done


Processing EPC folders:   3%|▎         | 9/347 [00:56<38:12,  6.78s/file]

Done


Processing EPC folders:   3%|▎         | 10/347 [01:06<43:17,  7.71s/file]

Done


Processing EPC folders:   3%|▎         | 11/347 [01:17<49:35,  8.86s/file]

Done


Processing EPC folders:   3%|▎         | 12/347 [01:23<44:45,  8.02s/file]

Done


Processing EPC folders:   4%|▎         | 13/347 [01:29<41:01,  7.37s/file]

Done


Processing EPC folders:   4%|▍         | 14/347 [01:36<39:50,  7.18s/file]

Done


Processing EPC folders:   4%|▍         | 15/347 [01:44<42:03,  7.60s/file]

Done


Processing EPC folders:   5%|▍         | 17/347 [01:59<38:25,  6.99s/file]

Done
Done


Processing EPC folders:   5%|▌         | 19/347 [02:22<48:59,  8.96s/file]

Done


Processing EPC folders:   6%|▌         | 20/347 [02:29<45:24,  8.33s/file]

Done
Done


Processing EPC folders:   6%|▋         | 22/347 [02:46<44:39,  8.24s/file]

Done
Done


Processing EPC folders:   7%|▋         | 23/347 [03:00<54:39, 10.12s/file]

Done


Processing EPC folders:   7%|▋         | 25/347 [03:17<49:34,  9.24s/file]

Done
Done


Processing EPC folders:   7%|▋         | 26/347 [03:26<49:39,  9.28s/file]

Done


Processing EPC folders:   8%|▊         | 27/347 [03:32<44:03,  8.26s/file]

Done


Processing EPC folders:   8%|▊         | 28/347 [03:41<44:46,  8.42s/file]

Done


Processing EPC folders:   8%|▊         | 29/347 [03:48<43:16,  8.16s/file]

Done


Processing EPC folders:   9%|▊         | 30/347 [03:55<40:49,  7.73s/file]

Done


Processing EPC folders:   9%|▉         | 31/347 [04:02<38:43,  7.35s/file]

Done


Processing EPC folders:   9%|▉         | 32/347 [04:08<37:07,  7.07s/file]

Done


Processing EPC folders:  10%|▉         | 34/347 [04:22<35:23,  6.78s/file]

Done


Processing EPC folders:  10%|█         | 35/347 [04:28<33:47,  6.50s/file]

Done


Processing EPC folders:  10%|█         | 36/347 [04:34<33:30,  6.47s/file]

Done


Processing EPC folders:  11%|█         | 37/347 [04:40<32:59,  6.39s/file]

Done


Processing EPC folders:  11%|█         | 38/347 [04:46<31:30,  6.12s/file]

Done


Processing EPC folders:  11%|█         | 39/347 [04:51<30:04,  5.86s/file]

Done


Processing EPC folders:  12%|█▏        | 40/347 [05:00<34:46,  6.80s/file]

Done
Done


Processing EPC folders:  12%|█▏        | 41/347 [05:12<43:10,  8.46s/file]

Done


Processing EPC folders:  12%|█▏        | 42/347 [05:20<42:06,  8.28s/file]

Done


Processing EPC folders:  12%|█▏        | 43/347 [05:29<42:52,  8.46s/file]

Done


Processing EPC folders:  13%|█▎        | 44/347 [05:35<38:59,  7.72s/file]

Done


Processing EPC folders:  13%|█▎        | 45/347 [05:54<55:58, 11.12s/file]

Done


Processing EPC folders:  13%|█▎        | 46/347 [06:07<57:54, 11.54s/file]

Done


Processing EPC folders:  14%|█▎        | 47/347 [06:18<57:11, 11.44s/file]

Done


Processing EPC folders:  14%|█▍        | 48/347 [06:28<54:56, 11.02s/file]

Done


Processing EPC folders:  14%|█▍        | 50/347 [06:50<50:14, 10.15s/file]  

Done
Done


Processing EPC folders:  15%|█▍        | 51/347 [07:03<54:49, 11.11s/file]

Done


Processing EPC folders:  15%|█▍        | 52/347 [07:11<50:03, 10.18s/file]

Done


Processing EPC folders:  15%|█▌        | 53/347 [07:20<48:52,  9.98s/file]

Done


Processing EPC folders:  16%|█▌        | 54/347 [07:31<49:34, 10.15s/file]

Done


Processing EPC folders:  16%|█▌        | 55/347 [07:45<55:13, 11.35s/file]

Done


Processing EPC folders:  16%|█▌        | 56/347 [07:57<55:49, 11.51s/file]

Done


Processing EPC folders:  17%|█▋        | 58/347 [08:08<40:21,  8.38s/file]

Done
Done


Processing EPC folders:  17%|█▋        | 60/347 [08:16<28:52,  6.03s/file]

Done


Processing EPC folders:  18%|█▊        | 61/347 [08:19<24:08,  5.06s/file]

Done


Processing EPC folders:  18%|█▊        | 62/347 [08:23<22:41,  4.78s/file]

Done


Processing EPC folders:  18%|█▊        | 63/347 [08:26<21:00,  4.44s/file]

Done


Processing EPC folders:  18%|█▊        | 64/347 [08:32<22:01,  4.67s/file]

Done


Processing EPC folders:  19%|█▊        | 65/347 [08:35<19:42,  4.19s/file]

Done


Processing EPC folders:  19%|█▉        | 66/347 [08:38<18:32,  3.96s/file]

Done


Processing EPC folders:  19%|█▉        | 67/347 [08:43<19:57,  4.28s/file]

Done


Processing EPC folders:  20%|█▉        | 68/347 [08:48<21:14,  4.57s/file]

Done


Processing EPC folders:  20%|█▉        | 69/347 [08:52<20:16,  4.38s/file]

Done


Processing EPC folders:  20%|██        | 70/347 [08:56<19:42,  4.27s/file]

Done


Processing EPC folders:  20%|██        | 71/347 [09:03<22:21,  4.86s/file]

Done


Processing EPC folders:  21%|██        | 72/347 [09:09<24:58,  5.45s/file]

Done


Processing EPC folders:  21%|██        | 73/347 [09:13<22:55,  5.02s/file]

Done


Processing EPC folders:  21%|██▏       | 74/347 [09:17<20:28,  4.50s/file]

Done
Done


Processing EPC folders:  22%|██▏       | 76/347 [09:24<18:20,  4.06s/file]

Done


Processing EPC folders:  22%|██▏       | 77/347 [09:27<16:50,  3.74s/file]

Done


Processing EPC folders:  22%|██▏       | 78/347 [09:31<17:25,  3.89s/file]

Done


Processing EPC folders:  23%|██▎       | 79/347 [09:37<18:54,  4.23s/file]

Done


Processing EPC folders:  23%|██▎       | 80/347 [09:41<19:25,  4.37s/file]

Done


Processing EPC folders:  23%|██▎       | 81/347 [09:46<19:23,  4.37s/file]

Done


Processing EPC folders:  24%|██▎       | 82/347 [09:49<18:24,  4.17s/file]

Done


Processing EPC folders:  24%|██▍       | 83/347 [09:54<18:50,  4.28s/file]

Done


Processing EPC folders:  24%|██▍       | 84/347 [09:58<18:28,  4.21s/file]

Done


Processing EPC folders:  24%|██▍       | 85/347 [10:02<18:36,  4.26s/file]

Done


Processing EPC folders:  25%|██▍       | 86/347 [10:07<18:52,  4.34s/file]

Done
Done


Processing EPC folders:  25%|██▌       | 87/347 [10:14<21:58,  5.07s/file]

Done


Processing EPC folders:  26%|██▌       | 89/347 [10:23<20:34,  4.79s/file]

Done


Processing EPC folders:  26%|██▌       | 90/347 [10:27<20:13,  4.72s/file]

Done


Processing EPC folders:  26%|██▌       | 91/347 [10:32<19:55,  4.67s/file]

Done


Processing EPC folders:  27%|██▋       | 92/347 [10:38<21:15,  5.00s/file]

Done


Processing EPC folders:  27%|██▋       | 93/347 [10:42<20:08,  4.76s/file]

Done


Processing EPC folders:  27%|██▋       | 94/347 [10:45<18:07,  4.30s/file]

Done


Processing EPC folders:  27%|██▋       | 95/347 [10:50<18:37,  4.44s/file]

Done
Done


Processing EPC folders:  28%|██▊       | 97/347 [10:59<17:59,  4.32s/file]

Done


Processing EPC folders:  28%|██▊       | 98/347 [11:03<18:00,  4.34s/file]

Done
Done


Processing EPC folders:  29%|██▊       | 99/347 [11:10<20:54,  5.06s/file]

Done


Processing EPC folders:  29%|██▉       | 100/347 [11:16<22:01,  5.35s/file]

Done


Processing EPC folders:  29%|██▉       | 102/347 [11:25<19:49,  4.85s/file]

Done


Processing EPC folders:  30%|██▉       | 103/347 [11:28<18:11,  4.47s/file]

Done
Done


Processing EPC folders:  30%|██▉       | 104/347 [11:34<20:06,  4.97s/file]

Done


Processing EPC folders:  31%|███       | 106/347 [11:47<22:01,  5.48s/file]

Done


Processing EPC folders:  31%|███       | 107/347 [11:51<20:26,  5.11s/file]

Done


Processing EPC folders:  31%|███       | 108/347 [11:54<18:03,  4.54s/file]

Done


Processing EPC folders:  31%|███▏      | 109/347 [11:58<16:50,  4.25s/file]

Done
Done


Processing EPC folders:  32%|███▏      | 111/347 [12:08<18:06,  4.61s/file]

Done
Done


Processing EPC folders:  33%|███▎      | 113/347 [12:18<18:32,  4.75s/file]

Done


Processing EPC folders:  33%|███▎      | 114/347 [12:22<17:21,  4.47s/file]

Done


Processing EPC folders:  33%|███▎      | 115/347 [12:27<17:59,  4.65s/file]

Done


Processing EPC folders:  33%|███▎      | 116/347 [12:32<17:55,  4.65s/file]

Done


Processing EPC folders:  34%|███▎      | 117/347 [12:36<17:02,  4.45s/file]

Done
Done


Processing EPC folders:  34%|███▍      | 118/347 [12:43<20:12,  5.29s/file]

Done


Processing EPC folders:  35%|███▍      | 120/347 [12:53<19:40,  5.20s/file]

Done


Processing EPC folders:  35%|███▍      | 121/347 [12:57<18:28,  4.90s/file]

Done


Processing EPC folders:  35%|███▌      | 122/347 [13:01<16:54,  4.51s/file]

Done


Processing EPC folders:  35%|███▌      | 123/347 [13:05<16:13,  4.35s/file]

Done
Done


Processing EPC folders:  36%|███▌      | 125/347 [13:17<18:50,  5.09s/file]

Done
Done


Processing EPC folders:  37%|███▋      | 127/347 [13:25<17:25,  4.75s/file]

Done


Processing EPC folders:  37%|███▋      | 128/347 [13:30<17:35,  4.82s/file]

Done


Processing EPC folders:  37%|███▋      | 129/347 [13:34<16:28,  4.54s/file]

Done
Done


Processing EPC folders:  38%|███▊      | 131/347 [13:45<17:15,  4.79s/file]

Done
Done


Processing EPC folders:  38%|███▊      | 133/347 [13:54<16:19,  4.58s/file]

Done


Processing EPC folders:  39%|███▊      | 134/347 [13:58<15:48,  4.45s/file]

Done


Processing EPC folders:  39%|███▉      | 135/347 [14:03<16:21,  4.63s/file]

Done
Done


Processing EPC folders:  39%|███▉      | 136/347 [14:10<18:37,  5.30s/file]

Done


Processing EPC folders:  40%|███▉      | 138/347 [14:19<17:39,  5.07s/file]

Done


Processing EPC folders:  40%|████      | 139/347 [14:24<16:43,  4.83s/file]

Done
Done


Processing EPC folders:  41%|████      | 141/347 [14:34<16:56,  4.94s/file]

Done


Processing EPC folders:  41%|████      | 142/347 [14:40<18:01,  5.28s/file]

Done
Done


Processing EPC folders:  41%|████      | 143/347 [14:46<17:59,  5.29s/file]

Done


Processing EPC folders:  42%|████▏     | 145/347 [14:57<17:56,  5.33s/file]

Done


Processing EPC folders:  42%|████▏     | 146/347 [15:01<16:58,  5.07s/file]

Done
Done


Processing EPC folders:  43%|████▎     | 148/347 [15:11<16:50,  5.08s/file]

Done


Processing EPC folders:  43%|████▎     | 149/347 [15:16<16:15,  4.93s/file]

Done


Processing EPC folders:  43%|████▎     | 150/347 [15:20<15:24,  4.69s/file]

Done
Done


Processing EPC folders:  44%|████▍     | 152/347 [15:30<15:35,  4.80s/file]

Done


Processing EPC folders:  44%|████▍     | 153/347 [15:36<16:40,  5.16s/file]

Done


Processing EPC folders:  44%|████▍     | 154/347 [15:40<15:41,  4.88s/file]

Done


Processing EPC folders:  45%|████▍     | 155/347 [15:44<14:19,  4.48s/file]

Done


Processing EPC folders:  45%|████▍     | 156/347 [15:48<13:57,  4.39s/file]

Done


Processing EPC folders:  45%|████▌     | 157/347 [15:52<14:01,  4.43s/file]

Done
Done


Processing EPC folders:  46%|████▌     | 159/347 [16:01<13:38,  4.35s/file]

Done


Processing EPC folders:  46%|████▌     | 160/347 [16:08<15:59,  5.13s/file]

Done


Processing EPC folders:  46%|████▋     | 161/347 [16:12<15:19,  4.94s/file]

Done


Processing EPC folders:  47%|████▋     | 162/347 [16:17<14:42,  4.77s/file]

Done


Processing EPC folders:  47%|████▋     | 163/347 [16:20<12:58,  4.23s/file]

Done


Processing EPC folders:  47%|████▋     | 164/347 [16:25<13:25,  4.40s/file]

Done


Processing EPC folders:  48%|████▊     | 165/347 [16:27<12:00,  3.96s/file]

Done


Processing EPC folders:  48%|████▊     | 166/347 [16:31<11:42,  3.88s/file]

Done
Done


Processing EPC folders:  48%|████▊     | 167/347 [16:38<14:02,  4.68s/file]

Done


Processing EPC folders:  49%|████▊     | 169/347 [16:47<14:03,  4.74s/file]

Done


Processing EPC folders:  49%|████▉     | 170/347 [16:51<13:28,  4.57s/file]

Done
Done


Processing EPC folders:  50%|████▉     | 172/347 [17:02<13:52,  4.75s/file]

Done
Done


Processing EPC folders:  50%|█████     | 174/347 [17:12<14:38,  5.08s/file]

Done


Processing EPC folders:  50%|█████     | 175/347 [17:17<13:59,  4.88s/file]

Done


Processing EPC folders:  51%|█████     | 176/347 [17:23<15:11,  5.33s/file]

Done


Processing EPC folders:  51%|█████     | 177/347 [17:28<14:38,  5.17s/file]

Done
Done


Processing EPC folders:  52%|█████▏    | 179/347 [17:41<16:07,  5.76s/file]

Done


Processing EPC folders:  52%|█████▏    | 180/347 [17:44<13:43,  4.93s/file]

Done


Processing EPC folders:  52%|█████▏    | 181/347 [17:47<12:14,  4.42s/file]

Done


Processing EPC folders:  52%|█████▏    | 182/347 [17:50<11:22,  4.14s/file]

Done


Processing EPC folders:  53%|█████▎    | 183/347 [17:54<10:59,  4.02s/file]

Done
Done


Processing EPC folders:  53%|█████▎    | 185/347 [18:04<11:35,  4.29s/file]

Done


Processing EPC folders:  54%|█████▎    | 186/347 [18:07<10:41,  3.99s/file]

Done


Processing EPC folders:  54%|█████▍    | 187/347 [18:11<10:23,  3.90s/file]

Done


Processing EPC folders:  54%|█████▍    | 188/347 [18:15<10:23,  3.92s/file]

Done
Done


Processing EPC folders:  55%|█████▍    | 190/347 [18:24<10:40,  4.08s/file]

Done


Processing EPC folders:  55%|█████▌    | 191/347 [18:27<09:48,  3.78s/file]

Done


Processing EPC folders:  55%|█████▌    | 192/347 [18:31<10:21,  4.01s/file]

Done


Processing EPC folders:  56%|█████▌    | 193/347 [18:35<09:59,  3.89s/file]

Done


Processing EPC folders:  56%|█████▌    | 194/347 [18:41<11:29,  4.51s/file]

Done
Done


Processing EPC folders:  56%|█████▋    | 196/347 [18:50<11:30,  4.57s/file]

Done


Processing EPC folders:  57%|█████▋    | 197/347 [18:55<11:33,  4.62s/file]

Done


Processing EPC folders:  57%|█████▋    | 198/347 [19:00<11:28,  4.62s/file]

Done


Processing EPC folders:  57%|█████▋    | 199/347 [19:04<11:36,  4.71s/file]

Done


Processing EPC folders:  58%|█████▊    | 200/347 [19:10<12:11,  4.98s/file]

Done
Done


Processing EPC folders:  58%|█████▊    | 202/347 [19:22<13:16,  5.49s/file]

Done
Done


Processing EPC folders:  59%|█████▉    | 204/347 [19:33<12:55,  5.43s/file]

Done


Processing EPC folders:  59%|█████▉    | 205/347 [19:38<12:30,  5.28s/file]

Done


Processing EPC folders:  59%|█████▉    | 206/347 [19:43<12:06,  5.15s/file]

Done
Done


Processing EPC folders:  60%|█████▉    | 207/347 [19:48<11:51,  5.08s/file]

Done


Processing EPC folders:  60%|██████    | 209/347 [19:57<11:23,  4.96s/file]

Done
Done


Processing EPC folders:  61%|██████    | 211/347 [20:07<10:44,  4.74s/file]

Done


Processing EPC folders:  61%|██████    | 212/347 [20:12<11:08,  4.95s/file]

Done


Processing EPC folders:  61%|██████▏   | 213/347 [20:16<10:27,  4.68s/file]

Done
Done


Processing EPC folders:  62%|██████▏   | 215/347 [20:26<10:13,  4.65s/file]

Done


Processing EPC folders:  62%|██████▏   | 216/347 [20:29<09:24,  4.31s/file]

Done


Processing EPC folders:  63%|██████▎   | 217/347 [20:33<09:20,  4.31s/file]

Done
Done


Processing EPC folders:  63%|██████▎   | 219/347 [20:44<10:17,  4.82s/file]

Done


Processing EPC folders:  63%|██████▎   | 220/347 [20:50<10:45,  5.08s/file]

Done


Processing EPC folders:  64%|██████▎   | 221/347 [20:54<09:50,  4.69s/file]

Done


Processing EPC folders:  64%|██████▍   | 222/347 [20:59<10:09,  4.87s/file]

Done


Processing EPC folders:  64%|██████▍   | 223/347 [21:03<09:39,  4.67s/file]

Done
Done


Processing EPC folders:  65%|██████▍   | 225/347 [21:14<10:00,  4.92s/file]

Done


Processing EPC folders:  65%|██████▌   | 226/347 [21:19<09:40,  4.80s/file]

Done


Processing EPC folders:  65%|██████▌   | 227/347 [21:23<09:14,  4.62s/file]

Done


Processing EPC folders:  66%|██████▌   | 228/347 [21:27<08:43,  4.40s/file]

Done


Processing EPC folders:  66%|██████▌   | 229/347 [21:32<09:01,  4.59s/file]

Done


Processing EPC folders:  66%|██████▋   | 230/347 [21:36<08:59,  4.61s/file]

Done


Processing EPC folders:  67%|██████▋   | 231/347 [21:41<08:47,  4.55s/file]

Done
Done


Processing EPC folders:  67%|██████▋   | 232/347 [21:46<09:05,  4.74s/file]

Done


Processing EPC folders:  67%|██████▋   | 234/347 [21:57<09:48,  5.21s/file]

Done
Done


Processing EPC folders:  68%|██████▊   | 236/347 [22:07<09:04,  4.91s/file]

Done


Processing EPC folders:  68%|██████▊   | 237/347 [22:14<10:08,  5.53s/file]

Done


Processing EPC folders:  69%|██████▊   | 238/347 [22:19<09:56,  5.47s/file]

Done


Processing EPC folders:  69%|██████▉   | 239/347 [22:23<09:13,  5.12s/file]

Done


Processing EPC folders:  69%|██████▉   | 240/347 [22:29<09:20,  5.24s/file]

Done
Done


Processing EPC folders:  70%|██████▉   | 242/347 [22:41<09:59,  5.71s/file]

Done


Processing EPC folders:  70%|███████   | 243/347 [22:45<08:56,  5.16s/file]

Done


Processing EPC folders:  70%|███████   | 244/347 [22:49<08:11,  4.78s/file]

Done


Processing EPC folders:  71%|███████   | 245/347 [22:53<07:33,  4.45s/file]

Done
Done


Processing EPC folders:  71%|███████   | 246/347 [22:57<07:36,  4.52s/file]

Done


Processing EPC folders:  71%|███████   | 247/347 [23:03<07:54,  4.74s/file]

Done


Processing EPC folders:  72%|███████▏  | 249/347 [23:13<08:04,  4.95s/file]

Done


Processing EPC folders:  72%|███████▏  | 250/347 [23:18<07:55,  4.90s/file]

Done
Done


Processing EPC folders:  73%|███████▎  | 252/347 [23:27<07:30,  4.75s/file]

Done
Done


Processing EPC folders:  73%|███████▎  | 253/347 [23:36<09:28,  6.05s/file]

Done


Processing EPC folders:  73%|███████▎  | 254/347 [23:44<10:02,  6.47s/file]

Done


Processing EPC folders:  73%|███████▎  | 255/347 [23:49<09:32,  6.23s/file]

Done


Processing EPC folders:  74%|███████▍  | 257/347 [24:05<10:20,  6.90s/file]

Done
Done


Processing EPC folders:  75%|███████▍  | 259/347 [24:33<14:32,  9.91s/file]

Done
Done


Processing EPC folders:  75%|███████▌  | 261/347 [24:51<14:02,  9.80s/file]

Done
Done


Processing EPC folders:  76%|███████▌  | 262/347 [25:00<13:24,  9.47s/file]

Done


Processing EPC folders:  76%|███████▌  | 264/347 [25:16<11:49,  8.55s/file]

Done
Done


Processing EPC folders:  76%|███████▋  | 265/347 [25:25<12:10,  8.91s/file]

Done


Processing EPC folders:  77%|███████▋  | 266/347 [25:32<10:53,  8.07s/file]

Done


Processing EPC folders:  77%|███████▋  | 267/347 [25:50<14:53, 11.17s/file]

Done


Processing EPC folders:  77%|███████▋  | 268/347 [25:57<12:55,  9.82s/file]

Done


Processing EPC folders:  78%|███████▊  | 269/347 [26:06<12:28,  9.59s/file]

Done


Processing EPC folders:  78%|███████▊  | 270/347 [26:16<12:46,  9.95s/file]

Done


Processing EPC folders:  78%|███████▊  | 271/347 [26:25<11:55,  9.42s/file]

Done


Processing EPC folders:  79%|███████▊  | 273/347 [26:43<11:31,  9.34s/file]

Done
Done


Processing EPC folders:  79%|███████▉  | 274/347 [26:59<13:49, 11.37s/file]

Done


Processing EPC folders:  79%|███████▉  | 275/347 [27:10<13:15, 11.05s/file]

Done


Processing EPC folders:  80%|███████▉  | 276/347 [27:18<11:58, 10.12s/file]

Done


Processing EPC folders:  80%|███████▉  | 277/347 [27:24<10:23,  8.90s/file]

Done


Processing EPC folders:  80%|████████  | 278/347 [27:35<10:58,  9.54s/file]

Done


Processing EPC folders:  80%|████████  | 279/347 [28:08<18:54, 16.68s/file]

Done


Processing EPC folders:  81%|████████  | 280/347 [28:20<17:08, 15.35s/file]

Done


Processing EPC folders:  81%|████████  | 281/347 [28:30<14:57, 13.60s/file]

Done


Processing EPC folders:  81%|████████▏ | 282/347 [28:40<13:31, 12.48s/file]

Done


Processing EPC folders:  82%|████████▏ | 283/347 [28:47<11:36, 10.88s/file]

Done


Processing EPC folders:  82%|████████▏ | 284/347 [28:55<10:37, 10.13s/file]

Done


Processing EPC folders:  82%|████████▏ | 285/347 [29:03<09:51,  9.54s/file]

Done


Processing EPC folders:  82%|████████▏ | 286/347 [29:20<11:53, 11.70s/file]

Done


Processing EPC folders:  83%|████████▎ | 287/347 [29:28<10:29, 10.49s/file]

Done


Processing EPC folders:  83%|████████▎ | 288/347 [29:40<10:50, 11.02s/file]

Done


Processing EPC folders:  83%|████████▎ | 289/347 [30:05<14:47, 15.30s/file]

Done


Processing EPC folders:  84%|████████▎ | 290/347 [30:19<13:54, 14.65s/file]

Done


Processing EPC folders:  84%|████████▍ | 292/347 [30:28<08:31,  9.31s/file]

Done
Done


Processing EPC folders:  84%|████████▍ | 293/347 [30:35<07:41,  8.55s/file]

Done


Processing EPC folders:  85%|████████▍ | 294/347 [30:47<08:29,  9.61s/file]

Done


Processing EPC folders:  85%|████████▌ | 296/347 [31:04<07:50,  9.22s/file]

Done
Done


Processing EPC folders:  86%|████████▌ | 298/347 [31:22<07:27,  9.13s/file]

Done
Done


Processing EPC folders:  86%|████████▌ | 299/347 [31:34<07:55,  9.90s/file]

Done


Processing EPC folders:  86%|████████▋ | 300/347 [31:46<08:07, 10.37s/file]

Done


Processing EPC folders:  87%|████████▋ | 301/347 [31:55<07:47, 10.17s/file]

Done


Processing EPC folders:  87%|████████▋ | 303/347 [32:15<07:20, 10.02s/file]

Done


Processing EPC folders:  88%|████████▊ | 304/347 [32:23<06:48,  9.50s/file]

Done
Done


Processing EPC folders:  88%|████████▊ | 305/347 [32:33<06:39,  9.50s/file]

Done


Processing EPC folders:  88%|████████▊ | 306/347 [32:41<06:08,  8.98s/file]

Done


Processing EPC folders:  88%|████████▊ | 307/347 [32:48<05:41,  8.54s/file]

Done


Processing EPC folders:  89%|████████▉ | 308/347 [32:57<05:35,  8.59s/file]

Done


Processing EPC folders:  89%|████████▉ | 310/347 [33:15<05:26,  8.84s/file]

Done


Processing EPC folders:  90%|████████▉ | 311/347 [33:22<05:07,  8.53s/file]

Done


Processing EPC folders:  90%|████████▉ | 312/347 [33:29<04:34,  7.85s/file]

Done
Done


Processing EPC folders:  90%|█████████ | 313/347 [33:42<05:21,  9.46s/file]

Done


Processing EPC folders:  90%|█████████ | 314/347 [33:53<05:25,  9.87s/file]

Done


Processing EPC folders:  91%|█████████ | 315/347 [34:00<04:48,  9.00s/file]

Done


Processing EPC folders:  91%|█████████ | 316/347 [34:11<04:57,  9.60s/file]

Done


Processing EPC folders:  92%|█████████▏| 318/347 [34:25<04:02,  8.36s/file]

Done


Processing EPC folders:  92%|█████████▏| 319/347 [34:36<04:14,  9.08s/file]

Done
Done


Processing EPC folders:  93%|█████████▎| 321/347 [34:55<04:07,  9.51s/file]

Done
Done


Processing EPC folders:  93%|█████████▎| 322/347 [35:04<03:54,  9.36s/file]

Done


Processing EPC folders:  93%|█████████▎| 324/347 [35:27<03:57, 10.34s/file]

Done


Processing EPC folders:  94%|█████████▎| 325/347 [35:31<03:04,  8.37s/file]

Done


Processing EPC folders:  94%|█████████▍| 326/347 [35:36<02:33,  7.29s/file]

Done


Processing EPC folders:  94%|█████████▍| 327/347 [35:42<02:17,  6.89s/file]

Done
Done


Processing EPC folders:  95%|█████████▍| 329/347 [35:51<01:45,  5.89s/file]

Done


Processing EPC folders:  95%|█████████▌| 330/347 [35:56<01:34,  5.53s/file]

Done


Processing EPC folders:  95%|█████████▌| 331/347 [36:00<01:20,  5.01s/file]

Done


Processing EPC folders:  96%|█████████▌| 332/347 [36:05<01:14,  4.99s/file]

Done
Done


Processing EPC folders:  96%|█████████▋| 334/347 [36:20<01:21,  6.27s/file]

Done


Processing EPC folders:  97%|█████████▋| 335/347 [36:25<01:12,  6.05s/file]

Done


Processing EPC folders:  97%|█████████▋| 336/347 [36:30<01:03,  5.76s/file]

Done
Done


Processing EPC folders:  97%|█████████▋| 337/347 [36:36<00:56,  5.64s/file]

Done


Processing EPC folders:  97%|█████████▋| 338/347 [36:48<01:08,  7.59s/file]

Done


Processing EPC folders:  98%|█████████▊| 340/347 [37:01<00:49,  7.06s/file]

Done


Processing EPC folders:  98%|█████████▊| 341/347 [37:05<00:35,  5.96s/file]

Done


Processing EPC folders:  99%|█████████▊| 342/347 [37:09<00:27,  5.42s/file]

Done


Processing EPC folders:  99%|█████████▉| 343/347 [37:13<00:20,  5.14s/file]

Done
Done


Processing EPC folders:  99%|█████████▉| 345/347 [37:24<00:10,  5.28s/file]

Done


Processing EPC folders: 100%|█████████▉| 346/347 [37:28<00:04,  4.74s/file]

Done


Processing EPC folders: 100%|██████████| 347/347 [37:30<00:00,  6.48s/file]

Done





In [29]:
import os
import pandas as pd

matched_dir = 'matched_epc_rows'

parquet_files = [
    os.path.join(matched_dir, f)
    for f in os.listdir(matched_dir)
    if f.endswith('.parquet')
]

matched_dfs = [pd.read_parquet(f) for f in parquet_files]

if matched_dfs:
    total_matched = pd.concat(matched_dfs, ignore_index=True)
    print(f"✅ Loaded total matched rows: {len(total_matched):,}")
    display(total_matched.head())
else:
    print("⚠ No parquet files found in matched_epc_rows.")

✅ Loaded total matched rows: 4,178,789


Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,POTENTIAL_ENERGY_EFFICIENCY,...,CONSTRUCTION_AGE_BAND,LODGEMENT_DATETIME,TENURE,FIXED_LIGHTING_OUTLETS_COUNT,LOW_ENERGY_FIXED_LIGHT_COUNT,UPRN,UPRN_SOURCE,REPORT_TYPE,postcode_clean,match_key
0,1451865729102016060917325243560118,"54, Chatham Road",,,TS24 8QQ,134825478,D,B,61,85,...,England and Wales: 1930-1949,2016-06-09 17:32:52,owner-occupied,,,100110000000.0,Address Matched,100,TS248QQ,54 CHATHAM ROAD TS248QQ
1,978835396212019052007505697910817,"19, Wainwright Walk",,,TS25 1XA,8777671178,D,C,65,79,...,England and Wales: 1967-1975,2019-05-20 07:50:56,owner-occupied,,,100110000000.0,Address Matched,100,TS251XA,19 WAINWRIGHT WALK TS251XA
2,1341712688132019081910310554978408,"116, Milton Road",,,TS26 8DX,8864447378,D,C,56,77,...,England and Wales: 1900-1929,2019-08-19 10:31:05,rental (private),,,100110000000.0,Address Matched,100,TS268DX,116 MILTON ROAD TS268DX
3,1186183659802014081111373821640768,"18, Brafferton Street",,,TS26 8LH,4895056278,D,B,57,90,...,England and Wales: 1900-1929,2014-08-11 11:37:38,rental (private),7.0,0.0,100110000000.0,Address Matched,100,TS268LH,18 BRAFFERTON STREET TS268LH
4,1566486539802017081008512757330418,"132, Kesteven Road",,,TS25 2NJ,862833578,C,B,70,85,...,England and Wales: 1967-1975,2017-08-10 08:51:27,owner-occupied,,,100110000000.0,Address Matched,100,TS252NJ,132 KESTEVEN ROAD TS252NJ


In [31]:
dfmax = total_matched[total_matched['CURRENT_ENERGY_EFFICIENCY'] > 100]
dfmax.shape

(1772, 95)

In [40]:
dfmax = dfmax.sort_values('CURRENT_ENERGY_EFFICIENCY')
dfmax.tail()

Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,POTENTIAL_ENERGY_EFFICIENCY,...,CONSTRUCTION_AGE_BAND,LODGEMENT_DATETIME,TENURE,FIXED_LIGHTING_OUTLETS_COUNT,LOW_ENERGY_FIXED_LIGHT_COUNT,UPRN,UPRN_SOURCE,REPORT_TYPE,postcode_clean,match_key
2727841,801830106732013090919575667268109,"69, Monton Street",,,M14 4LS,967139968,A,A,143,144,...,England and Wales: 2007 onwards,2013-09-09 19:57:56,rental (private),2.0,2.0,77115770.0,Address Matched,100,M144LS,69 MONTON STREET M144LS
1099044,130401e5aa8581948660c7249f577d77ffdb27a14e5694...,Hatchlands,Rectory Lane,Charlton Musgrove,BA9 8ET,5066324278,A,A,144,158,...,England and Wales: 1967-1975,2024-05-17 06:00:58,Owner-occupied,30.0,,30000740.0,Energy Assessor,100,BA98ET,HATCHLANDS RECTORY LANE BA98ET
2002614,1134005089702019013119395721210628,The Island,Yarmouth Road,,NR7 0HE,3752382278,A,A,157,170,...,England and Wales: 1950-1966,2019-01-31 19:39:57,owner-occupied,,,200004300000.0,Address Matched,100,NR70HE,THE ISLAND YARMOUTH ROAD NR70HE
1463833,688824189222013102413461480348167,"14, Gainsborough Close",,,CO16 8DW,6812880968,A,A,161,166,...,England and Wales: 1983-1990,2013-10-24 13:46:14,owner-occupied,25.0,20.0,100090600000.0,Address Matched,100,CO168DW,14 GAINSBOROUGH CLOSE CO168DW
1349245,910920099502013041714444003679368,High Cross Hall,Station Road,Rotherfield,TN6 3HR,9554896078,A,A,165,165,...,NO DATA!,2013-04-17 14:44:40,NO DATA!,52.0,52.0,10024380000.0,Address Matched,101,TN63HR,HIGH CROSS HALL STATION ROAD TN63HR


In [45]:
unique_vals = total_matched['BUILDING_REFERENCE_NUMBER'].unique()
Unique_df = total_matched[total_matched['BUILDING_REFERENCE_NUMBER'].isin(unique_vals)]
Unique_df.shape

(4178789, 95)

In [46]:
df.columns

Index(['Transfer Date', 'Postcode', 'primary_addressable_object_name',
       'secondary_addressable_object_name', 'street', 'Town/City', 'District',
       'County', 'Price (Thousands)', 'Freehold Tenure', 'Leasehold Tenure',
       'Old Build', 'New Build', 'Detached', 'Flat', 'Other Property Type',
       'Semi-detached', 'Terraced', 'Year', 'Month', 'Quarter',
       'Day of the Week', 'Postcode Area', 'Postcode District',
       'Area Code Average Price (Thousands)',
       'Town/City Average Price (Thousands)',
       'District Average Price (Thousands)',
       'County Average Price (Thousands)', 'LAT', 'LONG', 'OSEAST1M',
       'OSNRTH1M', 'postcode_clean', 'match_key'],
      dtype='object')

In [47]:
trim_price = df.drop(columns=['primary_addressable_object_name', 'secondary_addressable_object_name', 'street', 'District', 'Town/City'], axis=1)
trim_price.columns

Index(['Transfer Date', 'Postcode', 'County', 'Price (Thousands)',
       'Freehold Tenure', 'Leasehold Tenure', 'Old Build', 'New Build',
       'Detached', 'Flat', 'Other Property Type', 'Semi-detached', 'Terraced',
       'Year', 'Month', 'Quarter', 'Day of the Week', 'Postcode Area',
       'Postcode District', 'Area Code Average Price (Thousands)',
       'Town/City Average Price (Thousands)',
       'District Average Price (Thousands)',
       'County Average Price (Thousands)', 'LAT', 'LONG', 'OSEAST1M',
       'OSNRTH1M', 'postcode_clean', 'match_key'],
      dtype='object')

In [48]:
 total_matched.columns

Index(['LMK_KEY', 'ADDRESS1', 'ADDRESS2', 'ADDRESS3', 'POSTCODE',
       'BUILDING_REFERENCE_NUMBER', 'CURRENT_ENERGY_RATING',
       'POTENTIAL_ENERGY_RATING', 'CURRENT_ENERGY_EFFICIENCY',
       'POTENTIAL_ENERGY_EFFICIENCY', 'PROPERTY_TYPE', 'BUILT_FORM',
       'INSPECTION_DATE', 'LOCAL_AUTHORITY', 'CONSTITUENCY', 'COUNTY',
       'LODGEMENT_DATE', 'TRANSACTION_TYPE', 'ENVIRONMENT_IMPACT_CURRENT',
       'ENVIRONMENT_IMPACT_POTENTIAL', 'ENERGY_CONSUMPTION_CURRENT',
       'ENERGY_CONSUMPTION_POTENTIAL', 'CO2_EMISSIONS_CURRENT',
       'CO2_EMISS_CURR_PER_FLOOR_AREA', 'CO2_EMISSIONS_POTENTIAL',
       'LIGHTING_COST_CURRENT', 'LIGHTING_COST_POTENTIAL',
       'HEATING_COST_CURRENT', 'HEATING_COST_POTENTIAL',
       'HOT_WATER_COST_CURRENT', 'HOT_WATER_COST_POTENTIAL',
       'TOTAL_FLOOR_AREA', 'ENERGY_TARIFF', 'MAINS_GAS_FLAG', 'FLOOR_LEVEL',
       'FLAT_TOP_STOREY', 'FLAT_STOREY_COUNT', 'MAIN_HEATING_CONTROLS',
       'MULTI_GLAZE_PROPORTION', 'GLAZED_TYPE', 'GLAZED_AREA',
      

In [49]:
matched_trimmed = total_matched.drop(
    columns=[
        'LMK_KEY', 'ADDRESS3', 'BUILDING_REFERENCE_NUMBER', 'INSPECTION_DATE',
        'LOCAL_AUTHORITY', 'CONSTITUENCY', 'COUNTY', 'LODGEMENT_DATE',
        'TRANSACTION_TYPE', 'ENVIRONMENT_IMPACT_CURRENT', 'ENVIRONMENT_IMPACT_POTENTIAL',
        'ENERGY_CONSUMPTION_CURRENT', 'ENERGY_CONSUMPTION_POTENTIAL',
        'CO2_EMISSIONS_CURRENT', 'CO2_EMISS_CURR_PER_FLOOR_AREA',
        'CO2_EMISSIONS_POTENTIAL', 'LIGHTING_COST_CURRENT', 'LIGHTING_COST_POTENTIAL',
        'HEATING_COST_CURRENT', 'HEATING_COST_POTENTIAL', 'HOT_WATER_COST_CURRENT',
        'HOT_WATER_COST_POTENTIAL', 'ENERGY_TARIFF', 'FLOOR_DESCRIPTION',
        'WINDOWS_DESCRIPTION', 'WALLS_DESCRIPTION', 'SECONDHEAT_DESCRIPTION',
        'ROOF_DESCRIPTION', 'MAINHEAT_DESCRIPTION', 'MAINHEATCONT_DESCRIPTION',
        'LIGHTING_DESCRIPTION', 'HEAT_LOSS_CORRIDOR', 'UNHEATED_CORRIDOR_LENGTH',
        'PHOTO_SUPPLY', 'MECHANICAL_VENTILATION', 'ADDRESS', 'LOCAL_AUTHORITY_LABEL',
        'CONSTITUENCY_LABEL', 'POSTTOWN', 'LODGEMENT_DATETIME', 'TENURE',
        'FIXED_LIGHTING_OUTLETS_COUNT', 'LOW_ENERGY_FIXED_LIGHT_COUNT',
        'UPRN', 'UPRN_SOURCE', 'REPORT_TYPE', 'postcode_clean'
    ]
)

In [50]:
merge_df = matched_trimmed.merge(trim_price, how='left', on=['match_key'])

In [51]:
merge_df.shape

(4824809, 76)

In [52]:
merge_df.to_csv('merge.csv', index=False)

In [53]:
summary_df = pd.DataFrame({
    'Column': merge_df.columns,
    'Dtype': merge_df.dtypes.values,
    'Missing': merge_df.isna().sum().values,
    'Percent': (((merge_df.isna().sum().values)/4864524)*100).round(1)
})

print(summary_df)

                     Column    Dtype  Missing  Percent
0                  ADDRESS1   object        1      0.0
1                  ADDRESS2   object  4506651     92.6
2                  POSTCODE   object        0      0.0
3     CURRENT_ENERGY_RATING   object        0      0.0
4   POTENTIAL_ENERGY_RATING   object        0      0.0
..                      ...      ...      ...      ...
71                      LAT  float64       21      0.0
72                     LONG  float64       21      0.0
73                 OSEAST1M  float64       21      0.0
74                 OSNRTH1M  float64       21      0.0
75           postcode_clean   object        0      0.0

[76 rows x 4 columns]


In [54]:
summary_df.to_csv('column_summary.csv', index=False)

In [55]:
save=merge_df

In [56]:
drop_cols = ['ADDRESS2', 'FLAT_TOP_STOREY', 'FLAT_STOREY_COUNT','HOT_WATER_ENV_EFF', 'FLOOR_ENERGY_EFF', 'WALLS_ENV_EFF', 'SHEATING_ENERGY_EFF','FLOOR_ENV_EFF','SHEATING_ENV_EFF']
merge_df.drop(columns=drop_cols, inplace=True, errors='ignore')

mode_impute_cols = ['MAINS_GAS_FLAG', 'MAIN_HEATING_CONTROLS','GLAZED_TYPE', 'LOW_ENERGY_LIGHTING','SOLAR_WATER_HEATING_FLAG', 'MAIN_FUEL', 'GLAZED_AREA', 'ROOF_ENERGY_EFF', 'ROOF_ENV_EFF']

for col in mode_impute_cols:
    if col in merge_df.columns:
        mode_series = merge_df[col].mode(dropna=True)
        if not mode_series.empty:
            merge_df[col].fillna(mode_series[0], inplace=True)

median_impute_cols = ['MULTI_GLAZE_PROPORTION', '', 'EXTENSION_COUNT', 'NUMBER_HABITABLE_ROOMS', 'NUMBER_HEATED_ROOMS', 'FLOOR_HEIGHT']

for col in median_impute_cols:
    if col in merge_df.columns:
        merge_df[col].fillna(merge_df[col].median(), inplace=True)

zero_fill_cols = ['EXTENSION_COUNT', 'WIND_TURBINE_COUNT','NUMBER_OPEN_FIREPLACES']
for col in zero_fill_cols:
    if col in merge_df.columns:
        merge_df[col].fillna(0, inplace=True)

unknown_bin_cols = ['FLOOR_LEVEL']
for col in unknown_bin_cols:
    if col in merge_df.columns:
        merge_df[col] = merge_df[col].fillna('Unknown')

critical_cols = ['BUILT_FORM', 'LAT', 'LONG', 'OSEAST1M', 'OSNRTH1M','LIGHTING_ENV_EFF', 'LIGHTING_ENERGY_EFF', 'MAINHEATC_ENV_EFF','MAINHEATC_ENERGY_EFF', 'MAINHEAT_ENV_EFF', 'MAINHEAT_ENERGY_EFF','WALLS_ENERGY_EFF',  'WINDOWS_ENERGY_EFF','WINDOWS_ENV_EFF', 'HOT_WATER_ENERGY_EFF', 'HOTWATER_DESCRIPTION', 'ADDRESS1']

merge_df.dropna(subset=[col for col in critical_cols if col in merge_df.columns], inplace=True)

missing_summary = merge_df.isnull().sum()
print("Remaining missing values:\n", missing_summary[missing_summary > 0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merge_df[col].fillna(mode_series[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merge_df[col].fillna(merge_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

Remaining missing values:
 CONSTRUCTION_AGE_BAND    43125
dtype: int64


In [57]:
merge_df['CONSTRUCTION_AGE_BAND'] = merge_df['CONSTRUCTION_AGE_BAND'].fillna('Unknown')
merge_df.shape

(4818318, 67)

In [58]:
turn_binary = merge_df
turn_binary.shape

(4818318, 67)

In [59]:
summary_df1 = pd.DataFrame({
    'Column': turn_binary.columns,
    'Dtype': turn_binary.dtypes.values,
    'Example': turn_binary.iloc[0].values

})

summary_df1.to_csv('dtype_inq.csv', index=False)

In [60]:
turn_binary = turn_binary.drop(columns=[
    'ADDRESS1','POSTCODE','CURRENT_ENERGY_RATING','POTENTIAL_ENERGY_RATING',
    'BUILT_FORM','FLOOR_LEVEL','GLAZED_TYPE','GLAZED_AREA',
    'MAIN_FUEL','PROPERTY_TYPE','match_key','County',
    'postcode_clean','Postcode District','Postcode Area'
])

turn_binary['MAINS_GAS_FLAG'] = turn_binary['MAINS_GAS_FLAG'].map({'Y': 1, 'N': 0})
turn_binary['SOLAR_WATER_HEATING_FLAG'] = turn_binary['SOLAR_WATER_HEATING_FLAG'].map({'Y': 1, 'N': 0})

scale = ['HOT_WATER_ENERGY_EFF', 'WINDOWS_ENERGY_EFF','WINDOWS_ENV_EFF', 'WALLS_ENERGY_EFF', 'ROOF_ENERGY_EFF','ROOF_ENV_EFF', 'MAINHEAT_ENERGY_EFF', 'MAINHEAT_ENV_EFF','MAINHEATC_ENERGY_EFF', 'MAINHEATC_ENV_EFF', 'LIGHTING_ENERGY_EFF','LIGHTING_ENV_EFF']

for i in scale:
    turn_binary[i] = turn_binary[i].map({'Very Poor': 0, 'Poor': 1, 'Average': 2, 'Good': 3,'Very Good': 4})

In [62]:
print(turn_binary['CONSTRUCTION_AGE_BAND'].unique())

['England and Wales: 1930-1949' 'England and Wales: 1967-1975'
 'England and Wales: 1900-1929' 'NO DATA!' 'England and Wales: 1950-1966'
 'England and Wales: before 1900' 'England and Wales: 2007 onwards'
 'England and Wales: 1983-1990' 'England and Wales: 1976-1982' 'INVALID!'
 '2019' 'England and Wales: 1996-2002' 'England and Wales: 2003-2006'
 '2020' 'England and Wales: 1991-1995' 'Unknown' '2021'
 'England and Wales: 2007-2011' '2017' '2022'
 'England and Wales: 2012 onwards' '2023' '2024' '2016' '2025'
 'England and Wales: 2012-2021' '2018' '2014' '2015' '2012' '1950' '1965'
 'England and Wales: 2022 onwards' '1783' '1920' '1900' '2008' '2002'
 '1999' '1800' '2009' '1970' '2010' '1890' '2011' '1889' '2013' '1907'
 '2007' '2000' '1888' '1960' '1930' '1935' '1850' '1947' '1975' '2031'
 '1958' '1910' '1881' '1700' '2033' '2006' '1967' '1940' '1954' '1820'
 '2005' '1915' '1927' '2003' '2042' '1983' '1905' '1870']


In [64]:
import numpy as np

def clean_age_band(val):
    if pd.isna(val):
        return np.nan
    val = str(val).strip()

    if val.startswith('England and Wales:'):
        val = val.replace('England and Wales:', '').strip()

    if val.isdigit():
        year = int(val)
        if year <= 1929:
            return '1900-1929'
        elif year <= 1949:
            return '1930-1949'
        elif year <= 1966:
            return '1950-1966'
        elif year <= 1975:
            return '1967-1975'
        elif year <= 1982:
            return '1976-1982'
        elif year <= 1990:
            return '1983-1990'
        elif year <= 1995:
            return '1991-1995'
        elif year <= 2002:
            return '1996-2002'
        elif year <= 2006:
            return '2003-2006'
        elif year <= 2011:
            return '2007-2011'
        elif year <= 2021:
            return '2012-2021'
        else:
            return '2022 onwards'

    if val in ['NO DATA!', 'INVALID!', 'None']:
        return np.nan

    return val

turn_binary['CONSTRUCTION_AGE_BAND_CLEAN'] = turn_binary['CONSTRUCTION_AGE_BAND'].apply(clean_age_band)

age_band_map = {
    'before 1900': 0,
    '1900-1929': 1,
    '1930-1949': 2,
    '1950-1966': 3,
    '1967-1975': 4,
    '1976-1982': 5,
    '1983-1990': 6,
    '1991-1995': 7,
    '1996-2002': 8,
    '2003-2006': 9,
    '2007-2011': 10,
    '2012-2021': 11,
    '2022 onwards': 12
}

turn_binary['CONSTRUCTION_AGE_BAND_ENCODED'] = turn_binary['CONSTRUCTION_AGE_BAND_CLEAN'].map(age_band_map)

In [65]:
turn_binary.dtypes

CURRENT_ENERGY_EFFICIENCY                int64
POTENTIAL_ENERGY_EFFICIENCY              int64
TOTAL_FLOOR_AREA                       float64
MAINS_GAS_FLAG                           int64
MAIN_HEATING_CONTROLS                   object
MULTI_GLAZE_PROPORTION                 float64
EXTENSION_COUNT                        float64
NUMBER_HABITABLE_ROOMS                 float64
NUMBER_HEATED_ROOMS                    float64
LOW_ENERGY_LIGHTING                    float64
NUMBER_OPEN_FIREPLACES                 float64
HOTWATER_DESCRIPTION                    object
HOT_WATER_ENERGY_EFF                     int64
WINDOWS_ENERGY_EFF                       int64
WINDOWS_ENV_EFF                          int64
WALLS_ENERGY_EFF                         int64
ROOF_ENERGY_EFF                          int64
ROOF_ENV_EFF                             int64
MAINHEAT_ENERGY_EFF                      int64
MAINHEAT_ENV_EFF                         int64
MAINHEATC_ENERGY_EFF                     int64
MAINHEATC_ENV

In [66]:
conv = turn_binary[turn_binary['MAIN_HEATING_CONTROLS'] != '%%MAINHEATCONTROL%%']

In [67]:
conv['MAIN_HEATING_CONTROLS'] = conv['MAIN_HEATING_CONTROLS'].astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conv['MAIN_HEATING_CONTROLS'] = conv['MAIN_HEATING_CONTROLS'].astype('float64')


In [68]:
conv.dtypes

CURRENT_ENERGY_EFFICIENCY                int64
POTENTIAL_ENERGY_EFFICIENCY              int64
TOTAL_FLOOR_AREA                       float64
MAINS_GAS_FLAG                           int64
MAIN_HEATING_CONTROLS                  float64
MULTI_GLAZE_PROPORTION                 float64
EXTENSION_COUNT                        float64
NUMBER_HABITABLE_ROOMS                 float64
NUMBER_HEATED_ROOMS                    float64
LOW_ENERGY_LIGHTING                    float64
NUMBER_OPEN_FIREPLACES                 float64
HOTWATER_DESCRIPTION                    object
HOT_WATER_ENERGY_EFF                     int64
WINDOWS_ENERGY_EFF                       int64
WINDOWS_ENV_EFF                          int64
WALLS_ENERGY_EFF                         int64
ROOF_ENERGY_EFF                          int64
ROOF_ENV_EFF                             int64
MAINHEAT_ENERGY_EFF                      int64
MAINHEAT_ENV_EFF                         int64
MAINHEATC_ENERGY_EFF                     int64
MAINHEATC_ENV

In [69]:
conv = conv.drop(columns='HOTWATER_DESCRIPTION')

In [70]:
conv = conv.drop(columns=['CONSTRUCTION_AGE_BAND_CLEAN','CONSTRUCTION_AGE_BAND'])

In [71]:
conv.shape

(4818234, 51)

In [72]:
epc_rename_map = {
    'CURRENT_ENERGY_EFFICIENCY': 'Current Energy Efficiency (EPC Score)',
    'POTENTIAL_ENERGY_EFFICIENCY': 'Potential Energy Efficiency (EPC Score)',
    'TOTAL_FLOOR_AREA': 'Total Floor Area (m²)',
    'MAINS_GAS_FLAG': 'Has Mains Gas',
    'MAIN_HEATING_CONTROLS': 'Main Heating Controls',
    'MULTI_GLAZE_PROPORTION': 'Proportion of Multi-Glazing (%)',
    'EXTENSION_COUNT': 'Number of Extensions',
    'NUMBER_HABITABLE_ROOMS': 'Number of Habitable Rooms',
    'NUMBER_HEATED_ROOMS': 'Number of Heated Rooms',
    'LOW_ENERGY_LIGHTING': 'Low Energy Lighting (%)',
    'NUMBER_OPEN_FIREPLACES': 'Number of Open Fireplaces',
    'HOT_WATER_ENERGY_EFF': 'Hot Water Energy Efficiency',
    'WINDOWS_ENERGY_EFF': 'Windows Energy Efficiency',
    'WINDOWS_ENV_EFF': 'Windows Environmental Impact',
    'WALLS_ENERGY_EFF': 'Walls Energy Efficiency',
    'ROOF_ENERGY_EFF': 'Roof Energy Efficiency',
    'ROOF_ENV_EFF': 'Roof Environmental Impact',
    'MAINHEAT_ENERGY_EFF': 'Main Heating Energy Efficiency',
    'MAINHEAT_ENV_EFF': 'Main Heating Environmental Impact',
    'MAINHEATC_ENERGY_EFF': 'Main Heating Controls Energy Efficiency',
    'MAINHEATC_ENV_EFF': 'Main Heating Controls Environmental Impact',
    'LIGHTING_ENERGY_EFF': 'Lighting Energy Efficiency',
    'LIGHTING_ENV_EFF': 'Lighting Environmental Impact',
    'WIND_TURBINE_COUNT': 'Number of Wind Turbines',
    'FLOOR_HEIGHT': 'Floor Height (m)',
    'SOLAR_WATER_HEATING_FLAG': 'Has Solar Water Heating'
}
conv = conv.rename(columns=epc_rename_map)

In [74]:
conv = conv.drop(columns=['Transfer Date','Postcode'])

In [None]:
conv.to_csv('EPC_Fin.csv', index=False)