# Join EPC, deprivation and flood risk data
EPC data: https://epc.opendatacommunities.org/domestic/search  
Deprivation data: https://imd-by-postcode.opendatacommunities.org/imd/2019  
Flood risk data: https://www.getthedata.com/open-flood-risk-by-postcode  
  
Use the postcode previously placed into the Zoopla dataset to join EPC, deprivation and flood risk data. Since EPC is at the property (rather than postcode) level, use the average EPC for that postcode, road and property type

In [1]:
import os
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_columns', 100)

### Read in Zoopla, EPC, deprivation and flood risk files
Concatenate the files for the separate towns

In [2]:
DATA_RAW_FOLDER = os.path.join('data', 'raw')
DATA_PROCESSED_FOLDER = os.path.join('data', 'processed')
SAVE_FOLDER = DATA_PROCESSED_FOLDER

In [3]:
zoopla_df_filename = 'zoopla_properties_with_postcode.csv'
zoopla_df = pd.read_csv(os.path.join(DATA_PROCESSED_FOLDER, zoopla_df_filename), dtype=str)

epc_filename_nuneaton = 'epcs_nuneaton.csv'
epc_filename_hinckley = 'epcs_hinckley.csv'
epc_df_nuneaton = pd.read_csv(os.path.join(DATA_RAW_FOLDER, epc_filename_nuneaton), dtype=str)
epc_df_hinckley = pd.read_csv(os.path.join(DATA_RAW_FOLDER, epc_filename_hinckley), dtype=str)
epc_df = epc_df_nuneaton.append(epc_df_hinckley).drop_duplicates()

deprivation_filename_nuneaton = '2019-deprivation-by-postcode_nuneaton.csv'
deprivation_filename_hinckley = '2019-deprivation-by-postcode_hinckley.csv'
deprivation_df_nuneaton = pd.read_csv(os.path.join(DATA_RAW_FOLDER, deprivation_filename_nuneaton), dtype=str)
deprivation_df_hinckley = pd.read_csv(os.path.join(DATA_RAW_FOLDER, deprivation_filename_hinckley), dtype=str)
deprivation_df = deprivation_df_nuneaton.append(deprivation_df_hinckley).drop_duplicates()

floodrisk_df_filename = 'open_flood_risk_by_postcode.csv'
floodrisk_df = pd.read_csv(os.path.join(DATA_RAW_FOLDER, floodrisk_df_filename), header=None, dtype=str)

In [4]:
display(zoopla_df.head())
display(epc_df.head())
display(deprivation_df.head())
display(floodrisk_df.head())

Unnamed: 0,agent_logo,outcode,price_modifier,num_recepts,street_name,first_published_date,agent_address,property_type,floor_plan,details_url,country,num_bathrooms,agent_name,listing_status,listing_id,price,displayable_address,image_url,latitude,longitude,description,post_town,country_code,county,last_published_date,num_bedrooms,category,agent_phone,postcode,parish
0,https://st.zoocdn.com/zoopla_static_agent_logo...,CV11,from,3,"Meadow Green, Watling Street",2023-02-04 05:28:55,"Meadow Green, Watling Street, Nuneaton",Detached house,,https://www.zoopla.co.uk/for-sale/details/6388...,England,0,Taylor Wimpey - Meadow Green,sale,63883197,376500.0,"""The Lanford - Plot 322"" at Windrower Close, N...",https://lid.zoocdn.com/354/255/fd606582b571af7...,52.52016999999999,-1.4552873,"Discover this 4 bedroom Lanford home, ideal fo...",Nuneaton,gb,Warwickshire,2023-02-04 05:40:31,4,Residential,024 7511 6265,CV11 4FS,"Nuneaton and Bedworth, unparished area"
1,https://st.zoocdn.com/zoopla_static_agent_logo...,CV11,from,3,"Meadow Green, Watling Street",2023-02-04 05:28:47,"Meadow Green, Watling Street, Nuneaton",Detached house,,https://www.zoopla.co.uk/for-sale/details/6388...,England,0,Taylor Wimpey - Meadow Green,sale,63883200,489950.0,"""The Ransford - Plot 119"" at Windrower Close, ...",https://lid.zoocdn.com/354/255/f5547b1657bfbfd...,52.52016999999999,-1.4552873,This four bedroom Ransford home is perfect for...,Nuneaton,gb,Warwickshire,2023-02-04 05:38:17,4,Residential,024 7511 6265,CV11 4FS,"Nuneaton and Bedworth, unparished area"
2,https://st.zoocdn.com/zoopla_static_agent_logo...,CV11,from,2,"Meadow Green, Watling Street",2023-02-04 05:28:47,"Meadow Green, Watling Street, Nuneaton",Detached house,,https://www.zoopla.co.uk/for-sale/details/6388...,England,0,Taylor Wimpey - Meadow Green,sale,63883198,305000.0,"""The Byford - Plot 323"" at Windrower Close, Nu...",https://lid.zoocdn.com/354/255/941aa37a7610247...,52.52016999999999,-1.4552873,Find out how our mortgage contribution scheme*...,Nuneaton,gb,Warwickshire,2023-02-04 05:38:28,3,Residential,024 7511 6265,CV11 4FS,"Nuneaton and Bedworth, unparished area"
3,https://st.zoocdn.com/zoopla_static_agent_logo...,CV11,from,2,"Meadow Green, Watling Street",2023-02-04 05:28:47,"Meadow Green, Watling Street, Nuneaton",Detached house,,https://www.zoopla.co.uk/for-sale/details/6388...,England,0,Taylor Wimpey - Meadow Green,sale,63883199,314950.0,"""The Amersham - Plot 373"" at Windrower Close, ...",https://lid.zoocdn.com/354/255/b4096bb0c276201...,52.52016999999999,-1.4552873,A delightful three bedroom home with an integr...,Nuneaton,gb,Warwickshire,2023-02-04 05:40:02,3,Residential,024 7511 6265,CV11 4FS,"Nuneaton and Bedworth, unparished area"
4,https://st.zoocdn.com/zoopla_static_agent_logo...,CV10,guide_price,1,Duckpond Lane,2023-02-03 19:18:48,"22 Newdegate Street, Nuneaton",Detached house,,https://www.zoopla.co.uk/for-sale/details/6388...,England,2,Alan Cooper Estates,sale,63881100,300000.0,"Duckpond Lane, Weddington, Nuneaton CV10",https://lid.zoocdn.com/354/255/7649fd019aaf859...,52.54377,-1.463799,Here is a superb double fronted Detached Resid...,Nuneaton,gb,Warwickshire,2023-02-03 19:37:49,3,Residential,024 7513 8435,CV10 0FH,"Nuneaton and Bedworth, unparished area"


Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,POTENTIAL_ENERGY_EFFICIENCY,PROPERTY_TYPE,BUILT_FORM,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,TRANSACTION_TYPE,ENVIRONMENT_IMPACT_CURRENT,ENVIRONMENT_IMPACT_POTENTIAL,ENERGY_CONSUMPTION_CURRENT,ENERGY_CONSUMPTION_POTENTIAL,CO2_EMISSIONS_CURRENT,CO2_EMISS_CURR_PER_FLOOR_AREA,CO2_EMISSIONS_POTENTIAL,LIGHTING_COST_CURRENT,LIGHTING_COST_POTENTIAL,HEATING_COST_CURRENT,HEATING_COST_POTENTIAL,HOT_WATER_COST_CURRENT,HOT_WATER_COST_POTENTIAL,TOTAL_FLOOR_AREA,ENERGY_TARIFF,MAINS_GAS_FLAG,FLOOR_LEVEL,FLAT_TOP_STOREY,FLAT_STOREY_COUNT,MAIN_HEATING_CONTROLS,MULTI_GLAZE_PROPORTION,GLAZED_TYPE,GLAZED_AREA,EXTENSION_COUNT,NUMBER_HABITABLE_ROOMS,NUMBER_HEATED_ROOMS,LOW_ENERGY_LIGHTING,NUMBER_OPEN_FIREPLACES,HOTWATER_DESCRIPTION,HOT_WATER_ENERGY_EFF,HOT_WATER_ENV_EFF,FLOOR_DESCRIPTION,FLOOR_ENERGY_EFF,FLOOR_ENV_EFF,WINDOWS_DESCRIPTION,WINDOWS_ENERGY_EFF,WINDOWS_ENV_EFF,WALLS_DESCRIPTION,WALLS_ENERGY_EFF,WALLS_ENV_EFF,SECONDHEAT_DESCRIPTION,SHEATING_ENERGY_EFF,SHEATING_ENV_EFF,ROOF_DESCRIPTION,ROOF_ENERGY_EFF,ROOF_ENV_EFF,MAINHEAT_DESCRIPTION,MAINHEAT_ENERGY_EFF,MAINHEAT_ENV_EFF,MAINHEATCONT_DESCRIPTION,MAINHEATC_ENERGY_EFF,MAINHEATC_ENV_EFF,LIGHTING_DESCRIPTION,LIGHTING_ENERGY_EFF,LIGHTING_ENV_EFF,MAIN_FUEL,WIND_TURBINE_COUNT,HEAT_LOSS_CORRIDOR,UNHEATED_CORRIDOR_LENGTH,FLOOR_HEIGHT,PHOTO_SUPPLY,SOLAR_WATER_HEATING_FLAG,MECHANICAL_VENTILATION,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,CONSTRUCTION_AGE_BAND,LODGEMENT_DATETIME,TENURE,FIXED_LIGHTING_OUTLETS_COUNT,LOW_ENERGY_FIXED_LIGHT_COUNT,UPRN,UPRN_SOURCE
0,1230178709552014110416461495049124,"71, Riversley Road",,,CV11 5QT,1404959278,E,D,45,64,Maisonette,End-Terrace,2014-11-01,E07000219,E14000868,Warwickshire,2014-11-04,none of the above,42,62,384,236,5.1,74,3.1,96,51,989,616,110,110,69.0,dual,Y,1st,Y,,2102,100,double glazing installed before 2002,Normal,1,4,4,11,0,From main system,Good,Good,(other premises below),,,Fully double glazed,Average,Average,"Solid brick, as built, no insulation (assumed)",Very Poor,Very Poor,,,,"Pitched, no insulation (assumed)",Very Poor,Very Poor,"Boiler and radiators, mains gas",Good,Good,"Programmer, no room thermostat",Very Poor,Very Poor,Low energy lighting in 11% of fixed outlets,Poor,Poor,mains gas (not community),0,no corridor,,,0.0,,natural,"71, Riversley Road",Nuneaton and Bedworth,Nuneaton,NUNEATON,England and Wales: 1900-1929,2014-11-04 16:46:14,owner-occupied,9.0,1.0,100070163319,Address Matched
1,868018689262012122112363153798182,"23, Arden Road",Bulkington,,CV12 9JJ,8533893078,D,B,59,85,House,Detached,2012-12-21,E07000219,E14000905,Warwickshire,2012-12-21,FiT application,55,85,251,80,4.6,48,1.5,64,64,795,441,86,63,96.0,Single,Y,NODATA!,,,2106,100,double glazing installed during or after 2002,Normal,1,7,7,75,0,From main system,Good,Good,"Solid, no insulation (assumed)",,,Fully double glazed,Good,Good,"Solid brick, as built, no insulation (assumed)",Very Poor,Very Poor,"Room heaters, mains gas",,,"Pitched, 150 mm loft insulation",Good,Good,"Boiler and radiators, mains gas",Good,Good,"Programmer, room thermostat and TRVs",Good,Good,Low energy lighting in 75% of fixed outlets,Very Good,Very Good,mains gas (not community),0,NO DATA!,,,0.0,,natural,"23, Arden Road, Bulkington",Nuneaton and Bedworth,Rugby,BEDWORTH,England and Wales: 1900-1929,2012-12-21 12:36:31,owner-occupied,12.0,9.0,100070135974,Address Matched
2,1008849941732013092016263284978300,"249, Lutterworth Road",,,CV11 6PU,4044393178,E,C,50,76,Bungalow,Detached,2013-09-16,E07000219,E14000868,Warwickshire,2013-09-20,marketed sale,48,75,273,126,7.1,52,3.3,112,73,1292,857,172,84,136.0,dual,Y,NODATA!,,,2106,95,double glazing installed before 2002,Normal,1,6,6,47,1,From main system,Average,Average,"Solid, no insulation (assumed)",,,Mostly double glazing,Average,Average,"Cavity wall, filled cavity",Good,Good,"Room heaters, electric",,,"Pitched, 100 mm loft insulation",Average,Average,"Boiler and radiators, mains gas",Good,Good,"Programmer, room thermostat and TRVs",Good,Good,Low energy lighting in 47% of fixed outlets,Good,Good,mains gas (not community),0,NO DATA!,,,0.0,,natural,"249, Lutterworth Road",Nuneaton and Bedworth,Nuneaton,NUNEATON,England and Wales: 1950-1966,2013-09-20 16:26:32,owner-occupied,15.0,7.0,100070155947,Address Matched
3,1512639457112017012009415295930043,"130, Haunchwood Road",,,CV10 8DJ,363759478,E,C,54,76,House,Mid-Terrace,2017-01-12,E07000219,E14000868,Warwickshire,2017-01-20,marketed sale,47,71,364,189,5.1,64,2.7,96,54,932,702,97,66,79.0,Unknown,Y,NODATA!,,,2107,100,"double glazing, unknown install date",Normal,0,4,4,20,0,From main system,Good,Good,"Suspended, no insulation (assumed)",NO DATA!,,Fully double glazed,Average,Average,"Solid brick, as built, no insulation (assumed)",Very Poor,Very Poor,,,,"Pitched, no insulation (assumed)",Very Poor,Very Poor,"Boiler and radiators, mains gas",Good,Good,"Programmer, TRVs and bypass",Average,Average,Low energy lighting in 20% of fixed outlets,Poor,Poor,mains gas (not community),0,NO DATA!,,,,N,natural,"130, Haunchwood Road",Nuneaton and Bedworth,Nuneaton,NUNEATON,England and Wales: 1900-1929,2017-01-20 09:41:52,owner-occupied,,,100070150683,Address Matched
4,346951970962009081717201716538621,"8, Seeswood Close",,,CV10 7JF,2183726668,D,C,65,73,House,Mid-Terrace,2009-08-17,E07000219,E14000868,Warwickshire,2009-08-17,marketed sale,63,72,308,232,2.8,50,2.1,54,27,415,348,124,99,54.4,Single,Y,NO DATA!,,,2104,100,double glazing installed before 2002,Normal,0,3,2,0,0,From main system,Good,Good,"Solid, no insulation (assumed)",,,Fully double glazed,Average,Average,"Cavity wall, as built, insulated (assumed)",Good,Good,Portable electric heaters,,,"Pitched, 100mm loft insulation",Average,Average,"Boiler and radiators, mains gas",Good,Good,Programmer and room thermostat,Poor,Poor,No low energy lighting,Very Poor,Very Poor,mains gas - this is for backwards compatibilit...,0,NO DATA!,,2.4,0.0,N,natural,"8, Seeswood Close",Nuneaton and Bedworth,Nuneaton,NUNEATON,England and Wales: 1991-1995,2009-08-17 17:20:17,owner-occupied,,,10000225799,Address Matched


Unnamed: 0,Postcode,Postcode Status,LSOA code,LSOA Name,User Data A,User Data B,User Data C,User Data D,User Data E,User Data F,User Data G,User Data H,User Data I,User Data J,User Data K,User Data L,User Data M,User Data N,User Data O,User Data P,Index of Multiple Deprivation Rank,Index of Multiple Deprivation Decile,Income Rank,Income Decile,Income Score,Employment Rank,Employment Decile,Employment Score,Education and Skills Rank,Education and Skills Decile,Health and Disability Rank,Health and Disability Decile,Crime Rank,Crime Decile,Barriers to Housing and Services Rank,Barriers to Housing and Services Decile,Living Environment Rank,Living Environment Decile,IDACI Rank,IDACI Decile,IDACI Score,IDAOPI Rank,IDAOPI Decile,IDAOPI Score
0,Postcode,**UNMATCHED**,,,In Use?,Latitude,Longitude,Easting,Northing,Grid Ref,Ward,Parish,Introduced,Terminated,Altitude,Country,Last Updated,Quality,LSOA Code,LSOA Name,,,,,,,,,,,,,,,,,,,,,,,,
1,CV10 0AA,Live,E01031102,Nuneaton and Bedworth 003C E01031102,Yes,52.52675,-1.46076,436681,292234,SP366922,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,,85,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210.0,4.0,15558.0,5.0,0.105,10631.0,4.0,0.113,23574.0,8.0,7774.0,3.0,4258.0,2.0,10021.0,4.0,18272.0,6.0,19941.0,7.0,0.096,17648.0,6.0,0.118
2,CV10 0AB,Live,E01031102,Nuneaton and Bedworth 003C E01031102,Yes,52.527391,-1.459293,436780,292306,SP367923,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,,86,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210.0,4.0,15558.0,5.0,0.105,10631.0,4.0,0.113,23574.0,8.0,7774.0,3.0,4258.0,2.0,10021.0,4.0,18272.0,6.0,19941.0,7.0,0.096,17648.0,6.0,0.118
3,CV10 0AD,Live,E01031102,Nuneaton and Bedworth 003C E01031102,Yes,52.5276,-1.461965,436599,292328,SP365923,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,,86,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210.0,4.0,15558.0,5.0,0.105,10631.0,4.0,0.113,23574.0,8.0,7774.0,3.0,4258.0,2.0,10021.0,4.0,18272.0,6.0,19941.0,7.0,0.096,17648.0,6.0,0.118
4,CV10 0AE,Terminated,E01031102,Nuneaton and Bedworth 003C E01031102,No,52.52684,-1.462793,436543,292243,SP365922,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,2009-11-01,86,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210.0,4.0,15558.0,5.0,0.105,10631.0,4.0,0.113,23574.0,8.0,7774.0,3.0,4258.0,2.0,10021.0,4.0,18272.0,6.0,19941.0,7.0,0.096,17648.0,6.0,0.118


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,AL10 0AA,\N,,\N,\N,\N,522503,208775,51.764264,-0.226254
1,AL10 0AB,\N,,\N,\N,\N,522680,209765,51.773122,-0.223341
2,AL10 0AD,\N,,\N,\N,\N,522997,209812,51.773475,-0.218732
3,AL10 0AE,\N,,\N,\N,\N,522530,209750,51.77302,-0.225519
4,AL10 0AG,\N,,\N,\N,\N,522515,209794,51.773419,-0.225721


### Fix column names in each dataset

In [5]:
# Deprivation dataset - colnames spread over two rows.
# If colname starts with 'User Data', then use second row, else first row

for colname in deprivation_df.columns:
    if colname.startswith('User Data'):
        new_colname = deprivation_df[colname].iloc[0]
        deprivation_df.rename(columns={colname: new_colname}, inplace=True)

# remove the first row
deprivation_df = deprivation_df.iloc[1:]
deprivation_df.head()

Unnamed: 0,Postcode,Postcode Status,LSOA code,LSOA Name,In Use?,Latitude,Longitude,Easting,Northing,Grid Ref,Ward,Parish,Introduced,Terminated,Altitude,Country,Last Updated,Quality,LSOA Code,LSOA Name.1,Index of Multiple Deprivation Rank,Index of Multiple Deprivation Decile,Income Rank,Income Decile,Income Score,Employment Rank,Employment Decile,Employment Score,Education and Skills Rank,Education and Skills Decile,Health and Disability Rank,Health and Disability Decile,Crime Rank,Crime Decile,Barriers to Housing and Services Rank,Barriers to Housing and Services Decile,Living Environment Rank,Living Environment Decile,IDACI Rank,IDACI Decile,IDACI Score,IDAOPI Rank,IDAOPI Decile,IDAOPI Score
1,CV10 0AA,Live,E01031102,Nuneaton and Bedworth 003C E01031102,Yes,52.52675,-1.46076,436681,292234,SP366922,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,,85,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210,4,15558,5,0.105,10631,4,0.113,23574,8,7774,3,4258,2,10021,4,18272,6,19941,7,0.096,17648,6,0.118
2,CV10 0AB,Live,E01031102,Nuneaton and Bedworth 003C E01031102,Yes,52.527391,-1.459293,436780,292306,SP367923,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,,86,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210,4,15558,5,0.105,10631,4,0.113,23574,8,7774,3,4258,2,10021,4,18272,6,19941,7,0.096,17648,6,0.118
3,CV10 0AD,Live,E01031102,Nuneaton and Bedworth 003C E01031102,Yes,52.5276,-1.461965,436599,292328,SP365923,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,,86,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210,4,15558,5,0.105,10631,4,0.113,23574,8,7774,3,4258,2,10021,4,18272,6,19941,7,0.096,17648,6,0.118
4,CV10 0AE,Terminated,E01031102,Nuneaton and Bedworth 003C E01031102,No,52.52684,-1.462793,436543,292243,SP365922,St. Nicolas,"Nuneaton and Bedworth, unparished area",1980-01-01,2009-11-01,86,England,2022-11-25,Within the building of the matched address clo...,E01031102,Nuneaton and Bedworth 003C,12210,4,15558,5,0.105,10631,4,0.113,23574,8,7774,3,4258,2,10021,4,18272,6,19941,7,0.096,17648,6,0.118
5,CV10 0AF,Live,E01031113,Nuneaton and Bedworth 001D E01031113,Yes,52.538271,-1.467776,436196,293512,SP361935,Weddington,"Nuneaton and Bedworth, unparished area",2006-04-01,,84,England,2022-11-25,Within the building of the matched address clo...,E01031113,Nuneaton and Bedworth 001D,26040,8,26694,9,0.048,24075,8,0.052,25403,8,21185,7,17466,6,22854,7,12915,4,25758,8,0.059,24829,8,0.072


In [6]:
# Flood risk dataset - set names
colnames = {0: 'postcode', 1: 'FID', 2: 'PROB_4BAND', 3: 'SUITABILITY', 4: 'PUB_DATE',
            5: 'RISK_FOR_INSURANCE_SOP', 6: 'easting', 7: 'northing', 8: 'latitude', 9: 'longitude'}

floodrisk_df.rename(columns=colnames, inplace=True)
floodrisk_df.head()

Unnamed: 0,postcode,FID,PROB_4BAND,SUITABILITY,PUB_DATE,RISK_FOR_INSURANCE_SOP,easting,northing,latitude,longitude
0,AL10 0AA,\N,,\N,\N,\N,522503,208775,51.764264,-0.226254
1,AL10 0AB,\N,,\N,\N,\N,522680,209765,51.773122,-0.223341
2,AL10 0AD,\N,,\N,\N,\N,522997,209812,51.773475,-0.218732
3,AL10 0AE,\N,,\N,\N,\N,522530,209750,51.77302,-0.225519
4,AL10 0AG,\N,,\N,\N,\N,522515,209794,51.773419,-0.225721


### Get street name from EPC dataset

In [7]:
def get_street_name(address_1, address_2):
    
    """
    Get street name from first two street address fields
    """
    
    street_and_road = re.compile(r'^\d,\s+')
    
    # if street name starts with a number (maybe followed by comma) and a space, likely next part is street name
    if street_and_road.match(address_1):
        street = re.split(street_and_road, address_1)[1].lower()
        
    # otherwise choose the second part of the address as the street name
    else:
        street = str(address_2).lower()
        
    return street

In [8]:
epc_df['Street'] = epc_df[['ADDRESS1', 'ADDRESS2']].apply(lambda x: get_street_name(x[0], x[1]), axis=1)

### Also remove prefix 'England and Wales: ' from construction age band in EPC data, and drop imvalid values

In [9]:
epc_df['CONSTRUCTION_AGE_BAND'].value_counts(dropna=False)

England and Wales: 1950-1966       13748
England and Wales: 1900-1929       11290
England and Wales: 1967-1975       11112
England and Wales: 1930-1949       10554
NO DATA!                            8879
England and Wales: 1983-1990        6390
England and Wales: 1976-1982        6149
England and Wales: 1996-2002        5182
England and Wales: 2003-2006        4490
England and Wales: before 1900      3821
England and Wales: 1991-1995        3748
England and Wales: 2007 onwards     1881
NaN                                  898
2021                                 670
England and Wales: 2007-2011         567
2020                                 547
2022                                 497
2018                                 304
INVALID!                             136
England and Wales: 2012 onwards      122
2017                                 108
2019                                  60
2016                                  32
2023                                   8
2014            

In [10]:
# e.g. England and Wales: 1967-1975 -> 1967-1975
epc_df['CONSTRUCTION_AGE_BAND'] = epc_df['CONSTRUCTION_AGE_BAND'].replace('England and Wales: ' , '', regex=True)
epc_df['CONSTRUCTION_AGE_BAND'] = epc_df['CONSTRUCTION_AGE_BAND'].replace(r'(NO DATA!|INVALID!)' , np.nan, regex=True)

In [11]:
epc_df['CONSTRUCTION_AGE_BAND'].value_counts(dropna=False)

1950-1966       13748
1900-1929       11290
1967-1975       11112
1930-1949       10554
NaN              9913
1983-1990        6390
1976-1982        6149
1996-2002        5182
2003-2006        4490
before 1900      3821
1991-1995        3748
2007 onwards     1881
2021              670
2007-2011         567
2020              547
2022              497
2018              304
2012 onwards      122
2017              108
2019               60
2016               32
2023                8
2014                7
1930                6
2013                4
1900                2
Name: CONSTRUCTION_AGE_BAND, dtype: int64

### Select out relevant columns in each dataset and drop duplicates

In [12]:
zoopla_df = zoopla_df[['listing_id', 'parish', 'post_town', 'postcode', 'latitude', 'longitude',
                       'property_type', 'num_bedrooms', 'num_bathrooms', 'description',
                       'first_published_date', 'last_published_date', 'price']].drop_duplicates()

epc_df = epc_df[['Street', 'POSTCODE', 'CURRENT_ENERGY_RATING', 'POTENTIAL_ENERGY_RATING',
                 'PROPERTY_TYPE', 'BUILT_FORM', 'TOTAL_FLOOR_AREA', 
                 'NUMBER_HABITABLE_ROOMS', 'CONSTRUCTION_AGE_BAND']].drop_duplicates()
epc_df.rename(columns={'POSTCODE': 'postcode'}, inplace=True)

deprivation_df = deprivation_df[['Postcode', 'Index of Multiple Deprivation Decile', 'Income Decile',
                                 'Employment Decile', 'Education and Skills Decile',
                                 'Health and Disability Decile', 'Crime Decile', 
                                 'Barriers to Housing and Services Decile', 'Living Environment Decile',
                                 'IDACI Decile', 'IDAOPI Decile']].drop_duplicates()
deprivation_df.rename(columns={'Postcode': 'postcode'}, inplace=True)

floodrisk_df = floodrisk_df[['postcode', 'PROB_4BAND']].drop_duplicates()

### Map PROPERTY_TYPE and BUILT_FORM in EPC data onto those for property_type in Zoopla data
We will then be able to narrow down possible EPC data houses onto the Zoopla house and join them

In [13]:
# first see if EPC appears in any of the Zoopla property descriptions
zoopla_df_epcs = zoopla_df[zoopla_df['description'].str.contains('epc', case=False)]['description']
for epc in zoopla_df_epcs:
    print(epc)

Here is a Detached Residence with four bedrooms occupying a prominent corner plot within this highly regarded and most sought-after location opposite Nuneaton Golf Club and handy for all local amenities. EPC rating D.
A vastly improved Semi Detached House in a sought-after residential area and offering excellent family accommodation with three bedrooms and two bathrooms. EPC rating C.
Here is a modern three storey Semi Detached Residence offering well planned accommodation designed to suit the needs of a modern family lifestyle. Early Viewing advised EPC rating B.
Here is a most delightful traditional style Semi Detached House offering much improved and particularly well maintained accommodation enjoying a wealth of charm and character throughout. EPC rating D.
Here's a great starter home! A larger style centre terrace house offering deceptively spacious and well presented accommodation considered ideal for the first time buyer. EPC rating E.
Unexpectedly avaiable; A refurbished and mu

Very few descriptions have EPCs so use the EPC data to infer them (first dropping description)

In [14]:
zoopla_df.drop('description', axis=1, inplace=True)

In [15]:
print('Zoopla house types:')
display(zoopla_df['property_type'].value_counts())

print('\nEPC house types:')
display(epc_df.groupby(['PROPERTY_TYPE', 'BUILT_FORM'])['PROPERTY_TYPE'].count())

Zoopla house types:


Detached house            271
Semi-detached house       255
Terraced house            118
Flat                       72
End terrace house          42
Detached bungalow          30
Town house                 17
Link-detached house        16
Mobile/park home           12
Bungalow                   11
Semi-detached bungalow      9
Maisonette                  6
Land                        5
Mews house                  2
Cottage                     1
Chalet                      1
Block of flats              1
Name: property_type, dtype: int64


EPC house types:


PROPERTY_TYPE  BUILT_FORM          
Bungalow       Detached                 4299
               Enclosed End-Terrace        5
               Enclosed Mid-Terrace        4
               End-Terrace               544
               Mid-Terrace               607
               NO DATA!                    4
               Semi-Detached            2738
Flat           Detached                 1649
               Enclosed End-Terrace      341
               Enclosed Mid-Terrace      221
               End-Terrace              2033
               Mid-Terrace              2696
               NO DATA!                  468
               Semi-Detached            3075
House          Detached                16425
               Enclosed End-Terrace      126
               Enclosed Mid-Terrace       50
               End-Terrace              6895
               Mid-Terrace             13371
               NO DATA!                  163
               Semi-Detached           23907
Maisonette     Deta

In [16]:
def set_property_genre_epc(property_type, built_form):
    
    """
    Map the EPC property type and built form pairs into the Zoopla values
    """
    
    property_type = str(property_type).lower()
    built_form = str(built_form).lower()
    
    if property_type == 'bungalow':
        property_genre = 'Bungalow'
            
    elif property_type == 'flat':
        property_genre = 'Flat'
        
    elif property_type == 'house':
        if built_form == 'detached':
            property_genre = 'Detached house'
        elif built_form == 'semi-detached':
            property_genre = 'Semi-detached house'
        elif 'end-terrace' in built_form:
            property_genre = 'End terrace house'
        elif 'mid-terrace' in built_form:
            property_genre = 'Terraced house'
        else:
            property_genre = 'House'
            
    elif property_type == 'maisonette':
        property_genre = 'Maisonette'
        
    else:
        property_genre = 'Other/Unknown'
        
    return property_genre


In [17]:
def set_property_genre_zoopla(property_type):
    
    """
    Merge rare Zoopla property types into their own parent category or an 'Other' genre
    This is to create a temporary property type field that we can then join on to the EPC data
    But for the prediction, we will use the original property types
    """
    
    property_type = str(property_type).lower()
    
    if property_type == 'town house':
        property_genre = 'Terraced house'
    elif property_type == 'link-detached house':
        property_genre = 'Detached house'
    elif 'bungalow' in property_type:
        property_genre = 'Bungalow'
    elif property_type in ['semi-detached house', 'detached house', 'bungalow', 'flat',
       'town house', 'link-detached house', 'end terrace house', 'terraced house',
       'detached bungalow', 'maisonette', 'semi-detached bungalow']:
        property_genre = property_type.capitalize()
    else:
        property_genre = 'Other/Unknown'
        
    return property_genre
    

In [18]:
epc_df['property_type_general'] = epc_df[['PROPERTY_TYPE', 'BUILT_FORM']].apply(
    lambda x: set_property_genre_epc(x[0], x[1]), axis=1)

zoopla_df['property_type_general'] = zoopla_df['property_type'].apply(set_property_genre_zoopla)
zoopla_df.drop('property_type', axis=1, inplace=True)

display(epc_df['property_type_general'].value_counts(dropna=False))
display(zoopla_df['property_type_general'].value_counts(dropna=False))

Semi-detached house    23907
Detached house         16425
Terraced house         13421
Flat                   10484
Bungalow                8201
End terrace house       7021
Maisonette              1336
House                    163
Other/Unknown             28
Name: property_type_general, dtype: int64

Detached house         287
Semi-detached house    255
Terraced house         135
Flat                    72
Bungalow                50
End terrace house       42
Other/Unknown           38
Maisonette               6
Name: property_type_general, dtype: int64

Convert numerical fields from strings to numbers, and energy ratings and construction age ranges to ints. Since energy ratings and construction ranges form an ordered scale, it makes sense to convert these from categorical to numerical values for the ML step later

In [19]:
epc_df['TOTAL_FLOOR_AREA'] = epc_df['TOTAL_FLOOR_AREA'].astype(float, errors='ignore')
epc_df['NUMBER_HABITABLE_ROOMS'] = epc_df['NUMBER_HABITABLE_ROOMS'].astype(float, errors='ignore')

In [20]:
energy_rating_mapper = {
    'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7
}
epc_df['CURRENT_ENERGY_RATING'] = epc_df['CURRENT_ENERGY_RATING'].map(energy_rating_mapper)
epc_df['POTENTIAL_ENERGY_RATING'] = epc_df['POTENTIAL_ENERGY_RATING'].map(energy_rating_mapper)

In [21]:
display(epc_df['CONSTRUCTION_AGE_BAND'].sort_index().value_counts(dropna=False))

def get_ageband_rank(ageband):
    
    """
    Convert construction age bands into a standardised set of bands and then convert to 
    integers where lower integer means older property
    """
    
    # account for cases where before 1900 or first 4 characters are not numeric
    if str(ageband).lower() == 'before 1900':
        return 1
    elif pd.isnull(ageband) or not ageband[:4].isnumeric():
        return None
    
    # if first 4 characters are numeric, convert bins to ints
    else:
        age_lowerband = int(ageband[:4])
        
        if age_lowerband >= 1900 and age_lowerband < 1930:
            return 2
        elif age_lowerband >= 1930 and age_lowerband < 1950:
            return 3
        elif age_lowerband >= 1950 and age_lowerband < 1967:
            return 4
        elif age_lowerband >= 1967 and age_lowerband < 1976:
            return 5
        elif age_lowerband >= 1976 and age_lowerband < 1983:
            return 6
        elif age_lowerband >= 1983 and age_lowerband < 1991:
            return 7
        elif age_lowerband >= 1991 and age_lowerband < 1996:
            return 8
        elif age_lowerband >= 1996 and age_lowerband < 2003:
            return 9
        elif age_lowerband >= 2003 and age_lowerband < 2007:
            return 10
        elif age_lowerband >= 2007 and age_lowerband < 2012:
            return 11
        elif age_lowerband >= 2012:
            return 12
        else:
            return None
        
epc_df['CONSTRUCTION_AGE_BAND'] = epc_df['CONSTRUCTION_AGE_BAND'].apply(get_ageband_rank)
display(epc_df['CONSTRUCTION_AGE_BAND'].sort_index().value_counts(dropna=False))


1950-1966       12963
1900-1929       10795
1967-1975       10494
1930-1949       10004
1983-1990        6087
1976-1982        5588
NaN              5487
1996-2002        4826
2003-2006        4190
before 1900      3729
1991-1995        3372
2007 onwards     1630
2007-2011         531
2021              369
2022              282
2020              255
2018              157
2012 onwards      105
2017               43
2019               36
2016               20
2023                7
2014                6
1930                5
2013                3
1900                2
Name: CONSTRUCTION_AGE_BAND, dtype: int64

4.0     12963
2.0     10797
5.0     10494
3.0     10009
7.0      6087
6.0      5588
NaN      5487
9.0      4826
10.0     4190
1.0      3729
8.0      3372
11.0     2161
12.0     1283
Name: CONSTRUCTION_AGE_BAND, dtype: int64

Now for a given postcode and property type in the EPC dataset, get both the most common EPC and range of EPCs

In [22]:
def get_mode(values):
    
    """
    Get most frequent value of a column in a given group.
    If there are no values, or all values are null, return null
    """

    if len(values) == 0 or all(pd.isnull(values)):
        return np.nan
    else:
        return pd.Series.mode(values)[0]

epc_df = epc_df.groupby(by=['postcode', 'property_type_general']).agg({
    'CURRENT_ENERGY_RATING': lambda x: get_mode(x),
    'POTENTIAL_ENERGY_RATING': lambda x: get_mode(x),
    'TOTAL_FLOOR_AREA': 'median',
    'NUMBER_HABITABLE_ROOMS': lambda x: get_mode(x),
    'CONSTRUCTION_AGE_BAND': lambda x: get_mode(x)
})

In [23]:
epc_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,TOTAL_FLOOR_AREA,NUMBER_HABITABLE_ROOMS,CONSTRUCTION_AGE_BAND
postcode,property_type_general,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CV10 0AA,Detached house,4,2,110.5,5.0,4.0
CV10 0AA,End terrace house,5,3,83.0,5.0,2.0
CV10 0AA,Flat,4,3,76.0,2.0,2.0
CV10 0AA,Maisonette,5,3,47.0,3.0,2.0
CV10 0AA,Semi-detached house,5,3,157.0,6.0,1.0
CV10 0AA,Terraced house,5,3,103.5,5.0,2.0
CV10 0AB,Detached house,5,5,102.5,5.0,2.0
CV10 0AB,Flat,5,4,99.67,4.0,1.0
CV10 0AB,Semi-detached house,4,2,96.0,5.0,1.0
CV10 0AB,Terraced house,4,2,113.81,5.0,2.0


In [24]:
# set the indices back as columns so we can join them to the zoopla dataframe
epc_df = epc_df.reset_index()

In [25]:
# rename the lambda columns to more meaningful names
epc_df.rename(columns={'CURRENT_ENERGY_RATING': 'CURRENT_ENERGY_RATING_mode'}, inplace=True)
epc_df.rename(columns={'POTENTIAL_ENERGY_RATING': 'POTENTIAL_ENERGY_RATING_mode'}, inplace=True)
epc_df.rename(columns={'TOTAL_FLOOR_AREA': 'TOTAL_FLOOR_AREA_median'}, inplace=True)
epc_df.rename(columns={'NUMBER_HABITABLE_ROOMS': 'NUMBER_HABITABLE_ROOMS_mode'}, inplace=True)
epc_df.rename(columns={'CONSTRUCTION_AGE_BAND': 'CONSTRUCTION_AGE_BAND_mode'}, inplace=True)

In [26]:
epc_df.head()

Unnamed: 0,postcode,property_type_general,CURRENT_ENERGY_RATING_mode,POTENTIAL_ENERGY_RATING_mode,TOTAL_FLOOR_AREA_median,NUMBER_HABITABLE_ROOMS_mode,CONSTRUCTION_AGE_BAND_mode
0,CV10 0AA,Detached house,4,2,110.5,5.0,4.0
1,CV10 0AA,End terrace house,5,3,83.0,5.0,2.0
2,CV10 0AA,Flat,4,3,76.0,2.0,2.0
3,CV10 0AA,Maisonette,5,3,47.0,3.0,2.0
4,CV10 0AA,Semi-detached house,5,3,157.0,6.0,1.0


### Left join Zoopla dataset and EPC dataset
If postcode and property type same, then join

In [27]:
zoopla_df = zoopla_df.merge(epc_df, on=['postcode', 'property_type_general'], how='left')
zoopla_df.head()

Unnamed: 0,listing_id,parish,post_town,postcode,latitude,longitude,num_bedrooms,num_bathrooms,first_published_date,last_published_date,price,property_type_general,CURRENT_ENERGY_RATING_mode,POTENTIAL_ENERGY_RATING_mode,TOTAL_FLOOR_AREA_median,NUMBER_HABITABLE_ROOMS_mode,CONSTRUCTION_AGE_BAND_mode
0,63883197,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,4,0,2023-02-04 05:28:55,2023-02-04 05:40:31,376500.0,Detached house,4.0,2.0,109.897,7.0,8.0
1,63883200,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,4,0,2023-02-04 05:28:47,2023-02-04 05:38:17,489950.0,Detached house,4.0,2.0,109.897,7.0,8.0
2,63883198,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,3,0,2023-02-04 05:28:47,2023-02-04 05:38:28,305000.0,Detached house,4.0,2.0,109.897,7.0,8.0
3,63883199,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,3,0,2023-02-04 05:28:47,2023-02-04 05:40:02,314950.0,Detached house,4.0,2.0,109.897,7.0,8.0
4,63881100,"Nuneaton and Bedworth, unparished area",Nuneaton,CV10 0FH,52.54377,-1.463799,3,2,2023-02-03 19:18:48,2023-02-03 19:37:49,300000.0,Detached house,,,,,


In [28]:
print(len(zoopla_df[zoopla_df['CURRENT_ENERGY_RATING_mode'].isnull()]))
display(zoopla_df[zoopla_df['CURRENT_ENERGY_RATING_mode'].isnull()]['postcode'].value_counts())

144


CV11 6BD    7
CV10 9BW    6
CV10 0QE    6
CV10 9AN    4
CV10 9BY    3
           ..
CV10 0GN    1
CV11 6QL    1
CV10 9AZ    1
CV11 5JZ    1
CV11 4QT    1
Name: postcode, Length: 99, dtype: int64

Quite a few rows have missing energy ratings. We will look to impute these later

### Left join Zoopla dataset and deprivation dataset
If postcode same, then join

In [29]:
zoopla_df = zoopla_df.merge(deprivation_df, on='postcode', how='left')
zoopla_df.head()

Unnamed: 0,listing_id,parish,post_town,postcode,latitude,longitude,num_bedrooms,num_bathrooms,first_published_date,last_published_date,price,property_type_general,CURRENT_ENERGY_RATING_mode,POTENTIAL_ENERGY_RATING_mode,TOTAL_FLOOR_AREA_median,NUMBER_HABITABLE_ROOMS_mode,CONSTRUCTION_AGE_BAND_mode,Index of Multiple Deprivation Decile,Income Decile,Employment Decile,Education and Skills Decile,Health and Disability Decile,Crime Decile,Barriers to Housing and Services Decile,Living Environment Decile,IDACI Decile,IDAOPI Decile
0,63883197,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,4,0,2023-02-04 05:28:55,2023-02-04 05:40:31,376500.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4
1,63883200,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,4,0,2023-02-04 05:28:47,2023-02-04 05:38:17,489950.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4
2,63883198,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,3,0,2023-02-04 05:28:47,2023-02-04 05:38:28,305000.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4
3,63883199,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,3,0,2023-02-04 05:28:47,2023-02-04 05:40:02,314950.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4
4,63881100,"Nuneaton and Bedworth, unparished area",Nuneaton,CV10 0FH,52.54377,-1.463799,3,2,2023-02-03 19:18:48,2023-02-03 19:37:49,300000.0,Detached house,,,,,,8,9,8,8,7,6,7,4,8,8


### Left join Zoopla dataset and flood risk dataset
If postcode same, then join

In [30]:
zoopla_df = zoopla_df.merge(floodrisk_df, on='postcode', how='left')
zoopla_df.head()

Unnamed: 0,listing_id,parish,post_town,postcode,latitude,longitude,num_bedrooms,num_bathrooms,first_published_date,last_published_date,price,property_type_general,CURRENT_ENERGY_RATING_mode,POTENTIAL_ENERGY_RATING_mode,TOTAL_FLOOR_AREA_median,NUMBER_HABITABLE_ROOMS_mode,CONSTRUCTION_AGE_BAND_mode,Index of Multiple Deprivation Decile,Income Decile,Employment Decile,Education and Skills Decile,Health and Disability Decile,Crime Decile,Barriers to Housing and Services Decile,Living Environment Decile,IDACI Decile,IDAOPI Decile,PROB_4BAND
0,63883197,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,4,0,2023-02-04 05:28:55,2023-02-04 05:40:31,376500.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4,
1,63883200,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,4,0,2023-02-04 05:28:47,2023-02-04 05:38:17,489950.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4,
2,63883198,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,3,0,2023-02-04 05:28:47,2023-02-04 05:38:28,305000.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4,
3,63883199,"Nuneaton and Bedworth, unparished area",Nuneaton,CV11 4FS,52.52016999999999,-1.4552873,3,0,2023-02-04 05:28:47,2023-02-04 05:40:02,314950.0,Detached house,4.0,2.0,109.897,7.0,8.0,4,4,3,4,4,4,7,2,4,4,
4,63881100,"Nuneaton and Bedworth, unparished area",Nuneaton,CV10 0FH,52.54377,-1.463799,3,2,2023-02-03 19:18:48,2023-02-03 19:37:49,300000.0,Detached house,,,,,,8,9,8,8,7,6,7,4,8,8,


### Drop duplicates and save to csv file

In [31]:
zoopla_df = zoopla_df.drop_duplicates()

In [32]:
try:
    os.mkdir(SAVE_FOLDER)
except OSError:
    pass

save_file = os.path.join(SAVE_FOLDER, 'zoopla_properties_with_postcode_epc_dep_flood.csv')
    
zoopla_df.to_csv(save_file, index=False)