In [1]:
import pandas as pd
years = [18, 19, 20, 21, 22, 23, 24, 25]
headers = [
    "transaction_id",
    "price",
    "date_of_transfer",
    "postcode",
    "property_type",
    "new_build_flag",
    "tenure_type",
    "primary_addressable_object_name",
    "secondary_addressable_object_name",
    "street",
    "locality",
    "town_city",
    "district",
    "county",
    "ppd_category_type",
    "record_status"
]
drop = [
    "transaction_id",
    "locality",
    "ppd_category_type",
    "record_status"
]
dfs = {}
for year in years:
    url = f"http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-20{year}.csv"
    df = pd.read_csv(url)
    df.columns = headers
    df = df.drop(columns=drop)
    dfs[f"df20{year}"] = df
    print(f"{year} done")


18 done
19 done
20 done
21 done
22 done


  df = pd.read_csv(url)


23 done
24 done
25 done


In [2]:
df_full = pd.concat(dfs)

In [3]:
df_full['Price (Thousands)'] = df_full['price'] / 1000
df_full.drop(columns=['price'], inplace=True)
df_full.describe()

Unnamed: 0,Price (Thousands)
count,7219742.0
mean,381.8475
std,1711.452
min,0.001
25%,165.0
50%,260.0
75%,400.0
max,900000.0


In [4]:
df_full.isna().sum()

date_of_transfer                           0
postcode                               24503
property_type                              0
new_build_flag                             0
tenure_type                                0
primary_addressable_object_name            0
secondary_addressable_object_name    6263102
street                                130620
town_city                                  0
district                                   0
county                                     0
Price (Thousands)                          0
dtype: int64

In [5]:
df_clean = df_full[df_full['postcode'].notna()]
df_clean.describe()

Unnamed: 0,Price (Thousands)
count,7195239.0
mean,379.4882
std,1660.432
min,0.001
25%,165.0
50%,260.0
75%,400.0
max,900000.0


In [6]:
df_binary = pd.get_dummies(df_clean , dtype=int, columns=['tenure_type','new_build_flag', 'property_type'])
df_binary.rename(columns={'date_of_transfer':'Transfer Date','town_city':'Town/City','district':'District','county':'County','tenure_type_F':'Freehold Tenure','tenure_type_L':'Leasehold Tenure','new_build_flag_N':'Old Build','new_build_flag_Y':'New Build','property_type_D':'Detached', 'property_type_F':'Flat', 'property_type_O':'Other Property Type','property_type_S':'Semi-detached','property_type_T':'Terraced', 'postcode':'Postcode'}, inplace=True)
df_binary.head()

Unnamed: 0,Unnamed: 1,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced
df2018,0,2018-09-24 00:00,M6 8GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,0,1,0,1,0,0,0,0
df2018,1,2018-09-28 00:00,WA3 2UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,0,0,1,1,0,0,0,0
df2018,2,2018-08-29 00:00,OL6 6RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,0,1,0,0,0,0,1,0
df2018,3,2018-06-15 00:00,M46 0TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,0,0,1,0,0,0,1,0
df2018,4,2018-09-28 00:00,M28 3XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,1,0,1,1,0,0,0,0


In [7]:
df_binary['Transfer Date'] = pd.to_datetime(df_binary['Transfer Date'], errors='coerce')
df_binary['Year'] = df_binary['Transfer Date'].dt.year
df_binary['Month'] = df_binary['Transfer Date'].dt.month
df_binary['Quarter'] = df_binary['Transfer Date'].dt.quarter
df_binary['Day of the Week'] = df_binary['Transfer Date'].dt.dayofweek
df_binary['Transfer Date'] = df_binary['Transfer Date'].astype(str).str[:10]

In [8]:
print(df_binary.dtypes)

Transfer Date                         object
Postcode                              object
primary_addressable_object_name       object
secondary_addressable_object_name     object
street                                object
Town/City                             object
District                              object
County                                object
Price (Thousands)                    float64
Freehold Tenure                        int64
Leasehold Tenure                       int64
Old Build                              int64
New Build                              int64
Detached                               int64
Flat                                   int64
Other Property Type                    int64
Semi-detached                          int64
Terraced                               int64
Year                                   int32
Month                                  int32
Quarter                                int32
Day of the Week                        int32
dtype: obj

In [9]:
print(df_binary.nunique())

Transfer Date                           2731
Postcode                             1162297
primary_addressable_object_name       311294
secondary_addressable_object_name      37652
street                                302288
Town/City                               1150
District                                 361
County                                   117
Price (Thousands)                     143229
Freehold Tenure                            2
Leasehold Tenure                           2
Old Build                                  2
New Build                                  2
Detached                                   2
Flat                                       2
Other Property Type                        2
Semi-detached                              2
Terraced                                   2
Year                                       8
Month                                     12
Quarter                                    4
Day of the Week                            7
dtype: int

In [10]:
df_binary['Postcode Area'] = df_binary['Postcode'].str.extract(r'^([A-Z]+)')
df_binary['Postcode District'] = df_binary['Postcode'].str.extract(r'^([A-Z]+[0-9]+)')
df_binary.head()

Unnamed: 0,Unnamed: 1,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,...,Flat,Other Property Type,Semi-detached,Terraced,Year,Month,Quarter,Day of the Week,Postcode Area,Postcode District
df2018,0,2018-09-24,M6 8GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,...,0,0,0,0,2018,9,3,0,M,M6
df2018,1,2018-09-28,WA3 2UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,...,0,0,0,0,2018,9,3,4,WA,WA3
df2018,2,2018-08-29,OL6 6RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,...,0,0,1,0,2018,8,3,2,OL,OL6
df2018,3,2018-06-15,M46 0TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,...,0,0,1,0,2018,6,2,4,M,M46
df2018,4,2018-09-28,M28 3XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,...,0,0,0,0,2018,9,3,4,M,M28


In [11]:
df_avg_pc = df_binary.groupby('Postcode Area')['Price (Thousands)'].mean().reset_index()
df_avg_pc.columns = ['Postcode Area', 'Area Code Average Price (Thousands)']
df_avg_county = df_binary.groupby('County')['Price (Thousands)'].mean().reset_index()
df_avg_county.columns = ['County', 'County Average Price (Thousands)']
df_avg_tc = df_binary.groupby('Town/City')['Price (Thousands)'].mean().reset_index()
df_avg_tc.columns = ['Town/City', 'Town/City Average Price (Thousands)']
df_avg_dist = df_binary.groupby('District')['Price (Thousands)'].mean().reset_index()
df_avg_dist.columns = ['District', 'District Average Price (Thousands)']
df_avg_pc.head()

Unnamed: 0,Postcode Area,Area Code Average Price (Thousands)
0,AL,624.364499
1,B,302.164939
2,BA,397.04674
3,BB,186.553792
4,BD,198.704839


In [12]:
df_merge = df_binary.merge(df_avg_pc, on=['Postcode Area'], how='left')
df_merge = df_merge.merge(df_avg_tc, on=['Town/City'], how='left')
df_merge = df_merge.merge(df_avg_dist, on=['District'], how='left')
df_merge = df_merge.merge(df_avg_county, on=['County'], how='left')
df_merge.head()

Unnamed: 0,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,...,Year,Month,Quarter,Day of the Week,Postcode Area,Postcode District,Area Code Average Price (Thousands),Town/City Average Price (Thousands),District Average Price (Thousands),County Average Price (Thousands)
0,2018-09-24,M6 8GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,...,2018,9,3,0,M,M6,286.373466,261.29226,253.098761,262.010594
1,2018-09-28,WA3 2UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,...,2018,9,3,4,WA,WA3,301.647233,289.308785,193.246849,262.010594
2,2018-08-29,OL6 6RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,...,2018,8,3,2,OL,OL6,197.639388,183.803044,196.658531,262.010594
3,2018-06-15,M46 0TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,...,2018,6,2,4,M,M46,286.373466,285.381745,193.246849,262.010594
4,2018-09-28,M28 3XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,...,2018,9,3,4,M,M28,286.373466,285.381745,253.098761,262.010594


In [14]:
loc_df = pd.read_csv('ONSPD_Online_Latest_Centroids.csv')
loc_df.head()

  loc_df = pd.read_csv('ONSPD_Online_Latest_Centroids.csv')


Unnamed: 0,X,Y,OBJECTID,PCD,PCD2,PCDS,DOINTR,DOTERM,OSCTY,CED,...,LEP2,PFA,IMD,CALNCV,ICB,OA21,LSOA21,MSOA21,RUC21IND,GlobalID
0,385386.0,801193.0,1,AB1 0AA,AB1 0AA,AB1 0AA,198001,199606.0,S99999999,S99999999,...,,S23000009,6715,S99999999,S99999999,S00137176,S01013490,S02002516,,b2d74491-a06c-4dd0-be0d-3f3fd5997475
1,385177.0,801314.0,2,AB1 0AB,AB1 0AB,AB1 0AB,198001,199606.0,S99999999,S99999999,...,,S23000009,6715,S99999999,S99999999,S00137176,S01013490,S02002516,,83e9e27a-724a-4020-9cf0-d87eff5872e8
2,385053.0,801092.0,3,AB1 0AD,AB1 0AD,AB1 0AD,198001,199606.0,S99999999,S99999999,...,,S23000009,6715,S99999999,S99999999,S00137176,S01013490,S02002516,,994eb987-1286-4c7f-b16b-87d03f6b0dec
3,384600.0,799300.0,4,AB1 0AE,AB1 0AE,AB1 0AE,199402,199606.0,S99999999,S99999999,...,,S23000009,5069,S99999999,S99999999,S00138891,S01013856,S02002577,,fd4a2772-2414-4d69-ba99-311e90df23d5
4,384460.0,800660.0,5,AB1 0AF,AB1 0AF,AB1 0AF,199012,199207.0,S99999999,S99999999,...,,S23000009,6253,S99999999,S99999999,S00137241,S01013487,S02002515,,270fcb01-bcda-42fc-8c38-6fdea1c3d1a3


In [15]:
loc_df.columns

Index(['X', 'Y', 'OBJECTID', 'PCD', 'PCD2', 'PCDS', 'DOINTR', 'DOTERM',
       'OSCTY', 'CED', 'OSLAUA', 'OSWARD', 'PARISH', 'USERTYPE', 'OSEAST1M',
       'OSNRTH1M', 'OSGRDIND', 'OSHLTHAU', 'NHSER', 'CTRY', 'RGN', 'STREG',
       'PCON', 'EER', 'TECLEC', 'TTWA', 'PCT', 'ITL', 'STATSWARD', 'OA01',
       'CASWARD', 'NPARK', 'LSOA01', 'MSOA01', 'UR01IND', 'OAC01', 'OA11',
       'LSOA11', 'MSOA11', 'WZ11', 'SICBL', 'BUA24', 'RU11IND', 'OAC11', 'LAT',
       'LONG', 'LEP1', 'LEP2', 'PFA', 'IMD', 'CALNCV', 'ICB', 'OA21', 'LSOA21',
       'MSOA21', 'RUC21IND', 'GlobalID'],
      dtype='object')

In [16]:
loc_df_drop = loc_df[['LAT','LONG','PCDS','OSEAST1M','OSNRTH1M']]
loc_df_drop.rename(columns={'PCDS':'Postcode'}, inplace=True)
loc_df_drop.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loc_df_drop.rename(columns={'PCDS':'Postcode'}, inplace=True)


Unnamed: 0,LAT,LONG,Postcode,OSEAST1M,OSNRTH1M
0,57.10146,-2.242858,AB1 0AA,385386.0,801193.0
1,57.10254,-2.246315,AB1 0AB,385177.0,801314.0
2,57.10054,-2.248349,AB1 0AD,385053.0,801092.0
3,57.08443,-2.255714,AB1 0AE,384600.0,799300.0
4,57.09664,-2.258109,AB1 0AF,384460.0,800660.0


In [17]:
loc_house_df = df_merge.merge(loc_df_drop, on=['Postcode'], how='left')
loc_house_df.head()

Unnamed: 0,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,...,Postcode Area,Postcode District,Area Code Average Price (Thousands),Town/City Average Price (Thousands),District Average Price (Thousands),County Average Price (Thousands),LAT,LONG,OSEAST1M,OSNRTH1M
0,2018-09-24,M6 8GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,...,M,M6,286.373466,261.29226,253.098761,262.010594,53.49295,-2.311376,379439.0,399626.0
1,2018-09-28,WA3 2UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,...,WA,WA3,301.647233,289.308785,193.246849,262.010594,53.46955,-2.583977,361332.0,397136.0
2,2018-08-29,OL6 6RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,...,OL,OL6,197.639388,183.803044,196.658531,262.010594,53.48984,-2.074338,395165.0,399237.0
3,2018-06-15,M46 0TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,...,M,M46,286.373466,285.381745,193.246849,262.010594,53.52589,-2.495181,367270.0,403360.0
4,2018-09-28,M28 3XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,...,M,M28,286.373466,285.381745,253.098761,262.010594,53.51677,-2.411523,372810.0,402310.0


In [18]:
loc_house_df.isna().sum()

Transfer Date                                0
Postcode                                     0
primary_addressable_object_name              0
secondary_addressable_object_name      6246213
street                                  125959
Town/City                                    0
District                                     0
County                                       0
Price (Thousands)                            0
Freehold Tenure                              0
Leasehold Tenure                             0
Old Build                                    0
New Build                                    0
Detached                                     0
Flat                                         0
Other Property Type                          0
Semi-detached                                0
Terraced                                     0
Year                                         0
Month                                        0
Quarter                                      0
Day of the We

In [19]:
print(loc_house_df['LONG'].min())
print(loc_house_df['LAT'].min())
print(loc_house_df['LONG'].max())
print(loc_house_df['LAT'].max())

-6.352711
49.89516
1.760167
55.79741


In [20]:
join_cols = [
    'Area Code Average Price (Thousands)',
    'Town/City Average Price (Thousands)',
    'District Average Price (Thousands)',
    'County Average Price (Thousands)',
    'LAT', 'LONG', 'OSEAST1M', 'OSNRTH1M',
    'Price (Thousands)', 'Freehold Tenure', 'Leasehold Tenure',
    'Old Build', 'New Build', 'Detached', 'Flat', 'Other Property Type',
    'Semi-detached', 'Terraced', 'Year', 'Month', 'Quarter',
    'Day of the Week'
]
loc_house_df = loc_house_df.drop_duplicates(subset=join_cols)

In [21]:
upper = loc_house_df['Price (Thousands)'].quantile(0.95)
df_no_outliers =  loc_house_df[loc_house_df['Price (Thousands)'] <= upper]
df_no_outliers.head()

Unnamed: 0,Transfer Date,Postcode,primary_addressable_object_name,secondary_addressable_object_name,street,Town/City,District,County,Price (Thousands),Freehold Tenure,...,Postcode Area,Postcode District,Area Code Average Price (Thousands),Town/City Average Price (Thousands),District Average Price (Thousands),County Average Price (Thousands),LAT,LONG,OSEAST1M,OSNRTH1M
0,2018-09-24,M6 8GQ,1,,RIVINGTON ROAD,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,...,M,M6,286.373466,261.29226,253.098761,262.010594,53.49295,-2.311376,379439.0,399626.0
1,2018-09-28,WA3 2UE,35,,STONEACRE CLOSE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,...,WA,WA3,301.647233,289.308785,193.246849,262.010594,53.46955,-2.583977,361332.0,397136.0
2,2018-08-29,OL6 6RJ,102,,THORNFIELD GROVE,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,...,OL,OL6,197.639388,183.803044,196.658531,262.010594,53.48984,-2.074338,395165.0,399237.0
3,2018-06-15,M46 0TW,37,,THREADNEEDLE PLACE,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,...,M,M46,286.373466,285.381745,193.246849,262.010594,53.52589,-2.495181,367270.0,403360.0
4,2018-09-28,M28 3XS,9,,MARPLE GARDENS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,...,M,M28,286.373466,285.381745,253.098761,262.010594,53.51677,-2.411523,372810.0,402310.0


In [22]:
df_no_outliers.to_csv('ETL_Final.csv', index=False)