In [1]:
import pandas as pd
years = [18, 19, 20, 21, 22, 23, 24, 25]
headers = [
    "transaction_id",
    "price",
    "date_of_transfer",
    "postcode",
    "property_type",
    "new_build_flag",
    "tenure_type",
    "primary_addressable_object_name",
    "secondary_addressable_object_name",
    "street",
    "locality",
    "town_city",
    "district",
    "county",
    "ppd_category_type",
    "record_status"
]
drop = [
    "transaction_id",
    "primary_addressable_object_name",
    "secondary_addressable_object_name",
    "street",
    "locality",
    "ppd_category_type",
    "record_status"
]
dfs = {}
for year in years:
    url = f"http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-20{year}.csv"
    df = pd.read_csv(url)
    df.columns = headers
    df = df.drop(columns=drop)
    dfs[f"df20{year}"] = df
    print(f"{year} done")
dfs

18 done
19 done
20 done
21 done
22 done


  df = pd.read_csv(url)


23 done
24 done
25 done


{'df2018':           price  date_of_transfer  postcode property_type new_build_flag  \
 0        253500  2018-09-24 00:00    M6 8GQ             D              N   
 1        231950  2018-09-28 00:00   WA3 2UE             D              Y   
 2        112500  2018-08-29 00:00   OL6 6RJ             S              N   
 3        184995  2018-06-15 00:00   M46 0TW             S              Y   
 4        214995  2018-09-28 00:00   M28 3XS             D              Y   
 ...         ...               ...       ...           ...            ...   
 1037248  100000  2018-11-16 00:00   DH4 7HW             T              N   
 1037249  334950  2018-11-30 00:00  NE31 1BF             D              Y   
 1037250  106010  2018-12-12 00:00   NE2 1NY             F              N   
 1037251  144995  2018-07-26 00:00  NE27 0JA             T              Y   
 1037252  334950  2018-07-13 00:00  NE15 9BW             D              Y   
 
         tenure_type            town_city             district  

In [2]:
df_full = pd.concat(dfs)

In [3]:
df_full['Price (Thousands)'] = df_full['price'] / 1000
df_full.drop(columns=['price'], inplace=True)
df_full.describe()

Unnamed: 0,Price (Thousands)
count,7219742.0
mean,381.8475
std,1711.452
min,0.001
25%,165.0
50%,260.0
75%,400.0
max,900000.0


In [4]:
df_full.isna().sum()

date_of_transfer         0
postcode             24503
property_type            0
new_build_flag           0
tenure_type              0
town_city                0
district                 0
county                   0
Price (Thousands)        0
dtype: int64

In [5]:
df_clean = df_full.dropna(axis=0)
df_clean.describe()

Unnamed: 0,Price (Thousands)
count,7195239.0
mean,379.4882
std,1660.432
min,0.001
25%,165.0
50%,260.0
75%,400.0
max,900000.0


In [6]:
df_clean.head()

Unnamed: 0,Unnamed: 1,date_of_transfer,postcode,property_type,new_build_flag,tenure_type,town_city,district,county,Price (Thousands)
df2018,0,2018-09-24 00:00,M6 8GQ,D,N,F,SALFORD,SALFORD,GREATER MANCHESTER,253.5
df2018,1,2018-09-28 00:00,WA3 2UE,D,Y,F,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95
df2018,2,2018-08-29 00:00,OL6 6RJ,S,N,F,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5
df2018,3,2018-06-15 00:00,M46 0TW,S,Y,F,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995
df2018,4,2018-09-28 00:00,M28 3XS,D,Y,L,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995


In [7]:
df_binary = pd.get_dummies(df_clean, dtype=int, columns=['tenure_type','new_build_flag', 'property_type'])
df_binary.rename(columns={'date_of_transfer':'Transfer Date','town_city':'Town/City','district':'District','county':'County','tenure_type_F':'Freehold Tenure','tenure_type_L':'Leasehold Tenure','new_build_flag_N':'Old Build','new_build_flag_Y':'New Build','property_type_D':'Detached', 'property_type_F':'Flat', 'property_type_O':'Other Property Type','property_type_S':'Semi-detached','property_type_T':'Terraced', 'postcode':'Postcode'}, inplace=True)
df_binary.head()

Unnamed: 0,Unnamed: 1,Transfer Date,Postcode,Town/City,District,County,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced
df2018,0,2018-09-24 00:00,M6 8GQ,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,0,1,0,1,0,0,0,0
df2018,1,2018-09-28 00:00,WA3 2UE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,0,0,1,1,0,0,0,0
df2018,2,2018-08-29 00:00,OL6 6RJ,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,0,1,0,0,0,0,1,0
df2018,3,2018-06-15 00:00,M46 0TW,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,0,0,1,0,0,0,1,0
df2018,4,2018-09-28 00:00,M28 3XS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,1,0,1,1,0,0,0,0


In [8]:
df_binary['Transfer Date'] = pd.to_datetime(df_binary['Transfer Date'], errors='coerce')
df_binary['Year'] = df_binary['Transfer Date'].dt.year
df_binary['Month'] = df_binary['Transfer Date'].dt.month
df_binary['Quarter'] = df_binary['Transfer Date'].dt.quarter
df_binary['Day of the Week'] = df_binary['Transfer Date'].dt.dayofweek
df_binary['Transfer Date'] = df_binary['Transfer Date'].astype(str).str[:10]
df_binary.tail()

Unnamed: 0,Unnamed: 1,Transfer Date,Postcode,Town/City,District,County,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced,Year,Month,Quarter,Day of the Week
df2025,252142,2025-01-17,NN4 8SA,NORTHAMPTON,WEST NORTHAMPTONSHIRE,WEST NORTHAMPTONSHIRE,98.0,0,1,1,0,0,1,0,0,0,2025,1,1,4
df2025,252143,2025-01-24,NN16 0SE,KETTERING,NORTH NORTHAMPTONSHIRE,NORTH NORTHAMPTONSHIRE,120.0,0,1,1,0,0,1,0,0,0,2025,1,1,4
df2025,252144,2025-01-14,NN10 0NB,RUSHDEN,NORTH NORTHAMPTONSHIRE,NORTH NORTHAMPTONSHIRE,604.5,1,0,1,0,0,0,1,0,0,2025,1,1,1
df2025,252145,2025-01-10,NN4 8TE,NORTHAMPTON,WEST NORTHAMPTONSHIRE,WEST NORTHAMPTONSHIRE,133.65,0,1,1,0,0,1,0,0,0,2025,1,1,4
df2025,252146,2025-01-27,NN15 5YL,KETTERING,NORTH NORTHAMPTONSHIRE,NORTH NORTHAMPTONSHIRE,332.0,1,0,1,0,1,0,0,0,0,2025,1,1,0


In [9]:
df_binary.shape

(7195239, 19)

In [10]:
print(df_binary.dtypes)
print(df_binary.nunique())

Transfer Date           object
Postcode                object
Town/City               object
District                object
County                  object
Price (Thousands)      float64
Freehold Tenure          int64
Leasehold Tenure         int64
Old Build                int64
New Build                int64
Detached                 int64
Flat                     int64
Other Property Type      int64
Semi-detached            int64
Terraced                 int64
Year                     int32
Month                    int32
Quarter                  int32
Day of the Week          int32
dtype: object
Transfer Date             2731
Postcode               1162297
Town/City                 1150
District                   361
County                     117
Price (Thousands)       143229
Freehold Tenure              2
Leasehold Tenure             2
Old Build                    2
New Build                    2
Detached                     2
Flat                         2
Other Property Type      

In [11]:
df_binary['Postcode Area'] = df_binary['Postcode'].str.extract(r'^([A-Z]+)')
df_binary['Postcode District'] = df_binary['Postcode'].str.extract(r'^([A-Z]+[0-9]+)')
df_binary.head()

Unnamed: 0,Unnamed: 1,Transfer Date,Postcode,Town/City,District,County,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,...,Flat,Other Property Type,Semi-detached,Terraced,Year,Month,Quarter,Day of the Week,Postcode Area,Postcode District
df2018,0,2018-09-24,M6 8GQ,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,0,1,0,...,0,0,0,0,2018,9,3,0,M,M6
df2018,1,2018-09-28,WA3 2UE,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,0,0,1,...,0,0,0,0,2018,9,3,4,WA,WA3
df2018,2,2018-08-29,OL6 6RJ,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,0,1,0,...,0,0,1,0,2018,8,3,2,OL,OL6
df2018,3,2018-06-15,M46 0TW,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,0,0,1,...,0,0,1,0,2018,6,2,4,M,M46
df2018,4,2018-09-28,M28 3XS,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,1,0,1,...,0,0,0,0,2018,9,3,4,M,M28


In [12]:
df_binary.to_csv('Pre_Outlier_removal.csv', index=False)

In [13]:
lower = df_binary['Price (Thousands)'].quantile(0.005)
upper = df_binary['Price (Thousands)'].quantile(0.995)
df_no_outliers = df_binary[(df_binary['Price (Thousands)'] >= lower) & (df_binary['Price (Thousands)'] <= upper)]
df_no_outliers.describe()

Unnamed: 0,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced,Year,Month,Quarter,Day of the Week
count,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0,7124208.0
mean,329.8715,0.7683223,0.2316777,0.8882258,0.1117742,0.2339497,0.1779576,0.047002,0.2681612,0.2729296,2021.035,6.576128,2.504003,2.694955
std,278.8689,0.4219042,0.4219042,0.3150885,0.3150885,0.4233406,0.3824771,0.2116431,0.443002,0.445465,2.044985,3.433242,1.124971,1.466922
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,1.0,1.0,0.0
25%,165.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2019.0,3.0,1.0,2.0
50%,260.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,7.0,3.0,3.0
75%,399.995,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2023.0,10.0,4.0,4.0
max,3000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2025.0,12.0,4.0,6.0


In [14]:
df_no_outliers.to_csv('ETL_Final.csv', index=False)