In [3]:
import pandas as pd
years = [18, 19, 20, 21, 22, 23, 24, 25]
headers = [
    "transaction_id",
    "price",
    "date_of_transfer",
    "postcode",
    "property_type",
    "new_build_flag",
    "tenure_type",
    "primary_addressable_object_name",
    "secondary_addressable_object_name",
    "street",
    "locality",
    "town_city",
    "district",
    "county",
    "ppd_category_type",
    "record_status"
]
drop = [
    "transaction_id",
    "primary_addressable_object_name",
    "secondary_addressable_object_name",
    "street",
    "locality",
    "ppd_category_type",
    "record_status"
]
dfs = {}
for year in years:
    url = f"http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-20{year}.csv"
    df = pd.read_csv(url)
    df.columns = headers
    df = df.drop(columns=drop)
    dfs[f"df20{year}"] = df
    print(f"{year} done")
dfs

18 done
19 done
20 done
21 done
22 done


  df = pd.read_csv(url)


23 done
24 done
25 done


{'df2018':           price  date_of_transfer  postcode property_type new_build_flag  \
 0        253500  2018-09-24 00:00    M6 8GQ             D              N   
 1        231950  2018-09-28 00:00   WA3 2UE             D              Y   
 2        112500  2018-08-29 00:00   OL6 6RJ             S              N   
 3        184995  2018-06-15 00:00   M46 0TW             S              Y   
 4        214995  2018-09-28 00:00   M28 3XS             D              Y   
 ...         ...               ...       ...           ...            ...   
 1037248  100000  2018-11-16 00:00   DH4 7HW             T              N   
 1037249  334950  2018-11-30 00:00  NE31 1BF             D              Y   
 1037250  106010  2018-12-12 00:00   NE2 1NY             F              N   
 1037251  144995  2018-07-26 00:00  NE27 0JA             T              Y   
 1037252  334950  2018-07-13 00:00  NE15 9BW             D              Y   
 
         tenure_type            town_city             district  

In [4]:
df_full = pd.concat(dfs)

In [5]:
df_full['Price (Thousands)'] = df_full['price'] / 1000
df_full.drop(columns=['price'], inplace=True)
df_full.describe()

Unnamed: 0,Price (Thousands)
count,7219742.0
mean,381.8475
std,1711.452
min,0.001
25%,165.0
50%,260.0
75%,400.0
max,900000.0


In [6]:
df_full.isna().sum()

date_of_transfer         0
postcode             24503
property_type            0
new_build_flag           0
tenure_type              0
town_city                0
district                 0
county                   0
Price (Thousands)        0
dtype: int64

In [22]:
df_clean = df_full.dropna(axis=1)
df_clean.describe()

Unnamed: 0,Price (Thousands)
count,7219742.0
mean,381.8475
std,1711.452
min,0.001
25%,165.0
50%,260.0
75%,400.0
max,900000.0


In [23]:
df_binary = pd.get_dummies(df_clean, dtype=int, columns=['tenure_type','new_build_flag', 'property_type'])
df_binary.rename(columns={'date_of_transfer':'Transfer Date','town_city':'Town/City','district':'District','county':'County','tenure_type_F':'Freehold Tenure','tenure_type_L':'Leasehold Tenure','new_build_flag_N':'Old Build','new_build_flag_Y':'New Build','property_type_D':'Detached', 'property_type_F':'Flat', 'property_type_O':'Other Property Type','property_type_S':'Semi-detached','property_type_T':'Terraced'}, inplace=True)
df_binary.head()

Unnamed: 0,Unnamed: 1,Transfer Date,Town/City,District,County,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced
df2018,0,2018-09-24 00:00,SALFORD,SALFORD,GREATER MANCHESTER,253.5,1,0,1,0,1,0,0,0,0
df2018,1,2018-09-28 00:00,WARRINGTON,WIGAN,GREATER MANCHESTER,231.95,1,0,0,1,1,0,0,0,0
df2018,2,2018-08-29 00:00,ASHTON-UNDER-LYNE,TAMESIDE,GREATER MANCHESTER,112.5,1,0,1,0,0,0,0,1,0
df2018,3,2018-06-15 00:00,MANCHESTER,WIGAN,GREATER MANCHESTER,184.995,1,0,0,1,0,0,0,1,0
df2018,4,2018-09-28 00:00,MANCHESTER,SALFORD,GREATER MANCHESTER,214.995,0,1,0,1,1,0,0,0,0


In [24]:
df_binary['Transfer Date'] = df_binary['Transfer Date'].astype(str).str[:10]
df_binary.tail()

Unnamed: 0,Unnamed: 1,Transfer Date,Town/City,District,County,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced
df2025,252142,2025-01-17,NORTHAMPTON,WEST NORTHAMPTONSHIRE,WEST NORTHAMPTONSHIRE,98.0,0,1,1,0,0,1,0,0,0
df2025,252143,2025-01-24,KETTERING,NORTH NORTHAMPTONSHIRE,NORTH NORTHAMPTONSHIRE,120.0,0,1,1,0,0,1,0,0,0
df2025,252144,2025-01-14,RUSHDEN,NORTH NORTHAMPTONSHIRE,NORTH NORTHAMPTONSHIRE,604.5,1,0,1,0,0,0,1,0,0
df2025,252145,2025-01-10,NORTHAMPTON,WEST NORTHAMPTONSHIRE,WEST NORTHAMPTONSHIRE,133.65,0,1,1,0,0,1,0,0,0
df2025,252146,2025-01-27,KETTERING,NORTH NORTHAMPTONSHIRE,NORTH NORTHAMPTONSHIRE,332.0,1,0,1,0,1,0,0,0,0


In [25]:
lower = df_binary['Price (Thousands)'].quantile(0.005)
upper = df_binary['Price (Thousands)'].quantile(0.995)
df_binary = df_binary[(df_binary['Price (Thousands)'] >= lower) & (df_binary['Price (Thousands)'] <= upper)]
df_binary.describe()

Unnamed: 0,Price (Thousands),Freehold Tenure,Leasehold Tenure,Old Build,New Build,Detached,Flat,Other Property Type,Semi-detached,Terraced
count,7147613.0,7147613.0,7147613.0,7147613.0,7147613.0,7147613.0,7147613.0,7147613.0,7147613.0,7147613.0
mean,329.873,0.7680538,0.2319462,0.8883434,0.1116566,0.2332777,0.1775309,0.04977704,0.2673146,0.2720998
std,280.9675,0.4220749,0.4220749,0.3149435,0.3149435,0.4229176,0.3821174,0.217484,0.4425579,0.445041
min,12.895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,165.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,260.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,399.95,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,3075.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
df_binary.to_csv('ETL_Final.csv', index=False)