This notebook cleans the 8 macroeconomic variables and the target variable (HPI) used in the analysis and aligns them to a monthly frequency. The source datasets that were cleaned are available at the links below:
1. Mortgage Approval: https://drive.google.com/file/d/1L6FKrkZi8020BkHmAFsN3majv0wxfb3q/view?usp=sharing
2. Mortgage Rate: https://drive.google.com/file/d/1PEX9OYMLA29D1B8qRhAe7__oL3_OJ_yd/view?usp=sharing
3. Bank Rate: https://drive.google.com/file/d/1MyxQ4UiUWSmYr8ylsjyE7nPCzDcf28tg/view?usp=sharing
4. Consumer Confidence: https://drive.google.com/file/d/1j-wjcWwbucjorkWbSsdidnm16-vld8jb/view?usp=sharing
5. Construction Cost Index: https://docs.google.com/spreadsheets/d/1vcrJm24YzjtiBmNYxipByEaI-LMGgRps/edit?usp=sharing&ouid=111965315671490167461&rtpof=true&sd=true
6. Unemployment Rate: https://drive.google.com/file/d/16vheZDWYaNo3jbOkdE1CAwk9ekptruC8/view?usp=sharing
7. Inflation Rate: https://drive.google.com/file/d/1OGUo21bkeCLyr4NbtSCxxtPfiKr-lUe2/view?usp=sharing
8. Average Earnings: https://docs.google.com/spreadsheets/d/1bLGHyGJfkOvod1nnaY0rFIlTq9_JN6kJ/edit?usp=sharing&ouid=111965315671490167461&rtpof=true&sd=true
9. HPI: https://drive.google.com/file/d/1_AiqVg0Ay5qASLiWXuWLuvllPm3KogpR/view?usp=sharing

Macroeconomic Variables

1. Mortgage Approval

In [None]:
import pandas as pd
from google.colab import files

# Loading the mortgage approval file (relative path, so it is read from the
# notebook's current working directory)
in_path = "Mortgage Approval CLEANED.csv"
ma_raw = pd.read_csv(in_path)


# Building a monthly Date column
# Work on a copy so that ma_raw keeps the file exactly as it was loaded.
ma = ma_raw.copy()

def to_month_start(dt_series: pd.Series) -> pd.Series:
    """Parse date-like values and snap each one to the first day of its month.

    Args:
        dt_series: Series of date strings or datetimes.

    Returns:
        A datetime Series holding month-start timestamps. Values that cannot be
        parsed come back as NaT (parsing uses errors="coerce") instead of raising.
    """
    parsed_dates = pd.to_datetime(dt_series, errors="coerce")
    monthly_periods = parsed_dates.dt.to_period("M")
    return monthly_periods.dt.to_timestamp(how="start")

# Derive ma["Date"] (month-start timestamps) from whichever date columns exist.
if {"Year", "Month"}.issubset(ma.columns):
    # Separate Year and Month columns: join them into "<year>-<month>-01".
    year_text = ma["Year"].astype(str).str.strip()
    month_text = ma["Month"].astype(str).str.strip()
    ma["Date"] = to_month_start(
        pd.to_datetime(year_text + "-" + month_text + "-01", errors="coerce")
    )

elif "Year" in ma.columns or "Date" in ma.columns:
    # A single text column holds the period; "Year" takes precedence over "Date".
    source_col = "Year" if "Year" in ma.columns else "Date"
    date_text = ma[source_col].astype(str).str.strip()

    # Try "2005 Jan", then "Jan 2005", then pandas' own format inference.
    # A later attempt is made only when the previous one parsed nothing at all.
    for date_format in ("%Y %b", "%b %Y", None):
        parsed = pd.to_datetime(date_text, format=date_format, errors="coerce")
        if not parsed.isna().all():
            break

    ma["Date"] = to_month_start(parsed)

else:
    raise ValueError("I couldn't find Year/Month/Date columns to construct a monthly Date.")

# Finding the approvals value column and cleaning it
# Any header containing "approval" (case-insensitive) qualifies, except the date columns.
candidates = [
    col for col in ma.columns
    if "approval" in col.lower() and col not in ("Date", "Month", "Year")
]

if not candidates:
    raise ValueError(
        "Couldn't find an approvals column. "
        "Rename the value column to include 'approval' or 'approvals', then rerun."
    )

value_col = candidates[0]

# Strip thousands separators, then keep the first number found in each cell.
without_commas = ma[value_col].astype(str).str.replace(",", "", regex=False)
ma[value_col] = without_commas.str.extract(r"([-+]?\d*\.?\d+)")[0].astype(float)

# Keeping Date + MortgageApprovals only (one row per month, last row wins)
dated_rows = ma[["Date", value_col]].dropna(subset=["Date"])
dated_rows = dated_rows.rename(columns={value_col: "MortgageApprovals"}).sort_values("Date")
ma_monthly = dated_rows.groupby("Date", as_index=False).agg({"MortgageApprovals": "last"})

# Filtering for Jan 2005 — Jun 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

in_window = ma_monthly["Date"].between(start, end)
ma_monthly = ma_monthly.loc[in_window].reset_index(drop=True)

print(f"Mortgage approvals cleaned: {len(ma_monthly)} monthly rows "
      f"({ma_monthly['Date'].min().date()} → {ma_monthly['Date'].max().date()})")
display(ma_monthly.head())

# Downloading the dataset
out_path = "MortgageApprovals_monthly_cleaned_2005_2025.csv"
ma_monthly.to_csv(out_path, index=False)
files.download(out_path)

Mortgage approvals cleaned: 246 monthly rows (2005-01-01 → 2025-06-01)


  parsed = pd.to_datetime(s, errors="coerce")


Unnamed: 0,Date,MortgageApprovals
0,2005-01-01,81665.0
1,2005-02-01,86267.0
2,2005-03-01,91692.0
3,2005-04-01,98253.0
4,2005-05-01,94811.0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

2. Mortgage Rate

In [None]:
# Loading the mortgage rate file

in_path = "Quoted household interest rate on 2-year fixed mortgages CLEANED.csv"
mort_raw = pd.read_csv(in_path)

mort = mort_raw.copy()

# Building a Date column
if "Date" in mort.columns:
    # Already has a date column (often daily)
    mort["Date"] = pd.to_datetime(mort["Date"], dayfirst=True, errors="coerce")

elif {"Year", "Month"}.issubset(mort.columns):
    # Separate Year / Month columns
    mort["Date"] = pd.to_datetime(
        mort["Year"].astype(str) + "-" + mort["Month"].astype(str) + "-01",
        errors="coerce"
    )
    # Year/Month are now encoded in Date. Left in place they would be counted as
    # value columns and trip the "exactly one value column" check below.
    mort = mort.drop(columns=["Year", "Month"])

else:
    raise ValueError(
        "Could not identify a usable date column. "
        "Expected either 'Date' or ('Year', 'Month')."
    )

# asfreq() reindexes onto a daily range and rejects duplicate index labels, so
# rows whose date failed to parse (NaT) and repeated dates must be removed first.
# The stable sort keeps file order among equal dates, so keep="last" means
# "the last row in the file for that date".
mort_valid = (
    mort
    .dropna(subset=["Date"])
    .sort_values("Date", kind="mergesort")
    .drop_duplicates(subset="Date", keep="last")
)

# Resampling to monthly (month start): fill every calendar day with the most
# recent quote, then take the value standing on the last day of each month.
mort_monthly = (
    mort_valid
    .set_index("Date")
    .asfreq("D")
    .ffill()
    .resample("MS")
    .last()
    .reset_index()
)

# Renaming the rate column
value_cols = [c for c in mort_monthly.columns if c != "Date"]

if len(value_cols) != 1:
    raise ValueError(
        f"Expected exactly one value column, found {len(value_cols)}: {value_cols}"
    )

mort_monthly = mort_monthly.rename(columns={value_cols[0]: "MortgageRate"})

# Filtering for Jan 2005 — Jun 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

mort_monthly = mort_monthly.loc[
    (mort_monthly["Date"] >= start) &
    (mort_monthly["Date"] <= end)
].reset_index(drop=True)

print(
    f"Mortgage rate series prepared: {len(mort_monthly)} monthly observations "
    f"({mort_monthly['Date'].min().date()} → {mort_monthly['Date'].max().date()})"
)
display(mort_monthly.head())

# Downloading
out_path = "MortgageRate_monthly_cleaned_2005_2025.csv"
mort_monthly.to_csv(out_path, index=False)
files.download(out_path)


Mortgage rate series prepared: 246 monthly observations (2005-01-01 → 2025-06-01)


  mort["Date"] = pd.to_datetime(mort["Date"], dayfirst=True, errors="coerce")


Unnamed: 0,Date,MortgageRate
0,2005-01-01,4.99
1,2005-02-01,5.07
2,2005-03-01,5.2
3,2005-04-01,5.16
4,2005-05-01,4.91


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

3. Bank Rate

In [None]:
# Loading the Bank Rate file

in_path = "Bank Rate CLEANED.csv"
br_raw = pd.read_csv(in_path)

br = br_raw.copy()

# Analysis window (month-start timestamps): Jan 2005 — Jun 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

# Building a Date column from the separate Day / Month / Year columns
br["Date"] = pd.to_datetime(
    br["Day"].astype(str).str.strip() + " " +
    br["Month"].astype(str).str.strip() + " " +
    br["Year"].astype(str).str.strip(),
    dayfirst=True,
    errors="coerce"
)


# Keeping only Date + bank rate columns
rate_col = "Bank rate"
if rate_col not in br.columns:
    raise ValueError(f"Couldn't find '{rate_col}' in the file. Columns are: {br.columns.tolist()}")

# Reindexing below requires unique dates. The stable sort keeps file order among
# equal dates, so keep="last" retains the last row in the file for a repeated date.
br = (
    br[["Date", rate_col]]
    .dropna(subset=["Date"])
    .sort_values("Date", kind="mergesort")
    .drop_duplicates(subset="Date", keep="last")
    .reset_index(drop=True)
)

if br.empty:
    raise ValueError("No parsable dates found in the Bank Rate file.")

# Cleaning the bank rate column: drop "%" and thousands separators, then keep
# the first number found in each cell.
br[rate_col] = (
    br[rate_col].astype(str)
    .str.replace("%", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.extract(r"([-+]?\d*\.?\d+)")[0]
    .astype(float)
)

# Resampling to monthly (month start)
# The daily grid runs to the later of the last date in the file and the last day
# of the window's final month. Without that extension, months after the last
# recorded date would be absent from the output instead of carrying the latest rate.
# NOTE(review): this assumes each row's rate stays in force until the next row,
# the same assumption the forward-fill already makes.
window_close = end + pd.offsets.MonthEnd(0)
daily_index = pd.date_range(
    br["Date"].min(),
    max(br["Date"].max(), window_close),
    freq="D",
    name="Date",
)

br_monthly = (
    br.set_index("Date")
      .reindex(daily_index)
      .ffill()
      .resample("MS")
      .last()
      .reset_index()
      .rename(columns={rate_col: "BankRate"})
)

# Filtering for Jan 2005 — Jun 2025
br_monthly = br_monthly.loc[
    (br_monthly["Date"] >= start) & (br_monthly["Date"] <= end)
].reset_index(drop=True)

print(
    f"Bank Rate series prepared: {len(br_monthly)} monthly rows "
    f"({br_monthly['Date'].min().date()} → {br_monthly['Date'].max().date()})"
)
display(br_monthly.head(12))
display(br_monthly.tail(12))

# Downloading
out_path = "BankRate_monthly_cleaned_2005_2025.csv"
br_monthly.to_csv(out_path, index=False)
files.download(out_path)


Bank Rate series prepared: 246 monthly rows (2005-01-01 → 2025-06-01)


Unnamed: 0,Date,BankRate
0,2005-01-01,4.75
1,2005-02-01,4.75
2,2005-03-01,4.75
3,2005-04-01,4.75
4,2005-05-01,4.75
5,2005-06-01,4.75
6,2005-07-01,4.75
7,2005-08-01,4.5
8,2005-09-01,4.5
9,2005-10-01,4.5


Unnamed: 0,Date,BankRate
234,2024-07-01,5.25
235,2024-08-01,5.0
236,2024-09-01,5.0
237,2024-10-01,5.0
238,2024-11-01,4.75
239,2024-12-01,4.75
240,2025-01-01,4.75
241,2025-02-01,4.5
242,2025-03-01,4.5
243,2025-04-01,4.5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

4. Consumer Confidence

In [None]:
# Loading the Consumer Confidence file

in_path = "Consumer confidence CLEANED.csv"
raw = pd.read_csv(in_path)
cc = raw.copy()

# Finding the date column and parsing it
# Headers are compared lower-cased and stripped against this set of known names.
date_names = {"time_period", "time period", "date", "month", "period", "time"}

# First column with a recognised date header; if none match, fall back to the
# first column of the file.
date_col = next(
    (col for col in cc.columns if col.lower().strip() in date_names),
    None,
)
if date_col is None:
    date_col = cc.columns[0]

cc["Date"] = pd.to_datetime(cc[date_col], errors="coerce", dayfirst=True)

# Find the best value column (most usable numeric values)
# Candidates are every column that is neither the date column nor date-like by name.
ignore_names = date_names | {"year"}

candidate_cols = []
for col in cc.columns:
    if col == date_col:
        continue
    if col.lower().strip() in ignore_names:
        continue
    candidate_cols.append(col)

if len(candidate_cols) == 0:
    raise ValueError(f"No candidate value columns found. Columns are: {cc.columns.tolist()}")

def clean_numeric(series: pd.Series) -> pd.Series:
    """Turn messy numeric strings into numbers (NaN where nothing parses).

    Each value is rendered as text, "%" and "," are removed, and the first
    number found in the text is converted.

    The pattern includes an optional exponent. Values are stringified first, so
    a small or large float in an already-numeric column is rendered in
    scientific notation (e.g. "1e-05"); without the exponent part only the
    mantissa would be captured and 1e-05 would come back as 1.0.

    Args:
        series: Column to clean; any dtype.

    Returns:
        A numeric Series aligned with the input, NaN where no number is found.
    """
    s = (
        series.astype(str)
        .str.replace("%", "", regex=False)
        .str.replace(",", "", regex=False)
        .str.extract(r"([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")[0]
    )
    return pd.to_numeric(s, errors="coerce")

# Pick the candidate column with the most parseable numeric values.
# max() returns the first column reaching the top count, so ties resolve to the
# earliest column in the file.
valid_counts = {c: clean_numeric(cc[c]).notna().sum() for c in candidate_cols}
best_col = max(valid_counts, key=valid_counts.get)
best_count = valid_counts[best_col]

cc["ConsumerConfidence"] = clean_numeric(cc[best_col])

print(f"Using '{best_col}' as the Consumer Confidence series ({best_count} numeric values found).")

# Keeping a monthly series
# Sort on the full parsed date BEFORE collapsing to month-start. Collapsing first
# would erase the day, and "last" would then depend on row order, not on date.
# The stable sort keeps file order among identical dates.
out = (
    cc[["Date", "ConsumerConfidence"]]
    .dropna(subset=["Date"])
    .sort_values("Date", kind="mergesort")
    .assign(Date=lambda frame: frame["Date"].dt.to_period("M").dt.to_timestamp(how="start"))
    .groupby("Date", as_index=False)
    .agg({"ConsumerConfidence": "last"})
)

# Filtering for Jan 2005 — Jun 2025

start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

out = out.loc[(out["Date"] >= start) & (out["Date"] <= end)].reset_index(drop=True)

# Months inside the window with no observation are invisible in the min/max
# summary below, so list them explicitly.
expected_months = pd.date_range(start, end, freq="MS")
missing_months = expected_months.difference(pd.DatetimeIndex(out["Date"]))
if len(missing_months) > 0:
    print(
        f"Note: {len(missing_months)} of {len(expected_months)} months in the window "
        f"have no observation: {[m.strftime('%Y-%m') for m in missing_months]}"
    )

print(
    f"Consumer confidence prepared: {len(out)} monthly rows "
    f"({out['Date'].min().date()} → {out['Date'].max().date()})"
)
display(out.head(12))
display(out.tail(12))

# Downloading
out_path = "ConsumerConfidence_monthly_cleaned_2005_2025.csv"
out.to_csv(out_path, index=False)
files.download(out_path)


Using 'Consumer Confidence' as the Consumer Confidence series (299 numeric values found).
Consumer confidence prepared: 239 monthly rows (2005-01-01 → 2025-06-01)


Unnamed: 0,Date,ConsumerConfidence
0,2005-01-01,0.043352
1,2005-02-01,0.119166
2,2005-03-01,0.097383
3,2005-04-01,-0.108099
4,2005-05-01,-0.129858
5,2005-06-01,-0.097521
6,2005-07-01,0.130154
7,2005-08-01,-0.010832
8,2005-09-01,-0.054166
9,2005-10-01,-0.140909


Unnamed: 0,Date,ConsumerConfidence
227,2024-07-01,0.217988
228,2024-08-01,0.081568
229,2024-09-01,-0.896512
230,2024-10-01,0.027413
231,2024-11-01,0.246648
232,2024-12-01,0.109351
233,2025-01-01,-0.491544
234,2025-02-01,0.356758
235,2025-03-01,-0.027345
236,2025-04-01,-0.410292


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

5. Construction Cost Index

In [None]:
# Loading the Construction Cost Index file
in_path = "Building_Materials_and_Components CLEANED.xlsx"
bm_raw = pd.read_excel(in_path)
bm = bm_raw.copy()

# Building a monthly Date column
# The workbook must supply the period as separate Year and Month columns.
required_cols = {"Year", "Month"}
if not required_cols.issubset(bm.columns):
    raise ValueError("Expected 'Year' and 'Month' columns in the Excel file.")

year_text = bm["Year"].astype(str).str.strip()
month_text = bm["Month"].astype(str).str.strip()
first_of_month = pd.to_datetime(year_text + "-" + month_text + "-01", errors="coerce")
bm["Date"] = first_of_month.dt.to_period("M").dt.to_timestamp(how="start")

# Keep Date first, followed by every column other than the original Year/Month.
value_cols = [c for c in bm.columns if c not in ("Date", "Year", "Month")]
bm = bm[["Date"] + value_cols]

# Making numeric columns numeric: strip thousands separators, keep the first
# number in each cell, and turn anything unparseable into NaN.
for c in value_cols:
    number_text = (
        bm[c].astype(str)
             .str.replace(",", "", regex=False)
             .str.extract(r"([-+]?\d*\.?\d+)")[0]
    )
    bm[c] = pd.to_numeric(number_text, errors="coerce")

# Keeping the last value in each month
bm = bm.sort_values("Date").groupby("Date", as_index=False).last()

# Filtering for Jan 2005 — Jun 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

in_window = bm["Date"].between(start, end)
bm = bm.loc[in_window].reset_index(drop=True)

print(
    f"Construction cost / building materials series prepared: "
    f"{len(bm)} monthly rows ({bm['Date'].min().date()} → {bm['Date'].max().date()})"
)
display(bm.head())
display(bm.tail())

# Downloading
out_path = "BuildingMaterials_monthly_cleaned_2005_2025.csv"
bm.to_csv(out_path, index=False)
files.download(out_path)


Construction cost / building materials series prepared: 241 monthly rows (2005-01-01 → 2025-01-01)


Unnamed: 0,Date,New Housing
0,2005-01-01,70.1
1,2005-02-01,71.1
2,2005-03-01,71.4
3,2005-04-01,71.8
4,2005-05-01,72.2


Unnamed: 0,Date,New Housing
236,2024-09-01,153.3
237,2024-10-01,152.9
238,2024-11-01,153.0
239,2024-12-01,152.9
240,2025-01-01,152.4


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

6. Unemployment Rate

In [None]:
# Loading the unemployment rate file
unemp = pd.read_csv("Unemployment Rate CLEANED.csv")

# Constructing a monthly Date column
if {"Year", "Month"}.issubset(unemp.columns):
    unemp["Date"] = pd.to_datetime(
        unemp["Year"].astype(str) + "-" + unemp["Month"].astype(str) + "-01",
        errors="coerce"
    )
elif "Date" in unemp.columns:
    unemp["Date"] = pd.to_datetime(unemp["Date"], errors="coerce")
else:
    raise ValueError("No Date or Year/Month columns found in unemployment file.")

# Anchor to month-start
unemp["Date"] = unemp["Date"].dt.to_period("M").dt.to_timestamp(how="start")


# Cleaning the unemployment rate column
# First column whose header mentions "unemploy" or "rate" (case-insensitive).
# The default of None lets a missing column produce a readable error instead of
# a bare StopIteration.
rate_col = next(
    (
        c for c in unemp.columns
        if ("unemploy" in c.lower()) or ("rate" in c.lower())
    ),
    None,
)
if rate_col is None:
    raise ValueError(
        f"Couldn't find an unemployment rate column. Columns are: {unemp.columns.tolist()}"
    )

unemp[rate_col] = (
    unemp[rate_col]
      .astype(str)
      .str.replace("%", "", regex=False)
      .str.replace(",", "", regex=False)
      .str.extract(r"([-+]?\d*\.?\d+)")[0]
      .astype(float)
)

# Keeping date and Unemployment Rate (one row per month, last row wins)

unemp_monthly = (
    unemp[["Date", rate_col]]
    .dropna(subset=["Date"])
    .rename(columns={rate_col: "UnemploymentRate"})
    .sort_values("Date")
    .groupby("Date", as_index=False)
    .agg({"UnemploymentRate": "last"})
)


# Filtering for Jan 2005 – Jun 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

unemp_monthly = (
    unemp_monthly[
        (unemp_monthly["Date"] >= start) &
        (unemp_monthly["Date"] <= end)
    ]
    .reset_index(drop=True)
)

# A row count alone does not show which months are absent, so list any month of
# the window that has no observation.
expected_months = pd.date_range(start, end, freq="MS")
missing_months = expected_months.difference(pd.DatetimeIndex(unemp_monthly["Date"]))
if len(missing_months) > 0:
    print(
        f"Note: {len(missing_months)} of {len(expected_months)} months in the window "
        f"have no observation: {[m.strftime('%Y-%m') for m in missing_months]}"
    )

print("Rows:", len(unemp_monthly))
print(unemp_monthly.head())

# Downloading
out_path = "UnemploymentRate_monthly_cleaned_2005_2025.csv"
unemp_monthly.to_csv(out_path, index=False)
files.download(out_path)


Rows: 245
        Date  UnemploymentRate
0 2005-01-01               4.8
1 2005-02-01               4.7
2 2005-03-01               4.7
3 2005-04-01               4.8
4 2005-05-01               4.8


  unemp["Date"] = pd.to_datetime(unemp["Date"], errors="coerce")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

7. Inflation Rate

In [None]:
# Loading cleaned CPI file
cpi = pd.read_csv("series-CPI CLEANED.csv")

# Constructing a monthly Date column
if {"Year", "Month"}.issubset(cpi.columns):
    cpi["Date"] = pd.to_datetime(
        cpi["Year"].astype(str) + "-" + cpi["Month"].astype(str) + "-01",
        errors="coerce"
    )

elif "Year" in cpi.columns:
    # A single text column holds the period. Try "2005 Jan" first, then fill any
    # rows still unparsed from "Jan 2005", then from pandas' format inference.
    # Each fallback runs only while unparsed rows remain, which avoids the
    # needless inferred-format parse (and its warning) when nothing is left to fill.
    s = cpi["Year"].astype(str).str.strip()
    parsed = pd.to_datetime(s, format="%Y %b", errors="coerce")
    for fallback_format in ("%b %Y", None):
        if not parsed.isna().any():
            break
        parsed = parsed.fillna(pd.to_datetime(s, format=fallback_format, errors="coerce"))
    cpi["Date"] = parsed

else:
    raise ValueError("No usable Year/Month columns found to construct dates.")

# Anchor all dates to month-start
cpi["Date"] = cpi["Date"].dt.to_period("M").dt.to_timestamp(how="start")


# Cleaning CPI value column
# Prefer an exact "CPI" header; otherwise take the first header containing "cpi".
# The default of None lets a missing column produce a readable error instead of
# a bare StopIteration.
if "CPI" in cpi.columns:
    value_col = "CPI"
else:
    value_col = next((c for c in cpi.columns if "cpi" in c.lower()), None)
    if value_col is None:
        raise ValueError(
            f"Couldn't find a CPI column. Columns are: {cpi.columns.tolist()}"
        )

cpi[value_col] = (
    cpi[value_col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.extract(r"([-+]?\d*\.?\d+)")[0]
        .astype(float)
)


# Keeping monthly CPI series (one row per month, last row wins)
cpi_monthly = (
    cpi[["Date", value_col]]
        .dropna(subset=["Date"])
        .rename(columns={value_col: "CPI"})
        .sort_values("Date")
        .groupby("Date", as_index=False)
        .agg({"CPI": "last"})
)


# Filtering for Jan 2005- June 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

cpi_monthly = (
    cpi_monthly[
        (cpi_monthly["Date"] >= start) &
        (cpi_monthly["Date"] <= end)
    ]
    .reset_index(drop=True)
)

print("Rows after clipping:", len(cpi_monthly))
print(cpi_monthly.head())

# Downloading
out_path = "CPI_monthly_cleaned_2005_2025.csv"
cpi_monthly.to_csv(out_path, index=False)
files.download(out_path)


Rows after clipping: 246
        Date  CPI
0 2005-01-01  1.7
1 2005-02-01  1.7
2 2005-03-01  2.0
3 2005-04-01  1.9
4 2005-05-01  1.9


  .fillna(pd.to_datetime(s, errors="coerce"))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

8. Average Earnings

In [None]:
!pip -q install xlrd
# Loading the Average Earnings Excel file
file_path = "Average earnings-sept.xls"

required_cols = [
    "Date",
    "Total Weekly Average Earnings",
    "Regular Weekly Average Earnings",
]

# The header row is not always the first row of the sheet, so probe rows 0-4
# until one yields all required column names.
earnings = None
for header_row in range(0, 5):
    temp = pd.read_excel(file_path, sheet_name=0, header=header_row)
    # Normalise the headers on the frame itself. Matching on stripped names while
    # selecting by the raw ones would accept a padded header (e.g. "Date ") and
    # then fail with a KeyError at selection time.
    temp.columns = [str(c).strip() for c in temp.columns]
    if set(required_cols).issubset(temp.columns):
        earnings = temp
        break

if earnings is None:
    # Re-reading with the default header would only repeat the header_row=0
    # attempt that already failed, so report the problem directly.
    raise ValueError(
        f"Couldn't find a header row (rows 0-4) containing {required_cols} in '{file_path}'."
    )

# Keeping only relevant columns
earnings = earnings[required_cols].copy()

# Parsing monthly dates and anchor to month-start
# The format string "%b %y" expects text such as "Jan 05"; anything else becomes NaT.
earnings["Date"] = pd.to_datetime(
    earnings["Date"], format="%b %y", errors="coerce"
)

earnings["Date"] = (
    earnings["Date"]
    .dt.to_period("M")
    .dt.to_timestamp(how="start")
)


# Cleaning the columns (non-numeric cells become NaN)
earnings["Total Weekly Average Earnings"] = pd.to_numeric(
    earnings["Total Weekly Average Earnings"], errors="coerce"
)

earnings["Regular Weekly Average Earnings"] = pd.to_numeric(
    earnings["Regular Weekly Average Earnings"], errors="coerce"
)


# Building monthly dataset (one row per month, last row wins)
ae_monthly = (
    earnings
        .dropna(subset=["Date"])
        .sort_values("Date")
        .groupby("Date", as_index=False)
        .agg({
            "Total Weekly Average Earnings": "last",
            "Regular Weekly Average Earnings": "last",
        })
        .rename(columns={
            "Total Weekly Average Earnings": "TotalWeeklyEarnings",
            "Regular Weekly Average Earnings": "RegularWeeklyEarnings",
        })
)

# Filtering for Jan 2005- June 2025
start = pd.Timestamp("2005-01-01")
end   = pd.Timestamp("2025-06-01")

ae_monthly = (
    ae_monthly[
        (ae_monthly["Date"] >= start) &
        (ae_monthly["Date"] <= end)
    ]
    .reset_index(drop=True)
)

print(ae_monthly.head())
print("Rows:", len(ae_monthly), "| Unique months:", ae_monthly["Date"].nunique())

# Downloading
out_path = "AverageEarnings_monthly_cleaned_2005_2025.csv"
ae_monthly.to_csv(out_path, index=False)
files.download(out_path)


        Date  TotalWeeklyEarnings  RegularWeeklyEarnings
0 2005-01-01           379.071339             353.799926
1 2005-02-01           370.950899             354.111354
2 2005-03-01           375.885979             356.510999
3 2005-04-01           378.563489             357.421050
4 2005-05-01           379.516459             358.281707
Rows: 246 | Unique months: 246


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

HPI (Target Variable)

In [None]:
# Loading the HPI file
from google.colab import files
import io
import pandas as pd

# Opens Colab's upload dialog; the result maps each uploaded file name to its bytes.
uploaded = files.upload()

# A cancelled or empty upload returns nothing. Stop with a clear message instead
# of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded. Upload the UK HPI CSV file and rerun this cell.")

# Only the first uploaded file is read; any additional files are ignored.
fname = next(iter(uploaded))
hpi_df = pd.read_csv(io.BytesIO(uploaded[fname]))

print(f"Loaded: {fname}")
print(f"Rows: {len(hpi_df):,} | Columns: {hpi_df.shape[1]}")
print("Preview:")
display(hpi_df.head())

Saving UK-HPI-full-file-2025-06.csv to UK-HPI-full-file-2025-06 (1).csv
Loaded: UK-HPI-full-file-2025-06 (1).csv
Rows: 146,655 | Columns: 54
Preview:


Unnamed: 0,Date,RegionName,AreaCode,AveragePrice,Index,IndexSA,1m%Change,12m%Change,AveragePriceSA,SalesVolume,...,NewPrice,NewIndex,New1m%Change,New12m%Change,NewSalesVolume,OldPrice,OldIndex,Old1m%Change,Old12m%Change,OldSalesVolume
0,01/01/2004,Aberdeenshire,S12000034,84638,41.1,,,,,388.0,...,112843.0,40.7,,,103.0,81273.0,41.0,,,285.0
1,01/02/2004,Aberdeenshire,S12000034,84623,41.1,,0.0,,,326.0,...,113061.0,40.8,0.2,,107.0,81194.0,40.9,-0.1,,219.0
2,01/03/2004,Aberdeenshire,S12000034,86536,42.1,,2.3,,,453.0,...,115218.0,41.6,1.9,,140.0,83137.0,41.9,2.4,,313.0
3,01/04/2004,Aberdeenshire,S12000034,87373,42.5,,1.0,,,571.0,...,115247.0,41.6,0.0,,180.0,84241.0,42.5,1.3,,391.0
4,01/05/2004,Aberdeenshire,S12000034,89493,43.5,,2.4,,,502.0,...,117377.0,42.4,1.8,,167.0,86466.0,43.6,2.6,,335.0


In [None]:
import numpy as np

# Keeping only relevant columns
wanted = ["Date", "RegionName", "AreaCode", "AveragePrice", "Index", "SalesVolume"]
available = [c for c in wanted if c in hpi_df.columns]

# Validate BEFORE touching the columns. Parsing hpi_df["Date"] first would raise
# a KeyError on a file without it, and this explicit message would never be reached.
if "Date" not in available or "RegionName" not in available:
    raise ValueError("HPI file must include at least 'Date' and 'RegionName' columns.")

# Parsing the date column (day/month/year text; unparseable values become NaT)
hpi_df = hpi_df.copy()
hpi_df["Date"] = pd.to_datetime(hpi_df["Date"], format="%d/%m/%Y", errors="coerce")

# Anchoring everything to month-start to match the other macro series
hpi_df["Date"] = hpi_df["Date"].dt.to_period("M").dt.to_timestamp(how="start")

hpi = hpi_df[available].copy()

# Making sure numeric columns are truly numeric
for col in ["AveragePrice", "Index", "SalesVolume"]:
    if col in hpi.columns:
        hpi[col] = pd.to_numeric(hpi[col], errors="coerce")

# Dropping unusable rows and filtering for project timeline
hpi = hpi.dropna(subset=["Date", "RegionName"]).copy()

start = pd.Timestamp(2005, 1, 1)
end   = pd.Timestamp(2025, 6, 1)  # month-start for June 2025
hpi = hpi[(hpi["Date"] >= start) & (hpi["Date"] <= end)].copy()

# De-duplicate and sort: one row per (region, month), keeping the last one
hpi = (
    hpi.sort_values(["RegionName", "Date"])
       .drop_duplicates(subset=["RegionName", "Date"], keep="last")
       .reset_index(drop=True)
)

# Quick checks
print("HPI panel ready.")
print(f"Coverage: {hpi['Date'].min().date()} → {hpi['Date'].max().date()}")
print(f"Rows: {len(hpi):,} | Regions: {hpi['RegionName'].nunique():,}")

display(hpi.head())



HPI panel ready.
Coverage: 2005-01-01 → 2025-06-01
Rows: 99,630 | Regions: 405


Unnamed: 0,Date,RegionName,AreaCode,AveragePrice,Index,SalesVolume
0,2005-01-01,Aberdeenshire,S12000034,105489,51.3,400.0
1,2005-02-01,Aberdeenshire,S12000034,101145,49.2,289.0
2,2005-03-01,Aberdeenshire,S12000034,98196,47.7,435.0
3,2005-04-01,Aberdeenshire,S12000034,99302,48.3,499.0
4,2005-05-01,Aberdeenshire,S12000034,105412,51.2,515.0


In [None]:
# Saving and Downloading
# Write the cleaned panel to disk, print a one-line summary of what was written,
# then hand the file to the browser through Colab's download helper.
out_path = "/content/hpi_clean_panel.csv"
hpi.to_csv(out_path, index=False)

panel_summary = (
    f"Date range: {hpi['Date'].min().date()} → {hpi['Date'].max().date()} | "
    f"Regions: {hpi['RegionName'].nunique()} | "
    f"Rows: {len(hpi):,}"
)

for message in (
    "HPI panel cleaned and saved successfully.",
    f"File location: {out_path}",
    panel_summary,
):
    print(message)

files.download(out_path)

HPI panel cleaned and saved successfully.
File location: /content/hpi_clean_panel.csv
Date range: 2005-01-01 → 2025-06-01 | Regions: 405 | Rows: 99,630


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>