In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
from jh_interview.models import TransactionModel, PropertyModel, PostcodeModel

from hashlib import md5

In [None]:
# Constants
#
# Plain comments are used instead of bare string literals: a string left as the
# last statement of a cell is echoed by Jupyter as that cell's output.

# Directory holding the input CSV files (relative path).
DATA_DIR = Path('../data/')

# CPI time-series CSV, read by load_cpi_index_by_month.
CPI_INDEX_FILE = DATA_DIR / 'mm23.csv'

# Average prices CSV, read by load_average_prices_data.
AVERAGE_PRICES_FILE = DATA_DIR / 'Average-prices-2021-03.csv'


In [None]:
def load_cpi_index_by_month(filepath: Path) -> pd.DataFrame:
    """Load the monthly observations from a CPI time-series CSV.

    Rows are classified by the text in their first cell. The row labelled
    ``CDID`` supplies the series identifiers used as column names. Rows
    labelled ``YYYY MON`` (four digits, one whitespace character, an
    upper-case English month abbreviation such as ``1988 JAN``) are the
    monthly observations. Every other row is discarded.

    Args:
        filepath: Location of the CSV file.

    Returns:
        One row per monthly observation, keeping the row labels of the raw
        file. The first column is ``Date`` and holds ``YYYY-MM`` strings; the
        remaining columns are named after the CDID row and hold floats, with
        ``NaN`` wherever the cell was blank or not a number.

    Raises:
        ValueError: If the file contains no ``CDID`` row.
    """
    month_numbers = {
        'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
        'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
        'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12',
    }

    # Read every cell as text: the label rows put strings into every column,
    # so dtype inference would only produce mixed-type columns (and warnings).
    # Numbers are parsed explicitly further down.
    raw = pd.read_csv(filepath, dtype=str)
    labels = raw.iloc[:, 0]

    cdid_rows = raw[labels == 'CDID']
    if cdid_rows.empty:
        raise ValueError(f"No 'CDID' row found in {filepath}")
    column_names = ['Date', *cdid_rows.iloc[0, 1:].tolist()]

    # One regex both recognises and splits the label. Blank labels and
    # unknown month codes end up as NaN and are excluded by the mask,
    # rather than breaking the boolean indexing or yielding unusable dates.
    parts = labels.str.extract(r'^(\d{4})\s([A-Z]{3})$')
    month = parts[1].map(month_numbers)
    is_monthly = month.notna()

    dates = (parts[0] + '-' + month)[is_monthly]
    values = raw.loc[is_monthly].iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

    # Assemble a fresh frame instead of assigning into a filtered slice,
    # which is what raised SettingWithCopyWarning.
    monthly = pd.concat([dates, values], axis=1)
    monthly.columns = column_names
    return monthly

In [None]:
def load_average_prices_data(filepath: Path) -> pd.DataFrame:
    """Load the United Kingdom average price series from a CSV file.

    The file's own header row is discarded and its columns are renamed by
    position to ``Date``, ``Region_Name``, ``Area_Code``, ``Average_Price``,
    ``Monthly_Change``, ``Annual_Change`` and ``Average_Price_SA``. Only rows
    whose ``Region_Name`` is exactly ``United Kingdom`` are kept.

    Args:
        filepath: Location of the CSV file.

    Returns:
        A frame with the columns ``Date`` and ``Average_Price``, keeping the
        row labels of the raw file. ``Date`` is the original text with its
        last three characters removed (so ``2021-03-01`` becomes ``2021-03``).
    """
    column_names = [
        "Date", "Region_Name", "Area_Code", "Average_Price",
        "Monthly_Change", "Annual_Change", "Average_Price_SA",
    ]
    prices = pd.read_csv(filepath, names=column_names, header=0)

    # Select rows and columns in one .loc call rather than two chained filters.
    is_uk = prices['Region_Name'] == 'United Kingdom'
    uk_prices = prices.loc[is_uk, ['Date', 'Average_Price']]

    # assign() returns a new frame, so nothing is written into a filtered
    # slice (the cause of SettingWithCopyWarning).
    return uk_prices.assign(Date=uk_prices['Date'].str.slice(0, -3))

In [None]:

# Read both inputs from the paths set in the constants cell.
average_prices = load_average_prices_data(filepath=AVERAGE_PRICES_FILE)
cpi_data = load_cpi_index_by_month(filepath=CPI_INDEX_FILE)

# Show the first rows of the average prices frame as the cell output.
average_prices.head(n=5)

In [None]:


# Show the first rows of the monthly CPI frame as the cell output.
cpi_data.head(n=5)

In [None]:
# Convert both Date columns to timestamps so the join compares like with like.
# The conversion is still done in place on the loaded frames, so any later
# cell that reads these columns keeps seeing datetimes.
average_prices['Date'] = pd.to_datetime(average_prices['Date'])
cpi_data['Date'] = pd.to_datetime(cpi_data['Date'])

# Inner join (the pd.merge default): only dates present in both frames remain.
merged_data = pd.merge(average_prices, cpi_data, on='Date')

# Coerce every column to numbers before correlating. Depending on the pandas
# version, corr() either silently skips text columns or raises on a cell it
# cannot parse; with coercion such cells become NaN instead.
# The trailing dropna() removes series whose correlation is undefined.
correlations = (
    merged_data
    .drop(columns='Date')
    .apply(pd.to_numeric, errors='coerce')
    .corr()['Average_Price']
    .dropna()
)




In [None]:
# `correlations` contains the average price's correlation with itself, which
# would always take the first slot of the ranking. Leave it out so that both
# lists contain five CPI series. errors='ignore' keeps this safe if that entry
# is absent.
ranked = correlations.drop('Average_Price', errors='ignore')

top_5 = ranked.nlargest(5)
bottom_5 = ranked.nsmallest(5)

print(top_5)
print(bottom_5)