In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
from jh_interview.models import TransactionModel, PropertyModel, PostcodeModel

from hashlib import md5

In [None]:
# Constants: locations of the input files, all relative to the data directory.

# Path to the data directory.
DATA_DIR = Path('../data/')

# Path to the first price paid data file.
PRICE_PAID_FILE_FIRST = DATA_DIR.joinpath('pp-2019.csv')

# Path to the second price paid data file.
PRICE_PAID_FILE_SECOND = DATA_DIR.joinpath('pp-2020.csv')

# Path to the postcode -> coordinates lookup file.
POSTCODES_COORDINATES_FILE = DATA_DIR.joinpath(
    'National_Statistics_Postcode_Lookup_UK_Coordinates_20240406.csv'
)

In [None]:
def load_price_paid_data(filepaths: list[Path]) -> pd.DataFrame:
    """
    Load price paid data from one or more CSV files into a single DataFrame.

    The files are read without a header row (the column names below are
    supplied explicitly) and stacked in the order given.

    Args:
        filepaths: Paths of the price paid CSV files to load.

    Returns:
        One DataFrame containing the rows of every file, with a fresh
        0..n-1 index, ``date_of_transfer`` parsed to datetimes and an extra
        ``property_id`` column: the MD5 hex digest of the row's postcode,
        paon and saon joined with ``|``.

    Raises:
        ValueError: If ``filepaths`` is empty.
    """
    column_names = [
        'transaction_id', 'price', 'date_of_transfer', 'postcode', 'property_type',
        'old_new', 'duration', 'paon', 'saon', 'street', 'locality', 'town_city',
        'district', 'country', 'ppd_category_type', 'record_status'
    ]

    frames = [pd.read_csv(filename, names=column_names) for filename in filepaths]
    if not frames:
        raise ValueError('load_price_paid_data requires at least one file path')

    # ignore_index: each file is numbered from 0, so without it the combined
    # frame would carry duplicate index labels.
    df = pd.concat(frames, ignore_index=True)
    df['date_of_transfer'] = pd.to_datetime(df['date_of_transfer'])

    # Construct a property_id column.
    # The parts are joined with a delimiter so that different addresses cannot
    # collapse to the same key (e.g. paon='1', saon='11' vs paon='11', saon='1').
    # Missing parts are stringified by astype(str) (as 'nan').
    address_key = (
        df['postcode'].astype(str)
        + '|' + df['paon'].astype(str)
        + '|' + df['saon'].astype(str)
    )
    df['property_id'] = address_key.map(lambda key: md5(key.encode()).hexdigest())

    return df

In [None]:
def load_postcodes_data(filepath: Path) -> pd.DataFrame:
    """
    Read the postcode lookup CSV and return it as a DataFrame.

    The file's own header row is consumed (``header=0``) and replaced by the
    fixed column names listed below.
    """
    return pd.read_csv(
        filepath,
        header=0,
        names=[
            'postcode_1',
            'postcode_2',
            'postcode_3',
            'easting',
            'northing',
            'positional_quality',
            'local_authority',
            'longtitude',
            'latitude',
            'spatial_accuracy',
            'last_uploaded',
            'location',
            'socrata_id',
        ],
    )

In [None]:
# Both price paid files are stacked into a single transactions frame.
transactions_pp = load_price_paid_data([PRICE_PAID_FILE_FIRST, PRICE_PAID_FILE_SECOND])

# Postcode lookup table (one row per postcode, with coordinates).
postcodes = load_postcodes_data(filepath=POSTCODES_COORDINATES_FILE)

In [None]:
# Preview the first rows of the postcode lookup to sanity-check the column mapping.
postcodes.head(n=5)

In [None]:
# Attach coordinates to every transaction and derive the year of transfer.
# This is an inner join, so transactions whose postcode has no exact match in
# `postcode_1` are dropped.
# NOTE(review): the match is an exact string comparison - confirm that
# `postcode_1` uses the same spacing format as the price paid postcodes.
merged_df = pd.merge(
    transactions_pp,
    postcodes,
    how='inner',
    left_on='postcode',
    right_on='postcode_1',
).assign(year=lambda frame: frame['date_of_transfer'].dt.year)


In [None]:

# Centre of gravity of the transactions, per year.
#
# A weighted centre is sum(w_i * x_i) / sum(w_i), so the weight has to be
# applied to each transaction BEFORE aggregating. Multiplying the yearly mean
# coordinate by the yearly total price (and later dividing by that same total)
# just gives back the unweighted mean.

# Only rows with coordinates and a price can contribute; dropping the others
# keeps the numerators and denominators below consistent with each other.
located_df = merged_df.dropna(subset=['latitude', 'longtitude', 'price'])
located_df = located_df.assign(
    weighted_lat_value=located_df['latitude'] * located_df['price'],
    weighted_lon_value=located_df['longtitude'] * located_df['price'],
)

grouped_df = located_df.groupby('year').agg({
    'price': 'sum',
    'transaction_id': 'count',
    'latitude': 'mean',
    'longtitude': 'mean',
    'weighted_lat_value': 'sum',
    'weighted_lon_value': 'sum',
}).reset_index()

# Transaction-count weighting gives every transaction weight 1, so the weighted
# sum of coordinates is simply mean * count.
grouped_df['weighted_lat_transactions'] = grouped_df['latitude'] * grouped_df['transaction_id']
grouped_df['weighted_lon_transactions'] = grouped_df['longtitude'] * grouped_df['transaction_id']

# One (lat, lon) tuple per year. zip pairs the values row by row; assigning a
# tuple of two whole Series would not produce per-row pairs.
grouped_df['centre_of_gravity_transactions'] = list(zip(
    grouped_df['weighted_lat_transactions'] / grouped_df['transaction_id'],
    grouped_df['weighted_lon_transactions'] / grouped_df['transaction_id'],
))

grouped_df['centre_of_gravity_value'] = list(zip(
    grouped_df['weighted_lat_value'] / grouped_df['price'],
    grouped_df['weighted_lon_value'] / grouped_df['price'],
))



In [None]:
# Turn the weighted coordinate sums into "(lat, lon)" strings, one per year:
# each sum is divided by its total weight and rendered to six decimal places.
grouped_df['centre_of_gravity_transactions'] = [
    f'({lat:.6f}, {lon:.6f})'
    for lat, lon in zip(
        grouped_df['weighted_lat_transactions'] / grouped_df['transaction_id'],
        grouped_df['weighted_lon_transactions'] / grouped_df['transaction_id'],
    )
]
grouped_df['centre_of_gravity_value'] = [
    f'({lat:.6f}, {lon:.6f})'
    for lat, lon in zip(
        grouped_df['weighted_lat_value'] / grouped_df['price'],
        grouped_df['weighted_lon_value'] / grouped_df['price'],
    )
]

# Plain-text report: year plus both centres, without the index column.
print(
    grouped_df[
        ['year', 'centre_of_gravity_transactions', 'centre_of_gravity_value']
    ].to_string(index=False)
)