In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
from jh_interview.models import TransactionModel, PropertyModel, PostcodeModel

from hashlib import md5

In [2]:
# Constants

DATA_DIR = Path('../data/')
"""Path to the data directory."""

PRICE_PAID_FILE_FIRST = DATA_DIR / 'pp-2019.csv'
"""Path to the first price paid data file."""

PRICE_PAID_FILE_SECOND = DATA_DIR / 'pp-2020.csv'
"""Path to the second price paid data file."""

POSTCODES_COORDINATES_FILE = DATA_DIR / 'National_Statistics_Postcode_Lookup_UK_Coordinates_20240406.csv'

In [3]:
def load_price_paid_data(filepaths: list[Path]) -> pd.DataFrame:
    """
    Load price paid data from a CSV file.
    """
    column_names = [
        'transaction_id', 'price', 'date_of_transfer', 'postcode', 'property_type',
        'old_new', 'duration', 'paon', 'saon', 'street', 'locality', 'town_city',
        'district', 'country', 'ppd_category_type', 'record_status'
        ]

    df = pd.concat(
        [
            pd.read_csv(
                filename,
                names=column_names,
            )
            for filename in filepaths
        ]
    )
    df['date_of_transfer'] = pd.to_datetime(df['date_of_transfer'])

    # Construct a property_id column
    df['property_id'] = (
        df[['postcode', 'paon', 'saon']].astype(str).apply(
            lambda x: md5(''.join(x).encode()).hexdigest(), axis=1
        )
    )

    return df

In [4]:
def load_postcodes_data(filepath: Path) -> pd.DataFrame:
    """
    Load postcodes data from a CSV file.
    """

    column_names = [
        'postcode_1', 'postcode_2', 'postcode_3', 'easting', 'northing', 'positional_quality', 'local_authority',
        'longtitude', 'latitude', 'spatial_accuracy', 'last_uploaded', 'location', 'socrata_id',
    ]

    df = pd.read_csv(filepath, names=column_names, header=0)
    return df

In [5]:
transactions_pp = load_price_paid_data(
    [PRICE_PAID_FILE_FIRST,
    PRICE_PAID_FILE_SECOND]
)


postcodes = load_postcodes_data(POSTCODES_COORDINATES_FILE)

In [6]:
postcodes.head()

Unnamed: 0,postcode_1,postcode_2,postcode_3,easting,northing,positional_quality,local_authority,longtitude,latitude,spatial_accuracy,last_uploaded,location,socrata_id
0,DY8 3BH,DY8 3BH,DY8 3BH,389380,283217,1,Dudley,-2.157674,52.446824,Postcode Level,05/03/2024,"(52.446824, -2.157674)",501629
1,NG165EQ,NG16 5EQ,NG16 5EQ,447191,350705,1,Ashfield,-1.297441,53.051548,Postcode Level,05/03/2024,"(53.051548, -1.297441)",1054005
2,SY245BS,SY24 5BS,SY24 5BS,262667,286629,1,Ceredigion,-4.022737,52.460289,Postcode Level,05/03/2024,"(52.460289, -4.022737)",1557305
3,GU7 2QE,GU7 2QE,GU7 2QE,494653,143659,1,Guildford,-0.647083,51.18433,Postcode Level,05/03/2024,"(51.18433, -0.647083)",648625
4,EX168NJ,EX16 8NJ,EX16 8NJ,285616,113191,1,Mid Devon,-3.628219,50.906823,Postcode Level,05/03/2024,"(50.906823, -3.628219)",549784


In [7]:
merged_df = transactions_pp.merge(postcodes, left_on='postcode', right_on='postcode_1')

merged_df['year'] = merged_df['date_of_transfer'].dt.year


In [8]:

grouped_df = merged_df.groupby('year').agg({
    'price': 'sum',
    'transaction_id': 'count',
    'latitude': 'mean',
    'longtitude': 'mean'
}).reset_index()

grouped_df['weighted_lat_transactions'] = grouped_df['latitude'] * grouped_df['transaction_id']
grouped_df['weighted_lon_transactions'] = grouped_df['longtitude'] * grouped_df['transaction_id']

grouped_df['weighted_lat_value'] = grouped_df['latitude'] * grouped_df['price']
grouped_df['weighted_lon_value'] = grouped_df['longtitude'] * grouped_df['price']

grouped_df['centre_of_gravity_transactions'] = (
    grouped_df['weighted_lat_transactions'] / grouped_df['transaction_id'],
    grouped_df['weighted_lon_transactions'] / grouped_df['transaction_id']
)

grouped_df['centre_of_gravity_value'] = (grouped_df['weighted_lat_value'] / grouped_df['price'], grouped_df['weighted_lon_value'] / grouped_df['price'])



In [9]:
grouped_df['centre_of_gravity_transactions'] = grouped_df.apply(lambda row: (row['weighted_lat_transactions'] / row['transaction_id'], row['weighted_lon_transactions'] / row['transaction_id']), axis=1)
grouped_df['centre_of_gravity_value'] = grouped_df.apply(lambda row: (row['weighted_lat_value'] / row['price'], row['weighted_lon_value'] / row['price']), axis=1)


grouped_df['centre_of_gravity_transactions'] = grouped_df['centre_of_gravity_transactions'].apply(lambda x: f'({x[0]:.6f}, {x[1]:.6f})')
grouped_df['centre_of_gravity_value'] = grouped_df['centre_of_gravity_value'].apply(lambda x: f'({x[0]:.6f}, {x[1]:.6f})')


print(grouped_df[['year', 'centre_of_gravity_transactions', 'centre_of_gravity_value']].to_string(index=False))

 year centre_of_gravity_transactions centre_of_gravity_value
 2019         (52.475182, -1.410200)  (52.475182, -1.410200)
 2020         (52.470821, -1.397611)  (52.470821, -1.397611)
