In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
from jh_interview.models import TransactionModel, PropertyModel, PostcodeModel

from hashlib import md5

In [None]:
# Constants

# Path to the data directory.
DATA_DIR = Path('../data/')

# Path to the first price paid data file.
PRICE_PAID_FILE_FIRST = DATA_DIR / 'pp-2019.csv'

# Path to the second price paid data file.
PRICE_PAID_FILE_SECOND = DATA_DIR / 'pp-2020.csv'

# Path to the postcode coordinates lookup file.
POSTCODES_COORDINATES_FILE = DATA_DIR / 'National_Statistics_Postcode_Lookup_UK_Coordinates_20240406.csv'

In [None]:
def load_price_paid_data(filepaths: list[Path]) -> pd.DataFrame:
    """
    Load price paid data from one or more header-less CSV files.

    Every file is read with the same fixed column names, the rows are stacked
    into one frame with a fresh 0..n-1 index, ``date_of_transfer`` is parsed
    to datetimes and a ``property_id`` column is added.

    Args:
        filepaths: Paths of the CSV files to read, in the order their rows
            should appear in the result.

    Returns:
        A single DataFrame holding the sixteen source columns plus
        ``property_id``, the MD5 hex digest of the concatenated string forms
        of ``postcode``, ``paon`` and ``saon``.

    Raises:
        ValueError: If ``filepaths`` is empty (raised by ``pd.concat``).
    """
    column_names = [
        'transaction_id', 'price', 'date_of_transfer', 'postcode', 'property_type',
        'old_new', 'duration', 'paon', 'saon', 'street', 'locality', 'town_city',
        'district', 'country', 'ppd_category_type', 'record_status'
        ]

    # These columns feed the property hash, so read them as text. Left to type
    # inference, a house number may arrive as 12 or 12.0 depending on the rest
    # of the file, and the same property would hash differently.
    key_columns = ['postcode', 'paon', 'saon']

    df = pd.concat(
        [
            pd.read_csv(
                filename,
                names=column_names,
                dtype={column: str for column in key_columns},
            )
            for filename in filepaths
        ],
        # Without this every file keeps its own 0..n-1 labels and the combined
        # frame ends up with duplicate index values.
        ignore_index=True,
    )
    df['date_of_transfer'] = pd.to_datetime(df['date_of_transfer'])

    # Construct a property_id column.
    # Building the key with column-wise string concatenation avoids a Python
    # call per row; missing parts are stringified by astype(str) exactly as before.
    # NOTE(review): the parts are joined without a separator, so e.g.
    # paon='1', saon='23' and paon='12', saon='3' share an id - confirm whether
    # that matters before changing the key format.
    key_parts = df[key_columns].astype(str)
    df['property_id'] = (
        (key_parts['postcode'] + key_parts['paon'] + key_parts['saon'])
        .map(lambda key: md5(key.encode()).hexdigest())
    )

    return df

In [None]:
def load_postcodes_data(filepath: Path) -> pd.DataFrame:
    """
    Read the postcode lookup CSV into a DataFrame.

    The first row of the file is treated as a header and discarded; the
    columns are renamed to the fixed snake_case names listed below.

    Args:
        filepath: Location of the CSV file.

    Returns:
        DataFrame with one row per CSV record and the renamed columns.
    """
    # The spelling 'longtitude' is relied on by later cells, so it is kept as is.
    postcode_columns = [
        'postcode_1', 'postcode_2', 'postcode_3',
        'easting', 'northing',
        'positional_quality', 'local_authority',
        'longtitude', 'latitude',
        'spatial_accuracy', 'last_uploaded', 'location', 'socrata_id',
    ]

    # header=0 together with names= replaces the file's own header row.
    return pd.read_csv(filepath, header=0, names=postcode_columns)

In [None]:
# Both price paid files, stacked into a single frame
transactions_pp = load_price_paid_data([PRICE_PAID_FILE_FIRST, PRICE_PAID_FILE_SECOND])

# Postcode lookup table (one row per record of the coordinates file)
postcodes = load_postcodes_data(POSTCODES_COORDINATES_FILE)

In [None]:
# Attach the postcode lookup columns to every transaction and record the
# calendar year of the transfer.
# NOTE(review): merge defaults to an inner join, so transactions whose postcode
# does not match `postcode_2` exactly are dropped - confirm that column uses the
# same spacing as the price paid postcodes.
merged_df = (
    transactions_pp
    .merge(postcodes, left_on='postcode', right_on='postcode_2')
    .assign(year=lambda frame: frame['date_of_transfer'].dt.year)
)




In [None]:
# Rows whose postcode begins with 'EC1A'
is_ec1a = merged_df['postcode'].str.startswith('EC1A')
ec1a = merged_df[is_ec1a]

# Centre of those rows as a (longitude, latitude) pair of column means
mean_longitude = ec1a['longtitude'].mean()
mean_latitude = ec1a['latitude'].mean()
center_ec1a = (mean_longitude, mean_latitude)

print(center_ec1a)


In [None]:
# Planar approximation of the distance to the EC1A centre, expressed in degrees.
# A degree of longitude is shorter than a degree of latitude by a factor of
# cos(latitude), so the east-west offset is scaled before the two offsets are
# combined; without the scaling east-west distances are overstated.
lon_scale = np.cos(np.deg2rad(center_ec1a[1]))
merged_df['distance_from_ec1a'] = np.sqrt(
    ((merged_df['longtitude'] - center_ec1a[0]) * lon_scale) ** 2
    + (merged_df['latitude'] - center_ec1a[1]) ** 2
)



In [None]:
# Mean price and mean distance per postcode, rounded to two decimals,
# keeping only postcodes whose distance is at most 6.
grouped = (
    merged_df
    .groupby('postcode')[['price', 'distance_from_ec1a']]
    .mean()
    .round(2)
    .loc[lambda per_postcode: per_postcode['distance_from_ec1a'] <= 6]
)

grouped.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit

# Average the per-postcode figures again, this time over postcodes that share
# the same distance value, so there is one point per distance.
grouped_mean = grouped.groupby('distance_from_ec1a').mean().reset_index()

x, y = grouped_mean['distance_from_ec1a'], grouped_mean['price']

def exp_func(x, a, b):
    """Exponential model: return ``a * exp(b * x)`` for scalar or array ``x``."""
    exponent = b * x
    return a * np.exp(exponent)

# Fit the data to the exponential function.
# With no initial guess curve_fit starts every parameter at 1, which is far
# from price-scale data and can stop the optimiser from converging. A
# straight-line fit of log(price) against distance gives a starting point of
# the right magnitude: log(y) = log(a) + b * x.
positive = y > 0  # log is only defined for positive prices
slope_guess, log_scale_guess = np.polyfit(x[positive], np.log(y[positive]), 1)
popt, pcov = curve_fit(exp_func, x, y, p0=(np.exp(log_scale_guess), slope_guess))

# Calculate y values for the fitted curve
y_fit = exp_func(x, *popt)

# Explicit figure/axes objects instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, s=1)

ax.plot(x, y_fit, color='red')

ax.set_title('Average Transaction Price vs Distance from EC1A')
ax.set_xlabel('Distance from EC1A')
ax.set_ylabel('Average Transaction Price')

plt.show()