In [31]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [32]:
# Read the CSV export (dated 2024-04-07 in its file name) into the working frame `df`
df = pd.read_csv(filepath_or_buffer="data/knn/data-knn-2024-04-07.csv")

In [33]:
# Cast the `id` column to integers; every other column keeps its dtype and position
df = df.astype({"id": int})

In [37]:
import pandas as pd
from sklearn.neighbors import BallTree

# Toy listings showing the expected column layout; not read by the code in this cell
data = dict(
    lng=[-118.41, -118.43, -118.45, -118.39, -118.42],
    lat=[34.05, 34.07, 34.06, 34.04, 34.08],
    price=[500000, 550000, 480000, 510000, 600000],
)
# To experiment on the toy listings instead, use: real_estate_df = pd.DataFrame(data)

# Keep only the coordinate and price columns of the loaded frame
real_estate_df = df.loc[:, ["lng", "lat", "price"]]

# Function to calculate average price of 5 nearest neighbors
def calculate_average_price(row, tree, df, k=5):
    """
    Average the prices of the k rows of ``df`` nearest to ``row``, excluding ``row`` itself.

    Parameters:
    - row (Series): A row of ``df`` exposing 'lng' and 'lat'. Its ``name`` is taken to be
      its index label in ``df``, which is what ``df.apply(..., axis=1)`` passes in.
    - tree: Spatial index with a ``query(X, k)`` method (e.g. a BallTree) built from
      ``df[['lng', 'lat']]`` in the same row order as ``df``.
    - df (DataFrame): The frame the tree was built from; must contain a 'price' column.
    - k (int): Number of neighbors to average (default is 5); clamped to ``len(df) - 1``.

    Returns:
    - float: Mean price of the selected neighbors, or NaN when no neighbor can be requested.
    """
    # Never ask for more neighbors than there are *other* rows in the frame
    k = min(k, len(df) - 1)
    if k < 1:
        return float('nan')

    # Request one extra hit so the row itself can be discarded afterwards
    _, ind = tree.query([row[['lng', 'lat']].values], k=k + 1)
    candidates = ind.flatten()

    # Find the row's own position instead of assuming it is the first hit: when several
    # rows share the same coordinates the tree may list a duplicate first, and dropping
    # that one would leave the row's own price inside its average.
    try:
        own_pos = df.index.get_loc(row.name)
    except (KeyError, TypeError):
        own_pos = None
    if not isinstance(own_pos, (int, np.integer)):
        # Unknown label, or duplicated labels (get_loc then yields a slice/mask)
        own_pos = None

    if own_pos is not None and (candidates == own_pos).any():
        neighbors = candidates[candidates != own_pos]
    else:
        # Fallback: previous behaviour, drop the closest hit
        neighbors = candidates[1:]

    # Look the neighbors up by position and average their prices
    return df.iloc[neighbors]['price'].mean()

# Spatial index over every row's (lng, lat) pair, used for the neighbor lookups below
tree = BallTree(real_estate_df[['lng', 'lat']].to_numpy(), leaf_size=15)

# One lookup per row; the result is stored on the full frame `df`,
# not on the three-column `real_estate_df` slice
df['average_price'] = real_estate_df.apply(
    calculate_average_price,
    axis=1,
    args=(tree, real_estate_df),
)


In [39]:
# Preview the first five rows of `df`
df.head(n=5)

Unnamed: 0,id,id_mls,avg_price_5,bathrooms_total,bedrooms_extra,bedrooms,stories_total,size_interior,building_type,agency_name,...,ownership_type,ownership_type_group_ids,land_size,parkings,page_url,timestamp,postal_code,province,price,average_price
0,26639235,24005541,478340.0,3.0,0.0,4.0,2.5,2810.0,House,RE/MAX PREFERRED REALTY LTD. - 585,...,Freehold,1,45X119.25,0.0,https://realtor.ca/real-estate/26639235/794-de...,6.384645e+17,N8Y2M1,Ontario,849900.0,478340.0
1,26639230,24005774,664900.0,2.0,0.0,2.0,1.0,1072.0,Apartment,ROYAL LEPAGE BINDER REAL ESTATE - 640,...,Condominium/Strata,2,,0.0,https://realtor.ca/real-estate/26639230/1655-g...,6.384645e+17,N9E4W4,Ontario,399900.0,664900.0
2,26639229,24005776,949919.8,3.0,0.0,4.0,2.0,1965.881818,House,RE/MAX CAPITAL DIAMOND REALTY - 821,...,Freehold,1,53.92XIRREG FT,0.0,https://realtor.ca/real-estate/26639229/2580-m...,6.384645e+17,N9J2M6,Ontario,959900.0,949919.8
3,26639225,24005786,349180.0,1.0,0.0,4.0,1.0,1040.426146,House,REMO VALENTE REAL ESTATE (1990) LIMITED - 790,...,Freehold,1,25X109.25,0.0,https://realtor.ca/real-estate/26639225/722-br...,6.384645e+17,N9B2M6,Ontario,299900.0,349180.0
4,26638943,24005775,319360.0,1.0,0.0,3.0,1.0,3718.455676,House,RE/MAX CAPITAL DIAMOND REALTY - 821,...,Freehold,1,40X98.32,0.0,https://realtor.ca/real-estate/26638943/911-br...,6.384645e+17,N9B2M9,Ontario,399000.0,319360.0


In [42]:
def calculate_average_neighbor_price(df, lng, lat, k=5, tree=None, verbose=True):
    """
    Calculate the average price of the k rows of ``df`` nearest to a target location.

    The target is an arbitrary point: a row located exactly at the target is counted
    like any other neighbor (nothing is excluded).

    Parameters:
    - df (DataFrame): The original dataframe containing 'lng', 'lat', and 'price' columns.
    - lng (float): Longitude of the target location.
    - lat (float): Latitude of the target location.
    - k (int): Number of nearest neighbors to consider (default is 5). Clamped to
      ``len(df)`` so that asking for more neighbors than rows does not raise.
    - tree: Optional prebuilt spatial index over ``df[['lng', 'lat']]`` (same row order
      as ``df``) exposing ``query(X, k)``. When omitted, a BallTree is built on every
      call; pass one in when querying many locations against the same frame.
    - verbose (bool): Print the selected neighbor prices before returning (default True,
      which keeps the output this function has always produced).

    Returns:
    - float: Average price of the selected rows, or NaN when ``df`` is empty.
    """
    # Nothing to search in an empty frame (building or querying a tree would raise)
    if len(df) == 0:
        return float('nan')

    # Cannot return more neighbors than there are rows
    k = min(k, len(df))

    # Build the index only when the caller did not supply one
    if tree is None:
        tree = BallTree(df[['lng', 'lat']].values, leaf_size=15)

    # Positions of the k rows closest to the target point
    _, ind = tree.query([[lng, lat]], k=k)
    neighbors = ind.flatten()

    # Prices of those rows, looked up by position
    neighbor_prices = df.iloc[neighbors]['price']
    if verbose:
        print(neighbor_prices)

    return neighbor_prices.mean()

In [43]:
# NOTE(review): this point equals the first entry of the sample `data` dict, while the
# previewed rows of `df` list Ontario as their province. Confirm the location lies inside
# the data's coverage; otherwise the "neighbors" are merely the least-distant rows.
target_longitude, target_latitude = -118.41, 34.05

# Average price of the five rows closest to the target point
average_neighbor_price = calculate_average_neighbor_price(
    df=real_estate_df,
    lng=target_longitude,
    lat=target_latitude,
    k=5,
)
print(f"Average price of 5 neighboring houses at ({target_longitude}, {target_latitude}): ${average_neighbor_price:.2f}")

43714    1289000.0
43754     789000.0
43650     489000.0
90865     899000.0
43728    1990000.0
Name: price, dtype: float64
Average price of 5 neighboring houses at (-118.41, 34.05): $1091200.00
