In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [3]:
df = pd.read_csv("data/knn/data-knn-2024-04-07.csv")

In [4]:
df["id"] = df["id"].astype(int)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94208 entries, 0 to 94207
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        94208 non-null  int64  
 1   id_mls                    94208 non-null  object 
 2   bathrooms_total           94208 non-null  float64
 3   bedrooms_extra            94208 non-null  float64
 4   bedrooms                  94208 non-null  float64
 5   stories_total             94208 non-null  float64
 6   size_interior             94208 non-null  float64
 7   building_type             94208 non-null  object 
 8   agency_name               94208 non-null  object 
 9   agency_type               94208 non-null  object 
 10  property_type             94208 non-null  object 
 11  lng                       94208 non-null  float64
 12  lat                       94208 non-null  float64
 13  ownership_type            94208 non-null  object 
 14  owners

In [6]:
df_geo_price = df[["id", "lng", "lat", "price"]]

In [7]:
df_geo_price

Unnamed: 0,id,lng,lat,price
0,26639235,-83.010728,42.321421,849900.0
1,26639230,-83.029633,42.265783,399900.0
2,26639229,-83.087577,42.228649,959900.0
3,26639225,-83.056441,42.304613,299900.0
4,26638943,-83.055635,42.302468,399000.0
...,...,...,...,...
94203,26691527,-114.425597,62.450687,875000.0
94204,26691140,-114.409428,62.441894,409900.0
94205,26689306,-114.405637,62.443042,629900.0
94206,26686534,-114.366803,62.465253,824900.0


In [8]:
df_geo_price[df_geo_price["id"]==26639235]

Unnamed: 0,id,lng,lat,price
0,26639235,-83.010728,42.321421,849900.0


In [9]:
df_geo_price.drop(df_geo_price[df_geo_price["id"]==26639235].index)

Unnamed: 0,id,lng,lat,price
1,26639230,-83.029633,42.265783,399900.0
2,26639229,-83.087577,42.228649,959900.0
3,26639225,-83.056441,42.304613,299900.0
4,26638943,-83.055635,42.302468,399000.0
5,26635894,-83.038214,42.309424,389000.0
...,...,...,...,...
94203,26691527,-114.425597,62.450687,875000.0
94204,26691140,-114.409428,62.441894,409900.0
94205,26689306,-114.405637,62.443042,629900.0
94206,26686534,-114.366803,62.465253,824900.0


In [15]:
import pandas as pd
from sklearn.neighbors import BallTree

# Sample real estate dataframe (replace this with your actual dataframe)
data = {
    'lng': [-118.41, -118.43, -118.45, -118.39, -118.42],
    'lat': [34.05, 34.07, 34.06, 34.04, 34.08],
    'price': [500000, 550000, 480000, 510000, 600000]
}
# real_estate_df = pd.DataFrame(data)
real_estate_df = df[["lng", "lat", "price"]]

# Function to calculate average price of 5 nearest neighbors
def calculate_average_price(row, tree, df, k=5):
    # Ensure k is within valid range
    k = min(k, len(df) - 1)  # Ensure k is not greater than number of samples - 1
    # Find the indices of the k nearest neighbors
    dist, ind = tree.query([row[['lng', 'lat']].values], k=k+1)
    # Exclude the sample itself (first nearest neighbor)
    neighbors = ind.flatten()[1:]
    # Get prices of the neighboring houses
    neighbor_prices = df.iloc[neighbors]['price']
    # Calculate average price
    avg_price = neighbor_prices.mean()
    return avg_price

# Create a BallTree for nearest neighbor search based on lng and lat
tree = BallTree(real_estate_df[['lng', 'lat']].values, leaf_size=15)

# Apply the function row-wise to calculate average price of neighbors
df['average_price'] = real_estate_df.apply(lambda row: calculate_average_price(row, tree, real_estate_df), axis=1)

# Display the updated dataframe with the new column
# print(real_estate_df)


In [21]:
def calculate_average_neighbor_price(df, lng, lat, k=5):
    """
    Calculate the average price of the k nearest neighboring houses based on lng and lat.

    Parameters:
    - df (DataFrame): The original dataframe containing 'lng', 'lat', and 'price' columns.
    - lng (float): Longitude of the target location.
    - lat (float): Latitude of the target location.
    - k (int): Number of nearest neighbors to consider (default is 5).

    Returns:
    - float: Average price of the k nearest neighboring houses.
    """
    # Create a BallTree for nearest neighbor search based on lng and lat
    tree = BallTree(df[['lng', 'lat']].values, leaf_size=15)

    # Find the indices of the k nearest neighbors (including the sample itself)
    dist, ind = tree.query([[lng, lat]], k=k+1)

    # Extract indices of the neighbors excluding the sample itself
    neighbors = ind.flatten()[1:]  # Exclude the first index which is the sample itself

    # Get prices of the neighboring houses
    neighbor_prices = df.iloc[neighbors]['price']

    # Calculate average price of neighboring houses
    avg_price = neighbor_prices.mean()

    return avg_price

In [22]:
target_longitude = -118.41
target_latitude = 34.05
average_neighbor_price = calculate_average_neighbor_price(real_estate_df, target_longitude, target_latitude, k=5)

print(f"Average price of 5 neighboring houses at ({target_longitude}, {target_latitude}): ${average_neighbor_price:.2f}")

Average price of 5 neighboring houses at (-118.41, 34.05): $916400.00
