In [None]:
import pandas as pd
import plotly.express as px
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans 
import numpy as np

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the training data and make sure both coordinate columns are floats.
df = pd.read_csv('../data/train_imputed.csv').astype({'latitude': float, 'longitude': float})

# Names of the 'neighbourhood_cleansed*' columns present in the training frame.
neighborhood_cols = [name for name in df.columns if name.startswith('neighbourhood_cleansed')]
print(neighborhood_cols)

In [None]:
# Collapse the 'neighbourhood_cleansed_*' columns into a single label column by
# taking, per row, the name of the column holding the largest value.
# NOTE(review): idxmax returns the first column when a row has no flag set
# (e.g. if a reference category was dropped during encoding) -- confirm that
# every row has exactly one flag.
df['neighborhood'] = (
    df[neighborhood_cols]
    .idxmax(axis=1)
    .str.replace('neighbourhood_cleansed_', '', regex=False)  # literal prefix, not a regex
)

# Only neighborhoods with more than `min_listings` rows get their own average.
min_listings = 5
neighborhood_counts = df['neighborhood'].value_counts()
filtered_neighborhoods = neighborhood_counts[neighborhood_counts > min_listings].index

# Row-wise price per person. A row with accommodates == 0 yields +/-inf, which
# would turn the whole neighborhood mean into inf, so treat it as missing
# (mean() skips NaN).
price_per_person = (df['price'] / df['accommodates']).replace([np.inf, -np.inf], np.nan)

# Mean price per person for each retained neighborhood, as {neighborhood: mean}.
# Grouping the precomputed Series avoids groupby().apply() over whole sub-frames.
in_filtered = df['neighborhood'].isin(filtered_neighborhoods)
avg_price_per_person = (
    price_per_person[in_filtered]
    .groupby(df.loc[in_filtered, 'neighborhood'])
    .mean()
    .to_dict()
)

print(avg_price_per_person)

In [None]:
# Attach each row's neighborhood average (NaN when the neighborhood has no entry
# in avg_price_per_person), then drop the original neighbourhood_cleansed columns.
df = (
    df
    .assign(avg_ppp_neighborhood=df['neighborhood'].map(avg_price_per_person))
    .drop(columns=neighborhood_cols)
)

In [None]:
# Fit a nearest-neighbors index on the training coordinates and query it with
# the same points.
n_neighbors = 5

train_coords = df[['latitude', 'longitude']]
knn = NearestNeighbors(n_neighbors=n_neighbors).fit(train_coords)
distances, indices = knn.kneighbors(train_coords)

In [None]:
# Map of the training rows, colored by their neighborhood's average price per person.
fig = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    hover_name="avg_ppp_neighborhood",
    hover_data=["neighborhood"],
    color="avg_ppp_neighborhood",
    color_continuous_scale=px.colors.sequential.Jet,
    size_max=30,
    zoom=9.5,
    height=600,
)
# OpenStreetMap tiles, no margins around the figure.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

In [None]:
# Fill missing 'avg_ppp_neighborhood' values with the value of the closest row
# (by raw latitude/longitude distance) that has one.
is_missing = df['avg_ppp_neighborhood'].isnull()
missing_values = df[is_missing]
non_missing_values = df[~is_missing]

nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')

# kneighbors() rejects an empty query, so skip the lookup when nothing is
# missing instead of crashing the notebook.
if not missing_values.empty:
    nn.fit(non_missing_values[['latitude', 'longitude']])
    distances, indices = nn.kneighbors(missing_values[['latitude', 'longitude']])

    # `indices` holds positions into non_missing_values (shape: n_missing x 1).
    donor_values = non_missing_values['avg_ppp_neighborhood'].to_numpy()[indices.ravel()]
    df.loc[is_missing, 'avg_ppp_neighborhood'] = donor_values

In [None]:
# Same map of the training rows, drawn again from the current state of df.
fig = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    hover_name="avg_ppp_neighborhood",
    hover_data=["neighborhood"],
    color="avg_ppp_neighborhood",
    color_continuous_scale=px.colors.sequential.Jet,
    size_max=30,
    zoom=9.5,
    height=600,
)
# OpenStreetMap tiles, no margins around the figure.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

In [None]:
# Apply the same transformation to the test set, reusing the per-neighborhood
# averages computed from the training data (avg_price_per_person).
df_test = pd.read_csv('../data/test_imputed.csv')

# Cast the coordinates to float, as is done for the training frame.
df_test['latitude'] = df_test['latitude'].astype(float)
df_test['longitude'] = df_test['longitude'].astype(float)

neighborhood_cols = [col for col in df_test.columns if col.startswith('neighbourhood_cleansed')]

# Collapse the neighbourhood_cleansed_* columns into one label column
# (literal prefix removal, not a regex).
df_test['neighborhood'] = (
    df_test[neighborhood_cols]
    .idxmax(axis=1)
    .str.replace('neighbourhood_cleansed_', '', regex=False)
)
# Neighborhoods without an entry in avg_price_per_person map to NaN here.
df_test['avg_ppp_neighborhood'] = df_test['neighborhood'].map(avg_price_per_person)
df_test = df_test.drop(columns=neighborhood_cols)

test_is_missing = df_test['avg_ppp_neighborhood'].isnull()
missing_values = df_test[test_is_missing]

# Fill the gaps from the closest training row (raw latitude/longitude distance).
# NOTE(review): assumes df['avg_ppp_neighborhood'] has no missing values left
# at this point -- confirm the training imputation ran first.
nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(df[['latitude', 'longitude']])

# kneighbors() rejects an empty query, so only look up donors when needed.
if not missing_values.empty:
    distances, indices = nn.kneighbors(missing_values[['latitude', 'longitude']])
    # `indices` holds positions into df (shape: n_missing x 1).
    donor_values = df['avg_ppp_neighborhood'].to_numpy()[indices.ravel()]
    df_test.loc[test_is_missing, 'avg_ppp_neighborhood'] = donor_values

# Query the training-fitted `knn` index with the test coordinates.
# NOTE(review): the result is not used anywhere in the visible code -- confirm
# whether it is still needed.
distances, indices = knn.kneighbors(df_test[['latitude', 'longitude']])

In [None]:
# Map of the test rows, colored by their neighborhood's average price per person.
fig = px.scatter_mapbox(
    df_test,
    lat="latitude",
    lon="longitude",
    hover_name="avg_ppp_neighborhood",
    hover_data=["neighborhood"],
    color="avg_ppp_neighborhood",
    color_continuous_scale=px.colors.sequential.Jet,
    size_max=30,
    zoom=9.5,
    height=600,
)
# OpenStreetMap tiles, no margins around the figure.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

In [None]:
# Landmark name -> [latitude, longitude].
landmarks = {
    'Statue of Liberty': [40.6892, -74.0445],
    'Central Park': [40.785091, -73.968285],
    'Empire State Building': [40.748817, -73.985428],
    'Brooklyn Bridge': [40.706086, -73.996864],
    'Times Square': [40.7580, -73.9855],
    'Rockefeller Center': [40.7587, -73.9787],
}

# Add one 'distance_to_<landmark>' column per landmark to both frames. The
# distance is the plain Euclidean norm of the latitude/longitude differences,
# i.e. it is expressed in coordinate units, not metres.
for landmark, coords in landmarks.items():
    landmark_lat, landmark_lon = coords
    for frame in (df, df_test):
        lat_diff = frame['latitude'] - landmark_lat
        lon_diff = frame['longitude'] - landmark_lon
        frame[f'distance_to_{landmark}'] = np.sqrt(lat_diff**2 + lon_diff**2)

# Smallest value across every 'distance_to_*' column found in the training frame;
# the same column list is applied to the test frame.
landmark_cols = [name for name in df.columns if name.startswith('distance_to_')]
df['distance_to_closest_landmark'] = df[landmark_cols].min(axis=1)
df_test['distance_to_closest_landmark'] = df_test[landmark_cols].min(axis=1)

In [None]:
# Training frame for export: drop the coordinates and the neighborhood label.
train_drop_cols = ['latitude', 'longitude', 'neighborhood']
final_train_df = df.drop(train_drop_cols, axis=1)
final_train_df.head()

In [None]:
# Drop the coordinates and the neighborhood label from the test frame as well.
test_drop_cols = ['latitude', 'longitude', 'neighborhood']
df_test = df_test.drop(test_drop_cols, axis=1)
df_test.head()

In [None]:
# Columns present in one frame but not the other, printed in both directions.
train_column_set = set(final_train_df.columns)
test_column_set = set(df_test.columns)

print(train_column_set.difference(test_column_set))
print(test_column_set.difference(train_column_set))

In [None]:
# Write both frames to CSV without the index column.
for frame, path in (
    (final_train_df, '../data/train_location.csv'),
    (df_test, '../data/test_location.csv'),
):
    frame.to_csv(path, index=False)