In [49]:
import pandas as pd
import numpy as np
import requests
import time
import os
import pickle
import math
from concurrent.futures import ThreadPoolExecutor

# Read the cleaned Warsaw neighbourhood listings into the working DataFrame
df = pd.read_csv('cleaned_datasets/neighbourhood_warszawa_updated.csv')
print(f"Loaded dataset with {len(df)} properties")

# POI type -> search radius in km. A property whose nearest POI of that type
# lies farther away than the radius gets no distance recorded for it.
poi_types = dict(
    tramStation=0.75,
    hospital=1.0,
    trainStation=2.0,
    outlinePark=0.5,
    outlineRiver=1.0,
    outlineAirport=2.0,
    outlineExpressWay=0.5,
)

Loaded dataset with 25861 properties


In [50]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Approximate the distance in km between two (lat, lon) points.

    Uses a flat Euclidean approximation with fixed scale factors chosen for
    Warsaw's latitude (~52 degrees): one degree of latitude is taken as
    111.1 km and one degree of longitude as 67.4 km.

    Args:
        lat1, lon1: Coordinates of the first point, in decimal degrees.
        lat2, lon2: Coordinates of the second point, in decimal degrees.

    Returns:
        The approximate distance in kilometres as a float.
    """
    km_per_deg_lat = 111.1
    km_per_deg_lon = 67.4

    # Convert the coordinate deltas to kilometres along each axis
    east_west = (lon2 - lon1) * km_per_deg_lon
    north_south = (lat2 - lat1) * km_per_deg_lat

    return math.sqrt(east_west * east_west + north_south * north_south)

# Helper function for single property distance calculation
def _calculate_single_min_distance(prop_lat, prop_lon, poi_lats, poi_lons, max_distance):
    import numpy as np
    
    # Calculate squared distances first (avoids unnecessary sqrt operations)
    lat_km = 111.1
    lon_km = 67.4
    
    dx = (poi_lons - prop_lon) * lon_km
    dy = (poi_lats - prop_lat) * lat_km
    
    # Calculate squared distances
    squared_distances = dx*dx + dy*dy
    
    # Find the minimum distance
    min_squared_dist = np.min(squared_distances)
    
    # Only calculate sqrt for the minimum
    min_dist = np.sqrt(min_squared_dist)
    
    # Return None if distance is greater than max_distance
    return min_dist if min_dist <= max_distance else None

def calculate_min_distance(properties_df, poi_locations, max_distance):
    """Compute, for every property, the distance in km to its nearest POI.

    Properties are processed in chunks; within a chunk the full
    property-by-POI distance matrix is built with NumPy broadcasting, so no
    Python-level loop runs per property. Distances use the flat Euclidean
    approximation with 111.1 km per degree of latitude and 67.4 km per degree
    of longitude.

    Args:
        properties_df: DataFrame with ``latitude`` and ``longitude`` columns
            in decimal degrees.
        poi_locations: Sequence of ``(lat, lon)`` pairs (list of tuples or an
            ``(n, 2)`` array). ``None`` or an empty sequence means no POIs.
        max_distance: Cut-off in km.

    Returns:
        A list with one entry per row of ``properties_df``, in row order: the
        minimum distance in km, or ``None`` when no POI lies within
        ``max_distance`` (including rows with NaN coordinates).
    """
    # len() rather than truthiness so that NumPy arrays are accepted too
    if poi_locations is None or len(poi_locations) == 0:
        return [None] * len(properties_df)

    poi_lats = np.array([loc[0] for loc in poi_locations], dtype=float)
    poi_lons = np.array([loc[1] for loc in poi_locations], dtype=float)

    lat_km = 111.1
    lon_km = 67.4
    # Bounds the temporary (chunk_size x n_pois) matrices held in memory
    chunk_size = 1000

    results = []
    for start in range(0, len(properties_df), chunk_size):
        chunk = properties_df.iloc[start:start + chunk_size]
        prop_lats = chunk['latitude'].to_numpy(dtype=float)
        prop_lons = chunk['longitude'].to_numpy(dtype=float)

        # Rows are properties, columns are POIs
        dx = (poi_lons[np.newaxis, :] - prop_lons[:, np.newaxis]) * lon_km
        dy = (poi_lats[np.newaxis, :] - prop_lats[:, np.newaxis]) * lat_km

        # Minimise over squared distances, then take one sqrt per property
        min_dists = np.sqrt(np.min(dx * dx + dy * dy, axis=1))

        results.extend(d if d <= max_distance else None for d in min_dists)

    return results

In [51]:
# Bounding box appended to every Overpass selector. Overpass QL orders a bbox
# as (south, west, north, east).
_OSM_BBOX = "(52.1, 20.8, 52.4, 21.3)"

_OVERPASS_URL = "https://overpass-api.de/api/interpreter"

# poi_type -> (element selectors, Overpass `out` mode).
# "body" returns node coordinates, "center" adds a centre point for ways,
# "geom" returns the full point geometry of each way.
_OSM_QUERY_SPECS = {
    "tramStation": (
        [
            'node["public_transport"="stop_position"]["tram"="yes"]',
            'node["railway"="tram_stop"]',
        ],
        "body",
    ),
    "hospital": (
        [
            'node["amenity"="hospital"]',
            'way["amenity"="hospital"]',
            'node["amenity"="clinic"]',
        ],
        "center",
    ),
    "trainStation": (
        [
            'node["railway"="station"]',
            'way["railway"="station"]',
            'node["railway"="halt"]',
        ],
        "center",
    ),
    "outlinePark": (
        ['way["leisure"="park"]', 'way["landuse"="forest"]'],
        "geom",
    ),
    "outlineRiver": (
        ['way["waterway"="river"]', 'way["natural"="water"]'],
        "geom",
    ),
    "outlineAirport": (
        ['way["aeroway"="aerodrome"]', 'way["aeroway"="runway"]'],
        "geom",
    ),
    "outlineExpressWay": (
        [
            'way["highway"="motorway"]',
            'way["highway"="trunk"]',
            'way["highway"="primary"]',
        ],
        "geom",
    ),
}


def _build_overpass_query(poi_type):
    """Assemble the Overpass QL query text for a POI type in _OSM_QUERY_SPECS."""
    selectors, out_mode = _OSM_QUERY_SPECS[poi_type]
    union_body = "\n".join(f"  {selector}{_OSM_BBOX};" for selector in selectors)
    return f"[out:json];\n(\n{union_body}\n);\nout {out_mode};\n"


def _request_overpass_json(query, poi_type, max_retries=3, retry_delay=5):
    """POST the query to Overpass; return the decoded JSON or None on failure."""
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Python/Requests'
    }

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(_OVERPASS_URL, data=query, headers=headers, timeout=30)
            if response.status_code == 200:
                return response.json()
            # A non-200 reply is never parsed, not even on the last attempt,
            # so an error payload cannot be mistaken for a valid result.
            print(f"  Warning: Status code {response.status_code} for {poi_type}, attempt {attempt}")
        except Exception as e:
            # Deliberately best-effort: log, then retry or give up below
            print(f"  Error on attempt {attempt} for {poi_type}: {e}")

        if attempt < max_retries:
            time.sleep(retry_delay)  # Wait before retry

    print(f"  Failed to fetch data for {poi_type} after {max_retries} attempts")
    return None


def _extract_locations(elements, use_geometry):
    """Turn Overpass elements into a flat list of (lat, lon) tuples."""
    locations = []
    for element in elements:
        if use_geometry:
            # Outline features: every vertex of the way's geometry
            points = element.get('geometry') or []
        elif element.get('type') == 'node':
            points = [element]
        else:
            # Way or relation: use its centre point when one is present
            points = [element.get('center') or {}]

        for point in points:
            if not point:
                continue
            lat = point.get('lat')
            lon = point.get('lon')
            # Compare against None so a legitimate 0.0 coordinate is kept
            if lat is not None and lon is not None:
                locations.append((lat, lon))
    return locations


def fetch_osm_data(poi_type, city="Warszawa"):
    """Fetch (lat, lon) points for a POI type from OpenStreetMap, with caching.

    Results are cached as a pickle in ``osm_cache/<poi_type>_<city>.pkl`` and
    served from there on later calls. Only non-empty results are cached, so a
    failed or empty response is retried on the next call.

    Args:
        poi_type: One of the keys of ``_OSM_QUERY_SPECS``. Types starting with
            ``"outline"`` return the vertices of the matched ways; other types
            return one point per node or per way centre.
        city: Label used in the cache file name only. The queried area is the
            fixed bounding box ``_OSM_BBOX`` regardless of this value.

    Returns:
        A list of ``(lat, lon)`` tuples. Outline types with more than 1000
        points are sampled down to 1000 using a fixed seed. Returns ``[]`` for
        an unknown POI type or when the request fails after all retries.
    """
    import random  # not imported at file level

    # exist_ok avoids a race when several worker threads get here at once
    cache_dir = "osm_cache"
    os.makedirs(cache_dir, exist_ok=True)

    cache_file = os.path.join(cache_dir, f"{poi_type}_{city}.pkl")
    if os.path.exists(cache_file):
        # NOTE: pickle.load can execute arbitrary code; this is acceptable only
        # because the cache files are written by this function itself.
        with open(cache_file, 'rb') as f:
            return pickle.load(f)

    if poi_type not in _OSM_QUERY_SPECS:
        return []

    data = _request_overpass_json(_build_overpass_query(poi_type), poi_type)
    if data is None:
        return []

    is_outline = poi_type.startswith("outline")
    locations = _extract_locations(data.get('elements', []), use_geometry=is_outline)

    # Log the number of points found
    print(f"  Retrieved {len(locations)} coordinate points for {poi_type}")

    # Subsample large point sets to improve performance; the fixed seed makes
    # the sample reproducible across fresh runs.
    if is_outline and len(locations) > 1000:
        locations = random.Random(42).sample(locations, 1000)
        print(f"  Sampled down to {len(locations)} points for performance")

    # Skip caching empty results so a transient empty reply is not kept forever
    if locations:
        with open(cache_file, 'wb') as f:
            pickle.dump(locations, f)

    return locations

In [52]:
def process_poi_type(poi_type_info):
    """Fetch one POI type and measure every property's distance to it.

    Reads the module-level ``df`` for the property coordinates and prints
    progress and timing information along the way.

    Args:
        poi_type_info: A ``(poi_type, max_distance)`` pair, with the distance
            cut-off in km.

    Returns:
        A ``(poi_type, column_name, distances)`` tuple. ``column_name`` is
        ``"minDistTo"`` followed by the POI type with any ``"outline"``
        substring removed; ``distances`` is the list returned by
        ``calculate_min_distance``.
    """
    poi_type, max_distance = poi_type_info
    started_at = time.time()
    print(f"Processing {poi_type}...")

    # Served from the on-disk cache when one exists
    locations = fetch_osm_data(poi_type)
    print(f"  Found {len(locations)} {poi_type} locations")

    distances = calculate_min_distance(df, locations, max_distance)

    # Entries left as None are properties with no POI inside the cut-off
    in_range = len([d for d in distances if d is not None])
    elapsed_time = time.time() - started_at
    print(f"  {in_range} properties have a {poi_type} within {max_distance}km (took {elapsed_time:.2f}s)")

    name_suffix = poi_type.replace('outline', '')
    return poi_type, f"minDistTo{name_suffix}", distances

In [53]:
# Compute the POI distance columns in parallel, one task per POI type
print(f"Starting parallel processing of {len(poi_types)} POI types...")
start_total = time.time()

poi_results = []
with ThreadPoolExecutor(max_workers=min(len(poi_types), 4)) as executor:
    # Submit all tasks
    future_to_poi = {
        executor.submit(process_poi_type, (poi_type, max_distance)): poi_type
        for poi_type, max_distance in poi_types.items()
    }

    # Collect results in submission order (not completion order) so the new
    # columns always appear in the same order. df is deliberately NOT modified
    # here: worker threads may still be reading it.
    for future, submitted_poi in future_to_poi.items():
        try:
            poi_results.append(future.result())
        except Exception as e:
            # Best-effort: a failed POI type is reported and skipped
            print(f"Error processing {submitted_poi}: {e}")

# The executor has shut down, so every worker is done and df is safe to mutate
for poi_type, column_name, distances in poi_results:
    df[column_name] = distances
    # Second copy of the same distances under the bare POI-type name
    df[poi_type] = df[column_name]

elapsed_total = time.time() - start_total
print(f"All POI types processed in {elapsed_total:.2f} seconds")

# Derive thresholded copies of the pre-existing distance columns: values above
# the cut-off (in km) become missing. The city-centre distance has no cut-off.
df['cityCentre'] = df['mindistocenter']
df['metroStation'] = df['mindistometro'].where(df['mindistometro'] <= 2.0)
df['uni'] = df['mindistouni'].where(df['mindistouni'] <= 2.0)
df['mall'] = df['mindistotradecenter'].where(df['mindistotradecenter'] <= 1.0)

# Save the updated dataset
df.to_csv('cleaned_datasets/neighbourhood_warszawa_updated_with_pois.csv', index=False)

print("Dataset updated successfully!")

Starting parallel processing of 7 POI types...
Processing tramStation...
Processing hospital...
Processing trainStation...
Processing outlinePark...
  Found 657 tramStation locations
  Found 236 hospital locations
  Found 1000 outlinePark locations
  Found 117 trainStation locations
  20302 properties have a hospital within 1.0km (took 0.45s)  21061 properties have a trainStation within 2.0km (took 0.46s)
Processing outlineRiver...

Processing outlineAirport...
  Found 802 outlineAirport locations
  Found 1000 outlineRiver locations
  16524 properties have a tramStation within 0.75km (took 4.61s)
Processing outlineExpressWay...
  Found 1000 outlineExpressWay locations
  3343 properties have a outlineAirport within 2.0km (took 4.86s)
  11648 properties have a outlinePark within 0.5km (took 5.54s)
  18044 properties have a outlineRiver within 1.0km (took 5.10s)
  6993 properties have a outlineExpressWay within 0.5km (took 1.17s)
All POI types processed in 5.79 seconds
Dataset updated successfully!

In [54]:
# Bring every distance column onto the minDistTo<Name> camelCase convention
column_renames = {
    # Columns that came with the original dataset
    'mindistometro': 'minDistToMetro',
    'mindistouni': 'minDistToUni',
    'mindistotradecenter': 'minDistToMall',
    'mindistocenter': 'minDistToCenter',

    # Columns added by the POI step - only the capitalisation changes
    'minDistTotramStation': 'minDistToTramStation',
    'minDistTohospital': 'minDistToHospital',
    'minDistTotrainStation': 'minDistToTrainStation',
    'minDistToPark': 'minDistToPark',
    'minDistToRiver': 'minDistToRiver',
    'minDistToAirport': 'minDistToAirport',
    'minDistToExpressWay': 'minDistToExpressWay'
}

# Rename in a single call, restricted to the columns actually present
df = df.rename(
    columns={old: new for old, new in column_renames.items() if old in df.columns}
)

# Show the column list for debugging
print("Available columns:", df.columns.tolist())

# Coordinates and price come first in the output, when available
columns_to_keep = []
for required_name in ('longitude', 'latitude', 'pricePerSqm'):
    if required_name in df.columns:
        columns_to_keep.append(required_name)
    else:
        print(f"Warning: '{required_name}' column not found in the dataset")

# Followed by every distance column, in their current order
columns_to_keep += [col for col in df.columns if col.startswith('minDistTo')]

# The rename targets are exactly the distance columns the output should have
expected_columns = list(column_renames.values())

# Report any expected distance column that is absent
missing_columns = [col for col in expected_columns if col not in df.columns]
if missing_columns:
    print(f"Warning: These expected columns are missing: {', '.join(missing_columns)}")

# Reduce the frame to the selected columns and write it out
if not columns_to_keep:
    print("Error: No columns to keep. Check column names in the dataset.")
else:
    df = df[columns_to_keep]

    # Save the final dataset
    df.to_csv('cleaned_datasets/neighbourhood_warszawa_distances.csv', index=False)

    print("Final dataset saved with distance columns!")
    print(f"Columns in final dataset: {', '.join(df.columns)}")

Available columns: ['pricePerSqm', 'latitude', 'longitude', 'minDistToMetro', 'minDistToUni', 'minDistToMall', 'minDistToCenter', 'minDistToTramStation', 'tramStation', 'minDistToHospital', 'hospital', 'minDistToTrainStation', 'trainStation', 'minDistToPark', 'outlinePark', 'minDistToRiver', 'outlineRiver', 'minDistToAirport', 'outlineAirport', 'minDistToExpressWay', 'outlineExpressWay', 'cityCentre', 'metroStation', 'uni', 'mall']
Final dataset saved with distance columns!
Columns in final dataset: longitude, latitude, pricePerSqm, minDistToMetro, minDistToUni, minDistToMall, minDistToCenter, minDistToTramStation, minDistToHospital, minDistToTrainStation, minDistToPark, minDistToRiver, minDistToAirport, minDistToExpressWay


In [55]:
# %pip (rather than a bare `pip`, which only works with automagic enabled)
# installs into the environment of the running kernel.
%pip install osmnx matplotlib

Note: you may need to restart the kernel to use updated packages.
