In [None]:
!pip install geopy osmnx folium

Collecting osmnx
  Downloading osmnx-1.9.3-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.2/107.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: osmnx
Successfully installed osmnx-1.9.3


In [None]:
!pip install haversine

Collecting haversine
  Downloading haversine-2.8.1-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.1


In [None]:
!pip install numba



# Final version before the first meeting with the Director

In [None]:
import pandas as pd
import numba
import numpy as np
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from haversine import haversine, Unit
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import folium
import osmnx as ox

# Load the hydrography data from the CSV file
file_path = '/content/NYC_Planimetric_Database__Hydrography_20240722.csv'
hydrography_df = pd.read_csv(file_path)

# Parse the WKT strings in 'the_geom' into shapely geometries
hydrography_df['geometry'] = hydrography_df['the_geom'].apply(wkt.loads)
# Declare the CRS explicitly: this layer is later spatially joined with points
# created as EPSG:4326, and a frame without a CRS triggers a mismatch warning.
# NOTE(review): assumes 'the_geom' holds WGS84 lon/lat coordinates - confirm
# against the dataset's documentation.
hydrography_gdf = gpd.GeoDataFrame(hydrography_df, geometry='geometry', crs="EPSG:4326")

# Load data for EV station prediction
stations_df = pd.read_excel('/content/NYC_Stations.xlsx')
sessions_df = pd.read_csv('/content/Charging_Sessions_Dataset.csv')
trips_df = pd.read_csv('/content/Valid_Synthetic_Vehicle_Trips_NYC_minutes.csv')

# Compute station usage: number of charging sessions per station, joined to
# the station metadata (inner join, so stations without sessions are dropped)
station_usage = sessions_df['ChargingStationID'].value_counts().reset_index()
station_usage.columns = ['ChargingStationID', 'SessionCount']
stations_usage_df = pd.merge(station_usage, stations_df, left_on='ChargingStationID', right_on='ID')

# Extract trip locations: stack trip start points and stop points into one
# two-column ('lat', 'long') frame
trip_locations = pd.concat([
    trips_df[['Start_Location(lat)', 'Start_Location(long)']].rename(columns={'Start_Location(lat)': 'lat', 'Start_Location(long)': 'long'}),
    trips_df[['Stop_Location(lat)', 'Stop_Location(long)']].rename(columns={'Stop_Location(lat)': 'lat', 'Stop_Location(long)': 'long'})
])

# Create high demand locations: every trip endpoint plus each station's
# coordinates repeated once per recorded session, so busy stations weigh more.
# np.repeat builds the repeated rows in one step instead of one small
# DataFrame per station.
session_counts = stations_usage_df['SessionCount'].to_numpy()
station_points = pd.DataFrame({
    'lat': np.repeat(stations_usage_df['Latitude'].to_numpy(), session_counts),
    'long': np.repeat(stations_usage_df['Longitude'].to_numpy(), session_counts),
})
high_demand_locations = pd.concat([trip_locations, station_points], ignore_index=True)

# Downsample data if too large
if len(high_demand_locations) > 100000:  # Adjust threshold based on available memory
    high_demand_locations = high_demand_locations.sample(n=100000, random_state=42)

# Convert coordinates to radians for haversine calculation
high_demand_locations['lat_rad'] = np.radians(high_demand_locations['lat'])
high_demand_locations['long_rad'] = np.radians(high_demand_locations['long'])
# Define a function to calculate haversine distance matrix
def haversine_distance_matrix(coords):
    """Return the symmetric matrix of pairwise great-circle distances in km.

    Parameters
    ----------
    coords : array-like of shape (n, 2)
        Points as (latitude, longitude) pairs in decimal degrees.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        Entry [i, j] is the haversine distance in kilometres between point i
        and point j; the diagonal is zero. An empty input gives a (0, 0) array.

    Notes
    -----
    Vectorised with NumPy broadcasting instead of a Python double loop. Uses
    the mean Earth radius 6371.0088 km. Coordinates are not range-checked.
    """
    earth_radius_km = 6371.0088

    points = np.asarray(coords, dtype=float)
    n_points = len(points)
    if n_points == 0:
        return np.zeros((0, 0))
    points = points.reshape(n_points, -1)

    lat = np.radians(points[:, 0])
    lon = np.radians(points[:, 1])

    # Broadcast column vector against row vector to get all pairwise differences.
    dlat = lat[:, None] - lat[None, :]
    dlon = lon[:, None] - lon[None, :]
    cos_lat = np.cos(lat)

    a = np.sin(dlat / 2.0) ** 2 + cos_lat[:, None] * cos_lat[None, :] * np.sin(dlon / 2.0) ** 2
    # Clip guards against tiny floating-point excursions outside [0, 1]
    # that would make sqrt/arcsin return NaN.
    return 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

# Prepare coordinates for DBSCAN (the haversine metric expects [lat, long] in radians)
coords = high_demand_locations[['lat', 'long']].values
coords_rad = np.radians(coords)  # computed once and reused below

# Use NearestNeighbors to find appropriate eps value.
# The query points are the training points themselves, so each point's nearest
# neighbour is itself; column 4 is therefore the distance to its 4th other point.
neighbors = NearestNeighbors(n_neighbors=5, metric='haversine').fit(coords_rad)
distances, _ = neighbors.kneighbors(coords_rad)
distances = np.sort(distances[:, 4])

# Use the median of the distances as eps value (in radians on the unit sphere)
eps = np.median(distances)
if eps <= 0:
    # Station coordinates are repeated once per session, so more than half of
    # the points can have five exact duplicates, giving a median of 0.
    # DBSCAN rejects eps <= 0, so fall back to the median positive distance.
    positive_distances = distances[distances > 0]
    if positive_distances.size == 0:
        raise ValueError("All demand points coincide; cannot derive a positive eps for DBSCAN")
    eps = np.median(positive_distances)

# Perform DBSCAN clustering with tuned parameters
db = DBSCAN(eps=eps * 2, min_samples=5, metric='haversine')  # Adjusted eps and min_samples
high_demand_locations['Cluster'] = db.fit_predict(coords_rad)

# Calculate cluster centers as the mean lat/long of each cluster (label -1 is DBSCAN noise)
cluster_centers = high_demand_locations[high_demand_locations['Cluster'] != -1].groupby('Cluster')[['lat', 'long']].mean().reset_index()

# Post-process to merge nearby clusters
def merge_clusters(cluster_centers, threshold_km):
    """Merge cluster centres that lie close together into single centres.

    Runs complete-linkage agglomerative clustering on the pairwise haversine
    distance matrix of the centres. With complete linkage, two groups are
    merged only while the largest distance between their members stays below
    ``threshold_km``.

    Parameters
    ----------
    cluster_centers : pandas.DataFrame
        Must contain 'lat' and 'long' columns (decimal degrees). A
        'MergedCluster' label column is added to this frame in place.
    threshold_km : float
        Distance threshold in kilometres passed as ``distance_threshold``.

    Returns
    -------
    pandas.DataFrame
        One row per merged cluster with columns 'MergedCluster', 'lat' and
        'long', where lat/long are the mean of the member centres.
    """
    if len(cluster_centers) < 2:
        # AgglomerativeClustering requires at least two samples; with zero or
        # one centre there is nothing to merge, so each row is its own cluster.
        cluster_centers['MergedCluster'] = np.arange(len(cluster_centers))
        return cluster_centers.groupby('MergedCluster')[['lat', 'long']].mean().reset_index()

    cluster_coords = cluster_centers[['lat', 'long']].values
    distance_matrix = haversine_distance_matrix(cluster_coords)

    agglo_kwargs = dict(n_clusters=None, distance_threshold=threshold_km, linkage='complete')
    try:
        # scikit-learn >= 1.2 names this parameter `metric`; `affinity` was
        # deprecated in 1.2 and removed in 1.4.
        agglo = AgglomerativeClustering(metric='precomputed', **agglo_kwargs)
    except TypeError:
        # Older scikit-learn releases only know the `affinity` spelling.
        agglo = AgglomerativeClustering(affinity='precomputed', **agglo_kwargs)

    cluster_labels = agglo.fit_predict(distance_matrix)
    cluster_centers['MergedCluster'] = cluster_labels
    merged_centers = cluster_centers.groupby('MergedCluster')[['lat', 'long']].mean().reset_index()
    return merged_centers

# Merge clusters with a threshold of 5 km
merged_cluster_centers = merge_clusters(cluster_centers, threshold_km=5)

# Turn the merged centres into point geometries (x = longitude, y = latitude)
gdf_cluster_centers = gpd.GeoDataFrame(merged_cluster_centers, geometry=gpd.points_from_xy(merged_cluster_centers.long, merged_cluster_centers.lat), crs="EPSG:4326")

# Left-join each centre to the hydrography polygon containing it, if any.
# `predicate` replaces the `op` keyword, which was deprecated in GeoPandas 0.10
# and removed in 1.0.
valid_cluster_centers = gpd.sjoin(gdf_cluster_centers, hydrography_gdf, how="left", predicate='within')
# Keep only centres with no matching polygon, i.e. not inside a water feature
valid_cluster_centers = valid_cluster_centers[valid_cluster_centers.index_right.isna()]

# Plain DataFrame without the geometry column for the downstream steps
valid_cluster_centers_df = pd.DataFrame(valid_cluster_centers.drop(columns='geometry'))

def snap_to_road(lat, lon):
    """Return the (lat, lon) of the road-network node nearest to a point.

    Fetches the OSM 'drive' network around the point with osmnx (dist=1000)
    and looks up the graph node closest to it. Each call performs its own
    network download.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point in decimal degrees.

    Returns
    -------
    tuple
        The ('y', 'x') attributes of the nearest node, i.e. (latitude, longitude).
    """
    drive_graph = ox.graph_from_point((lat, lon), dist=1000, network_type='drive')
    # nearest_nodes takes X (longitude) before Y (latitude)
    node_id = ox.distance.nearest_nodes(drive_graph, lon, lat)
    node_attrs = drive_graph.nodes[node_id]
    return node_attrs['y'], node_attrs['x']

# Snap every remaining cluster centre to its nearest drivable road node.
# Each call downloads a local road graph, so this step is network-bound.
snapped_coords = [
    snap_to_road(row['lat'], row['long'])
    for _, row in valid_cluster_centers_df.iterrows()
]

# Update the valid cluster centers DataFrame with snapped coordinates
valid_cluster_centers_df['snapped_lat'] = [snapped_lat for snapped_lat, _ in snapped_coords]
valid_cluster_centers_df['snapped_long'] = [snapped_lon for _, snapped_lon in snapped_coords]

# NOTE: these two features are synthetic placeholders drawn uniformly at random,
# not measured data. A seeded generator keeps them reproducible across runs.
candidate_rng = np.random.default_rng(42)
valid_cluster_centers_df['population_density'] = candidate_rng.uniform(1000, 10000, len(valid_cluster_centers_df))
valid_cluster_centers_df['proximity_to_poi'] = candidate_rng.uniform(0, 5, len(valid_cluster_centers_df))  # Distance to nearest point of interest

# Prepare the dataset for model training: existing station coordinates,
# renamed to match the column names of the candidate locations.
features = stations_usage_df[['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'snapped_lat', 'Longitude': 'snapped_long'}
)
# NOTE: synthetic placeholder features drawn uniformly at random, not measured
# data. Seeded so the fitted model and its predictions are reproducible.
training_rng = np.random.default_rng(7)
features['population_density'] = training_rng.uniform(1000, 10000, len(features))
features['proximity_to_poi'] = training_rng.uniform(0, 5, len(features))
target = stations_usage_df['SessionCount']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Model training with Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Report the error on the held-out split, which was otherwise never used
test_mse = mean_squared_error(y_test, best_model.predict(X_test))
print(f"Held-out test MSE: {test_mse:.2f} (RMSE: {np.sqrt(test_mse):.2f})")

# Predict session counts for new station locations.
# .copy() makes this an independent frame, so adding a column below does not
# write to a slice of valid_cluster_centers_df (SettingWithCopyWarning).
new_station_locations = valid_cluster_centers_df[['snapped_lat', 'snapped_long', 'population_density', 'proximity_to_poi']].copy()
predicted_session_counts = best_model.predict(new_station_locations)
new_station_locations['PredictedSessionCount'] = predicted_session_counts
valid_cluster_centers_df['PredictedSessionCount'] = predicted_session_counts
# Calculate evaluation metrics for each predicted location
def evaluate_metrics_per_location(centers=None, trips=None, coverage_radius_km=1.0):
    """Compute per-location evaluation metrics for the candidate stations.

    Parameters
    ----------
    centers : pandas.DataFrame, optional
        Candidate locations with columns 'lat', 'long', 'snapped_lat',
        'snapped_long', 'PredictedSessionCount', 'population_density' and
        'proximity_to_poi'. Defaults to the notebook-level
        ``valid_cluster_centers_df``.
    trips : pandas.DataFrame, optional
        Trip endpoints with columns 'lat' and 'long' (decimal degrees).
        Defaults to the notebook-level ``trip_locations``.
    coverage_radius_km : float, optional
        Radius in kilometres around the snapped location within which a trip
        endpoint counts as covered. Defaults to 1.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with columns 'lat', 'long' (the snapped
        coordinates), 'PredictedSessionCount', 'PopulationDensity',
        'ProximityToPOI', 'MainRoadDistance(m)' (distance from the cluster
        centre to its snapped road node, in metres) and 'HighDemandCoverage'
        (percentage of trip endpoints within the coverage radius; 0 when
        there are no trip endpoints).
    """
    if centers is None:
        centers = valid_cluster_centers_df
    if trips is None:
        trips = trip_locations

    columns = ['lat', 'long', 'PredictedSessionCount', 'PopulationDensity',
               'ProximityToPOI', 'MainRoadDistance(m)', 'HighDemandCoverage']
    earth_radius_km = 6371.0088

    # Convert the trip endpoints once; they are the same for every candidate.
    n_trips = len(trips)
    trip_lat = np.radians(trips['lat'].to_numpy(dtype=float))
    trip_lon = np.radians(trips['long'].to_numpy(dtype=float))
    cos_trip_lat = np.cos(trip_lat)

    metrics = []
    for _, row in centers.iterrows():
        lat, lon = row['lat'], row['long']
        snapped_lat, snapped_lon = row['snapped_lat'], row['snapped_long']

        # Proximity to road network
        road_distance = haversine((lat, lon), (snapped_lat, snapped_lon), unit=Unit.METERS)

        # High-demand coverage: vectorised haversine from this location to all
        # trip endpoints, replacing a Python loop over every trip row.
        if n_trips:
            lat_rad = np.radians(snapped_lat)
            lon_rad = np.radians(snapped_lon)
            a = (np.sin((trip_lat - lat_rad) / 2.0) ** 2
                 + cos_trip_lat * np.cos(lat_rad) * np.sin((trip_lon - lon_rad) / 2.0) ** 2)
            dist_km = 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
            coverage_pct = np.count_nonzero(dist_km <= coverage_radius_km) / n_trips * 100
        else:
            # Avoid dividing by zero when there are no trip endpoints.
            coverage_pct = 0.0

        # Append metrics for this location
        metrics.append({
            'lat': snapped_lat,
            'long': snapped_lon,
            'PredictedSessionCount': row['PredictedSessionCount'],
            'PopulationDensity': row['population_density'],
            'ProximityToPOI': row['proximity_to_poi'],
            'MainRoadDistance(m)': road_distance,
            'HighDemandCoverage': coverage_pct
        })

    # Passing the column list keeps the schema stable even with zero candidates.
    return pd.DataFrame(metrics, columns=columns)

# Calculate metrics for each location. The function is called without
# arguments, so it works on the notebook-level valid_cluster_centers_df and
# trip_locations frames.
location_metrics_df = evaluate_metrics_per_location()

# Display metrics for each location (plain-text dump of the whole frame)
print(location_metrics_df)

# Visualization
def visualize_locations(new_stations_gdf, old_stations_gdf, trips_location,
                        n_existing=50, demand_color='orange', random_state=None):
    """Build a folium map of proposed stations, existing stations and trip points.

    Parameters
    ----------
    new_stations_gdf : pandas.DataFrame
        Proposed stations with columns 'lat', 'long', 'PredictedSessionCount',
        'PopulationDensity', 'ProximityToPOI', 'MainRoadDistance(m)' and
        'HighDemandCoverage'. Drawn as red markers with a metrics popup.
    old_stations_gdf : pandas.DataFrame
        Existing stations with columns 'Latitude', 'Longitude' and
        'SessionCount'. A random sample is drawn as blue markers.
    trips_location : pandas.DataFrame
        Trip endpoints with columns 'lat' and 'long'. Every row is drawn as a
        10 m circle, so very large frames produce very large maps.
    n_existing : int, optional
        Maximum number of existing stations to sample (default 50). Capped at
        the number of available stations, so small inputs no longer raise.
    demand_color : str, optional
        Outline and fill colour of the trip circles (default 'orange').
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the sample reproducible.

    Returns
    -------
    folium.Map
        The populated map, centred on [40.7128, -74.0060] at zoom 12.
    """
    # Create a folium map centered around NYC
    m = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

    # Add proposed (predicted) EV station locations
    for _, row in new_stations_gdf.iterrows():
        popup_text = (f"Predicted Session Count: {row['PredictedSessionCount']:.0f}<br>"
                      f"Population Density: {row['PopulationDensity']:.0f}<br>"
                      f"Proximity to POI: {row['ProximityToPOI']:.2f} km<br>"
                      f"Distance to Road: {row['MainRoadDistance(m)']:.2f} m<br>"
                      f"High-Demand Coverage: {row['HighDemandCoverage']:.2f}%")
        folium.Marker(
            location=[row['lat'], row['long']],
            popup=popup_text,
            icon=folium.Icon(color='red', icon='info-sign')
        ).add_to(m)

    # Add a random sample of existing EV station locations.
    # sample(n=...) raises if n exceeds the frame length, hence the cap.
    n_sample = min(n_existing, len(old_stations_gdf))
    random_df = old_stations_gdf.sample(n=n_sample, random_state=random_state)
    for _, row in random_df.iterrows():
        popup_text = (f"Session Count: {row['SessionCount']:.0f}<br>")
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=popup_text,
            icon=folium.Icon(color='blue', icon='info-sign')
        ).add_to(m)

    # Add high demand areas
    for _, row in trips_location.iterrows():
        folium.Circle(
            location=[row['lat'], row['long']],
            radius=10,  # 10 meters
            color=demand_color,
            fill=True,
            fill_color=demand_color,
            fill_opacity=0.2
        ).add_to(m)

    # The road-network overlay is intentionally not drawn (no roads_gdf is built).
    return m

# Create GeoDataFrame for roads
#roads_gdf = ox.graph_to_gdfs(ox.graph_from_place('New York City, New York, USA', network_type='all'), nodes=False)

# Visualize the results
output_html = 'EV_Station_Visualization_5.html'
visualization = visualize_locations(location_metrics_df, stations_usage_df, trip_locations)
visualization.save(output_html)

# Report the file that was actually written (the message used to name a different file)
print(f"Visualization saved to '{output_html}'")


In [None]:
# Leftover sanity check that `numba` imported as a module; numba is not otherwise used in the visible cells.
print(type(numba))

<class 'module'>


In [None]:
def visualize_locations(new_stations_gdf, old_stations_gdf, trips_location,
                        n_existing=50, demand_color='blue', random_state=None):
    """Build a folium map of proposed stations, existing stations and trip points.

    This cell redefines the function of the same name from the main pipeline
    cell; the only original difference is that trip circles default to blue.

    Parameters
    ----------
    new_stations_gdf : pandas.DataFrame
        Proposed stations with columns 'lat', 'long', 'PredictedSessionCount',
        'PopulationDensity', 'ProximityToPOI', 'MainRoadDistance(m)' and
        'HighDemandCoverage'. Drawn as red markers with a metrics popup.
    old_stations_gdf : pandas.DataFrame
        Existing stations with columns 'Latitude', 'Longitude' and
        'SessionCount'. A random sample is drawn as blue markers.
    trips_location : pandas.DataFrame
        Trip endpoints with columns 'lat' and 'long'. Every row is drawn as a
        10 m circle, so very large frames produce very large maps.
    n_existing : int, optional
        Maximum number of existing stations to sample (default 50). Capped at
        the number of available stations, so small inputs no longer raise.
    demand_color : str, optional
        Outline and fill colour of the trip circles (default 'blue').
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the sample reproducible.

    Returns
    -------
    folium.Map
        The populated map, centred on [40.7128, -74.0060] at zoom 12.
    """
    # Create a folium map centered around NYC
    m = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

    # Add proposed (predicted) EV station locations
    for _, row in new_stations_gdf.iterrows():
        popup_text = (f"Predicted Session Count: {row['PredictedSessionCount']:.0f}<br>"
                      f"Population Density: {row['PopulationDensity']:.0f}<br>"
                      f"Proximity to POI: {row['ProximityToPOI']:.2f} km<br>"
                      f"Distance to Road: {row['MainRoadDistance(m)']:.2f} m<br>"
                      f"High-Demand Coverage: {row['HighDemandCoverage']:.2f}%")
        folium.Marker(
            location=[row['lat'], row['long']],
            popup=popup_text,
            icon=folium.Icon(color='red', icon='info-sign')
        ).add_to(m)

    # Add a random sample of existing EV station locations.
    # sample(n=...) raises if n exceeds the frame length, hence the cap.
    n_sample = min(n_existing, len(old_stations_gdf))
    random_df = old_stations_gdf.sample(n=n_sample, random_state=random_state)
    for _, row in random_df.iterrows():
        popup_text = (f"Session Count: {row['SessionCount']:.0f}<br>")
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            popup=popup_text,
            icon=folium.Icon(color='blue', icon='info-sign')
        ).add_to(m)

    # Add high demand areas
    for _, row in trips_location.iterrows():
        folium.Circle(
            location=[row['lat'], row['long']],
            radius=10,  # 10 meters
            color=demand_color,
            fill=True,
            fill_color=demand_color,
            fill_opacity=0.2
        ).add_to(m)

    # The road-network overlay is intentionally not drawn (no roads_gdf is built).
    return m

# Create GeoDataFrame for roads
#roads_gdf = ox.graph_to_gdfs(ox.graph_from_place('New York City, New York, USA', network_type='all'), nodes=False)

# Visualize the results
output_html = 'EV_Station_Visualization_5.html'
visualization = visualize_locations(location_metrics_df, stations_usage_df, trip_locations)
visualization.save(output_html)

# Report the file that was actually written (the message used to name a different file)
print(f"Visualization saved to '{output_html}'")


Visualization saved to 'EV_Station_Visualization.html'


In [None]:
# Export the candidate locations with their predicted session counts.
# The DataFrame index is written as the first CSV column (to_csv default).
new_station_locations.to_csv("Predicted_station_locations.csv")


In [None]:
# Inspect the columns of the prediction frame
new_station_locations.columns

Index(['snapped_lat', 'snapped_long', 'population_density', 'proximity_to_poi',
       'PredictedSessionCount'],
      dtype='object')

In [None]:
# Inspect the columns of the stacked trip start/stop location frame
trip_locations.columns

Index(['lat', 'long'], dtype='object')

In [None]:
# Export the trip start/stop coordinates; the index is written as the first CSV column (to_csv default)
trip_locations.to_csv("trips_locations.csv")

In [None]:
# Inspect the columns available after merging session counts with the station metadata
stations_usage_df.columns

Index(['ChargingStationID', 'SessionCount', 'Fuel Type Code', 'Station Name',
       'Street Address', 'City', 'State', 'ZIP', 'EV Level2 EVSE Num',
       'EV DC Fast Count', 'EV Network', 'EV Network Web', 'Geocode Status',
       'Latitude', 'Longitude', 'ID', 'EV Connector Types'],
      dtype='object')

In [None]:
# Inspect the columns of the filtered cluster-centre frame; the hydrography
# columns brought in by the left spatial join are still present
valid_cluster_centers_df.columns

Index(['MergedCluster', 'lat', 'long', 'index_right', 'the_geom', 'NAME',
       'SOURCE_ID', 'FEAT_CODE', 'SUB_CODE', 'STATUS', 'SHAPE_Leng',
       'SHAPE_Area', 'snapped_lat', 'snapped_long', 'population_density',
       'proximity_to_poi'],
      dtype='object')

In [None]:
# Same column inspection as the preceding cell
valid_cluster_centers_df.columns

Index(['MergedCluster', 'lat', 'long', 'index_right', 'the_geom', 'NAME',
       'SOURCE_ID', 'FEAT_CODE', 'SUB_CODE', 'STATUS', 'SHAPE_Leng',
       'SHAPE_Area', 'snapped_lat', 'snapped_long', 'population_density',
       'proximity_to_poi'],
      dtype='object')

In [None]:
# Preview the first rows of the regression feature matrix
features.head()

Unnamed: 0,snapped_lat,snapped_long,population_density,proximity_to_poi
0,40.719869,-74.007447,1418.516196,0.859361
1,40.76351,-73.979364,4136.035054,0.250075
2,40.772904,-73.992514,3825.249974,1.1655
3,40.788008,-73.971611,2843.250888,1.764943
4,40.714177,-74.016332,3105.193108,1.364406


In [None]:
# Persist the regression inputs (features and target) without the index column
features.to_csv('Features_For_regression.csv', index=False)
target.to_csv('Target_For_regression.csv', index=False)

In [None]:
# (rows, columns) of the feature matrix
features.shape

(374, 4)

In [None]:
# Column check of the trip location frame (repeats an earlier inspection cell)
trip_locations.columns

Index(['lat', 'long'], dtype='object')