In [None]:
'''Step 1:
  Concatenate AFDC historical data files into a single DataFrame.
  Load cleaned GeoJSON files into GeoDataFrames.
'''
#Loading cleaned data
import pandas as pd
import geopandas as gpd
import os
import glob

processed_path = "../data/processed/"
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame reproducible from run to run.
afdc_files = sorted(glob.glob(os.path.join(processed_path, "alt_fuel_stations_historical_day*.csv")))

if not afdc_files:
    raise FileNotFoundError("No AFDC historical data files found in the processed data directory.")

# Read every snapshot and tag its rows with the year parsed from the file name.
afdc_df_list = []
for file in afdc_files:
    try:
        # The year is the last space-separated token inside the final "(...)"
        # group of the path; int() raises if that token is not numeric.
        year = int(file.split('(')[-1].split(')')[0].split(' ')[-1])
        temp_df = pd.read_csv(file)
        temp_df['Year'] = year
        afdc_df_list.append(temp_df)
    except Exception as e:
        # Best effort: report the bad file and keep loading the others.
        print(f"Error processing file {file}: {e}")

#Combine all years into single DataFrame
if afdc_df_list:
    afdc_data = pd.concat(afdc_df_list, ignore_index=True)
    # Report the number of files actually loaded, not the number found on disk,
    # so the message stays truthful when some files were skipped above.
    print(f"Successfully loaded and combined {len(afdc_df_list)} AFDC files into a single DF.")
    skipped_count = len(afdc_files) - len(afdc_df_list)
    if skipped_count:
        print(f"Warning: {skipped_count} AFDC file(s) could not be loaded and were skipped.")
    print(f"Total records: {len(afdc_data)}")
    # .tolist() converts numpy integers to plain Python ints so the list prints
    # as [2015, 2016, ...] instead of [np.int64(2015), ...].
    print(f"Years found: {sorted(afdc_data['Year'].unique().tolist())}")
else:
    print("Warning: No data was loaded into the afdc_df_list.")
    afdc_data = pd.DataFrame()

#Load geojson data
# One table of layer -> path so all six loads share the same checks.
geojson_paths = {
    "chargers": os.path.join(processed_path, "chargingStationWashington_cleaned.geojson"),
    "roads": os.path.join(processed_path, "majorRoadsWashington_cleaned.geojson"),
    "shops": os.path.join(processed_path, "shopsWashington_cleaned.geojson"),
    "leisure": os.path.join(processed_path, "leisureWashington_cleaned.geojson"),
    "residential": os.path.join(processed_path, "residentialWashington_cleaned.geojson"),
    "amenities": os.path.join(processed_path, "amenitiesWashington_cleaned.geojson"),
}

# Verify the files exist before reading. geopandas' I/O backends typically
# raise their own error types (not FileNotFoundError) for a missing path, and
# merely printing would leave the gdf_* names undefined for the cells below.
# Failing fast here mirrors how the AFDC files are handled.
missing_geojson = [path for path in geojson_paths.values() if not os.path.isfile(path)]
if missing_geojson:
    raise FileNotFoundError(
        f"Error loading files: {missing_geojson}. "
        "Make sure all cleaned files are in the 'processed' directory."
    )

gdf_chargers = gpd.read_file(geojson_paths["chargers"])
gdf_roads = gpd.read_file(geojson_paths["roads"])
gdf_shops = gpd.read_file(geojson_paths["shops"])
gdf_leisure = gpd.read_file(geojson_paths["leisure"])
gdf_residential = gpd.read_file(geojson_paths["residential"])
gdf_amenities = gpd.read_file(geojson_paths["amenities"])

print(f"Loaded {len(gdf_chargers)} charging stations.")
print(f"Loaded {len(gdf_roads)} road segments.")
print(f"Loaded {len(gdf_shops)} shops.")
print(f"Loaded {len(gdf_leisure)} leisure spots.")
print(f"Loaded {len(gdf_residential)} residential buildings.")
print(f"Loaded {len(gdf_amenities)} amenities.")

Successfully loaded and combined 11 AFDC files into a single DF.
Total records: 16721
Years found: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
Loaded 676 charging stations.
Loaded 47785 road segments.
Loaded 29895 shops.
Loaded 53416 leisure spots.
Loaded 125104 residential buildings.
Loaded 145434 amenities.


In [6]:
# Stack the four point-of-interest layers into one frame with a fresh 0..n-1 index.
poi_list = [gdf_shops, gdf_leisure, gdf_residential, gdf_amenities]
gdf_pois = pd.concat(poi_list, ignore_index=True)
print(f"Combined all POIs into a single dataframe with {len(gdf_pois)} records.")

# Reproject every layer to EPSG:32148 (NAD83 / Washington North), a projected
# CRS, so that later distance calculations are not done in degrees.
target_crs = "EPSG:32148"
gdf_chargers, gdf_roads, gdf_pois = [
    layer.to_crs(target_crs) for layer in (gdf_chargers, gdf_roads, gdf_pois)
]

print("All GeoDataFrames reprojected to EPSG:32148.")


Combined all POIs into a single dataframe with 353849 records.
All GeoDataFrames reprojected to EPSG:32148.


In [None]:
# Proximity and density features for each charging station.
# Side note: tqdm's name derives from the Arabic word "taqaddum" (progress);
# tqdm is a fast, extensible progress bar for Python and the CLI.
from tqdm import tqdm
tqdm.pandas(desc = "Calculating distances")  # Enable tqdm for pandas

major_road_types = ['motorway', 'trunk', 'primary', 'secondary']
gdf_major_roads = gdf_roads[gdf_roads['highway'].isin(major_road_types)]

print(f"Filtered roads down to {len(gdf_major_roads)} major road segments.")

print("Calculating disrtance to nearest highway from charging station")
# How the distance feature works:
#   * progress_apply runs the lambda once per charger geometry and shows a tqdm
#     progress bar while doing so.
#   * gdf_major_roads.distance(point) returns the distance from that charger to
#     every major road segment; .min() keeps only the nearest one.
#   * Distances are in the units of the frames' CRS (the column name assumes
#     metres, i.e. that the layers were already reprojected to a metric CRS).
gdf_chargers['dist_to_major_road_m'] = gdf_chargers.geometry.progress_apply(
    lambda point: gdf_major_roads.distance(point).min()
)

print("Distance to nearest highway calculated.")

# POI density: number of POIs whose geometry intersects a 500 m buffer around
# each charger. Without this step the summary print below fails with a
# KeyError, because nothing else creates 'poi_density_500m'.
poi_radius_m = 500
charger_buffers = gpd.GeoDataFrame(geometry=gdf_chargers.geometry.buffer(poi_radius_m))
poi_matches = gpd.sjoin(
    charger_buffers,
    gdf_pois[[gdf_pois.geometry.name]],  # only the geometry is needed for counting
    how="inner",
    predicate="intersects",
)
# The join result keeps the charger index (one row per charger/POI pair), so
# counting rows per index value gives the POI count per charger. Chargers with
# no POI in range are absent from the join and are filled with 0.
poi_counts = poi_matches.groupby(level=0).size()
gdf_chargers['poi_density_500m'] = (
    poi_counts.reindex(gdf_chargers.index, fill_value=0).astype(int)
)

print("\n--- Feature Engineering Complete ---")
print("first 5 rows of gdf_chargers with new features:")
print(gdf_chargers[['dist_to_major_road_m', 'poi_density_500m']].head())
