In [19]:
import pandas as pd
import datetime as dt
from datetime import timedelta
import numpy as np
import calendar
import requests
from bs4 import BeautifulSoup
import warnings

warnings.filterwarnings("ignore")

In [20]:
df = pd.read_csv('../../Dataset/BOULDER_Electric_Vehicle_Charging_Station_Data.csv')
df.head()

Unnamed: 0,ObjectId2,Station_Name,Address,City,State_Province,Zip_Postal_Code,Start_Date___Time,Start_Time_Zone,End_Date___Time,End_Time_Zone,Total_Duration__hh_mm_ss_,Charging_Time__hh_mm_ss_,Energy__kWh_,GHG_Savings__kg_,Gasoline_Savings__gallons_,Port_Type,ObjectID
0,1,BOULDER / JUNCTION ST1,2280 Junction Pl,Boulder,Colorado,80301,1/1/2018 17:49,MDT,1/1/2018 19:52,MDT,2:03:02,2:02:44,6.504,2.732,0.816,Level 2,0
1,2,BOULDER / JUNCTION ST1,2280 Junction Pl,Boulder,Colorado,80301,1/2/2018 8:52,MDT,1/2/2018 9:16,MDT,0:24:34,0:24:19,2.481,1.042,0.311,Level 2,1
2,3,BOULDER / JUNCTION ST1,2280 Junction Pl,Boulder,Colorado,80301,1/2/2018 21:11,MDT,1/3/2018 6:23,MDT,9:12:21,3:40:52,15.046,6.319,1.888,Level 2,2
3,4,BOULDER / ALPINE ST1,1275 Alpine Ave,Boulder,Colorado,80304,1/3/2018 9:19,MDT,1/3/2018 11:14,MDT,1:54:51,1:54:29,6.947,2.918,0.872,Level 2,3
4,5,BOULDER / BASELINE ST1,900 Baseline Rd,Boulder,Colorado,80302,1/3/2018 14:13,MDT,1/3/2018 14:30,MDT,0:16:58,0:16:44,1.8,0.756,0.226,Level 2,4


In [21]:
df.duplicated().sum()

0

In [22]:
# These tables contains the indexes of weather data tables for the period Jan-2018 to Nov-2023
tables_year = {'2018': [num for num in range(195, 218, 2)],
               '2019': [num for num in range(219, 242, 2)],
               '2020': [num for num in range(243, 266, 2)],
               '2021': [num for num in range(267, 290, 2)],
               '2022': [num for num in range(291, 314, 2)],
               '2023': [num for num in range(315, 336, 2)]}

# Source of Implementation: - https://towardsdatascience.com/a-guide-to-scraping-html-tables-with-pandas-and-beautifulsoup-7fc24c331cf7

#                           - https://github.com/SwethaSrikari/Predicting-EV-charging-demand/blob/main/Web_scraping_Colorado_weather.ipynb


# Function to scrape weather data table for a specific year from a given URL
def get_weather_data_table_basedOnYear(url, year):
    # Send a GET request to the specified URL
    page = requests.get(url) 
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(page.text, 'html.parser') 
    
    rows = [] # Initialize an empty list to store rows of data

    # Loop through the tables associated with the specified year
    for table in tables_year[year]: 

        # Loop through the children of the table element
        for i, child in enumerate(soup.find_all('table')[table].children): 
            row = [] # Initialize an empty list to store data for each row

            # Loop through the cells (td elements) in the row
            for td in child: 
                try:
                    row.append(td.text) # Attempt to extract text content from each cell and append to the row list
                except:
                    continue # If an exception occurs (e.g., if the cell is not a td element), continue to the next iteration

            # Check if the row contains any data (i.e., if it's not an empty row)
            if len(row) > 0:
                rows.append(row) # Append the non-empty row to the list of rows

    # Create a DataFrame using the extracted rows, specifying columns and dropping duplicate rows
    df = pd.DataFrame(rows[1:], columns=rows[0]).drop_duplicates(keep=False)

    return df

# Define the url source
url = 'https://psl.noaa.gov/boulder/data.daily.html'

# Take the Weather Data Tables for each year
df_2018 = get_weather_data_table_basedOnYear(url, str(2018))
df_2019 = get_weather_data_table_basedOnYear(url, str(2019))
df_2020 = get_weather_data_table_basedOnYear(url, str(2020))
df_2021 = get_weather_data_table_basedOnYear(url, str(2021))
df_2022 = get_weather_data_table_basedOnYear(url, str(2022))
df_2023 = get_weather_data_table_basedOnYear(url, str(2023))


# Function to check for leap years
def is_leap_year(year):
    return calendar.isleap(year)


# Function to remove leading and trailing spaces from a string value
def strip_spaces(value):
    # Check if the value is a string before applying strip()
    return value.strip() if isinstance(value, str) else value



# Function to prepare weather data
def prepare_weather_data(df):
    
    # Clean column names by removing extra spaces
    df.columns = df.columns.str.strip()

    # Remove extra spaces from all rows in the DataFrame
    df = df.applymap(strip_spaces)

    # Handle 'T' as a trace (less than 0.01 inches for precipitation and 0.1 for snow)
    df["Snow"] = df["Snow"].replace('T', 0.099)
    df["Precipitation"] = df["Precipitation"].replace('T', 0.0099)

    # Exclude rows with 'Miss' from the DataFrame
    df = df.loc[~(df == 'Miss').any(axis=1)]

    # Convert year, month, day to numeric
    df[["Year", "Month", "Day"]] = df[["Year", "Month", "Day"]].apply(pd.to_numeric)

    # Handle February based on leap year
    mask = (df["Month"] == 2) & (df["Day"] == 29) & ~df["Year"].apply(is_leap_year)
    # If the mask is true for a row, set the value of the "Day" column to 28
    df.loc[mask, "Day"] = 28

    # Convert specified columns to numeric and remove 'Snow Depth' column
    df = df[["Year", "Month", "Day", "Maximum T", "Minimum T", "Precipitation", "Snow"]].apply(pd.to_numeric)

    # Create a date column
    df["Date"] = pd.to_datetime(df[["Year", "Month", "Day"]]).dt.date.astype("datetime64")

    return df

# Clean the Weather Data Tables for each year
df_2018 = prepare_weather_data(df_2018)
df_2019 = prepare_weather_data(df_2019)
df_2020 = prepare_weather_data(df_2020)
df_2021 = prepare_weather_data(df_2021)
df_2022 = prepare_weather_data(df_2022)
df_2023 = prepare_weather_data(df_2023)


# Concat all weather dataframes into one DataFrame
weather_df = pd.concat([df_2018, df_2019, df_2020, df_2021, df_2022, df_2023], ignore_index=True)
weather_df.sort_values('Date', inplace=True)
print(weather_df['Date'].is_monotonic_increasing)

len(df_2018) + len(df_2019) + len(df_2020) + len(df_2021) + len(df_2022) + len(df_2023)  == len(weather_df)

True


True

In [23]:
# sorted(df['Station_Name'].unique())

In [24]:
def date_columns_to_datetime(df, date_columns):
    for column in date_columns:
        df[column] = df[column].apply(lambda x: pd.to_datetime(x, errors='coerce'))

    return df

# Must have as a seperator ':'
def date_columns_to_timedelta(time_str):
    try:
        hours, minutes, seconds = map(int, time_str.split(':'))
        return pd.Timedelta(hours=hours, minutes=minutes, seconds=seconds)
    except ValueError:
        return pd.NaT  # Handle invalid time format  
    
# Make the columns datetimes and timedelta
df1 = df.copy()

date_columns_to_dt= ['Start_Date___Time','End_Date___Time']
df1 = date_columns_to_datetime(df1, date_columns_to_dt)

date_columns_to_td = ['Total_Duration__hh_mm_ss_','Charging_Time__hh_mm_ss_']
for column in date_columns_to_td:
    df1[column] = df1[column].apply(date_columns_to_timedelta)

In [25]:
# Create a Date column to use it like a foreign key for the weather DataFrames
df1["Date"] = df1['Start_Date___Time'].dt.date.astype("datetime64")

# Sort increasing the dates per stations 
print(df1['Date'].is_monotonic_increasing)
#print(df1.groupby('Station_Name')['Date'].apply(lambda x: x.is_monotonic_increasing).all())

df1.sort_values(by='Date', inplace=True)
print(df1['Date'].is_monotonic_increasing)
#df1.sort_values(by=['Station_Name', 'Date'], inplace=True)
#print(df1.groupby('Station_Name')['Date'].apply(lambda x: x.is_monotonic_increasing).all())

False
True


In [26]:
# Merge the main Dataframe with the Weather Data
print(df1.shape)
df1 = df1.merge(weather_df, on='Date', how='left')
df1.shape

(148136, 18)


(148136, 25)

In [27]:
def data_transformation(df):

    # Convert inches to millimeters
    df['Snow'] = df['Snow'] * 25.4
    df['Precipitation'] = df['Precipitation'] * 25.4

    # Make the Month and Day to categorical variables 
    df['Weekday'] = df['Date'].dt.day_name()
    df['Month'] = df['Date'].dt.month_name()

    # Remove null (0) values in Engery_kwh_
    df = df[df['Energy__kWh_'] > 0]

    # Change datatype from float to integer
    df[["Year", "Day"]] = df[["Year", "Day"]].astype(np.int64)
    
    return df

df2 = df1.copy()
df2 = data_transformation(df1)
print('We drop the',round((df1.shape[0] - df2.shape[0]) / df1.shape[0],3),'% of the Dataset.')

We drop the 0.109 % of the Dataset.


In [29]:
# The paper approach duration (January 2018 to August 2022)
print('DataFrame before the filtering:\nStart and End Datetime:',df2.Date.min(), df2.Date.max(),'\nLength:',len(df2),'\n')
#df2 = df2[df2['Date'] < '2022-08-01']
df2['Date'] = df2.loc[:, 'Date'].apply(pd.to_datetime)

columns = ['Energy__kWh_','Date','Day','Year','Weekday','Month','Minimum T','Maximum T','Snow','Precipitation']
df2.drop_duplicates(subset=columns, keep='first', inplace=True)
print('DataFrame after the filtering:\nStart and End Datetime:',df2.Date.min(), df2.Date.max(),'\nLength:',len(df2),'\n')

DataFrame before the filtering:
Start and End Datetime: 2018-01-01 00:00:00 2023-11-30 00:00:00 
Length: 132023 

DataFrame after the filtering:
Start and End Datetime: 2018-01-01 00:00:00 2023-11-30 00:00:00 
Length: 69061 



In [30]:
df2.to_csv('../../Dataset/EntireBoulderWithWeather.csv')