In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the charging-session data with weather columns; the path is relative to
# the notebook's working directory.
df = pd.read_csv('../data/EntireBoulderWithWeather.csv')
# (rows, columns) as a quick sanity check on the load.
df.shape

(69061, 27)

In [3]:
# Preview the first rows to check how the session and weather columns were parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,ObjectId2,Station_Name,Address,City,State_Province,Zip_Postal_Code,Start_Date___Time,Start_Time_Zone,End_Date___Time,...,ObjectID,Date,Year,Month,Day,Maximum T,Minimum T,Precipitation,Snow,Weekday
0,0,1,BOULDER / JUNCTION ST1,2280 Junction Pl,Boulder,Colorado,80301,2018-01-01 17:49:00,MDT,2018-01-01 19:52:00,...,0,2018-01-01,2018,January,1,30,12,0.0,0.0,Monday
1,2,3,BOULDER / JUNCTION ST1,2280 Junction Pl,Boulder,Colorado,80301,2018-01-02 21:11:00,MDT,2018-01-03 06:23:00,...,2,2018-01-02,2018,January,2,46,12,0.0,0.0,Tuesday
2,3,70196,BOULDER / JUNCTION ST1,2280 Junction Pl,Boulder,Colorado,80301,2018-01-02 08:52:00,MDT,2018-01-02 09:16:00,...,1,2018-01-02,2018,January,2,46,12,0.0,0.0,Tuesday
3,6,4,BOULDER / ALPINE ST1,1275 Alpine Ave,Boulder,Colorado,80304,2018-01-03 09:19:00,MDT,2018-01-03 11:14:00,...,3,2018-01-03,2018,January,3,50,20,0.0,0.0,Wednesday
4,7,5,BOULDER / BASELINE ST1,900 Baseline Rd,Boulder,Colorado,80302,2018-01-03 14:13:00,MDT,2018-01-03 14:30:00,...,4,2018-01-03,2018,January,3,50,20,0.0,0.0,Wednesday


In [4]:
# List every column name; the selection in the next cell picks from these.
df.columns

Index(['Unnamed: 0', 'ObjectId2', 'Station_Name', 'Address', 'City',
       'State_Province', 'Zip_Postal_Code', 'Start_Date___Time',
       'Start_Time_Zone', 'End_Date___Time', 'End_Time_Zone',
       'Total_Duration__hh_mm_ss_', 'Charging_Time__hh_mm_ss_', 'Energy__kWh_',
       'GHG_Savings__kg_', 'Gasoline_Savings__gallons_', 'Port_Type',
       'ObjectID', 'Date', 'Year', 'Month', 'Day', 'Maximum T', 'Minimum T',
       'Precipitation', 'Snow', 'Weekday'],
      dtype='object')

In [5]:
# Keep only the station, timing, energy and weather columns used below.
# .copy() makes `data` an independent frame, so the column assignments in the
# next cell do not write into a slice of `df` (chained assignment).
data = df[[
    'Station_Name', 'Address',
    'Start_Date___Time', 'Start_Time_Zone',
    'End_Date___Time', 'End_Time_Zone',
    'Total_Duration__hh_mm_ss_', 'Charging_Time__hh_mm_ss_',
    'Energy__kWh_', 'ObjectID', 'Date',
    'Maximum T', 'Minimum T', 'Precipitation', 'Snow',
]].copy()

data.head()

Unnamed: 0,Station_Name,Address,Start_Date___Time,Start_Time_Zone,End_Date___Time,End_Time_Zone,Total_Duration__hh_mm_ss_,Charging_Time__hh_mm_ss_,Energy__kWh_,ObjectID,Date,Maximum T,Minimum T,Precipitation,Snow
0,BOULDER / JUNCTION ST1,2280 Junction Pl,2018-01-01 17:49:00,MDT,2018-01-01 19:52:00,MDT,0 days 02:03:02,0 days 02:02:44,6.504,0,2018-01-01,30,12,0.0,0.0
1,BOULDER / JUNCTION ST1,2280 Junction Pl,2018-01-02 21:11:00,MDT,2018-01-03 06:23:00,MDT,0 days 09:12:21,0 days 03:40:52,15.046,2,2018-01-02,46,12,0.0,0.0
2,BOULDER / JUNCTION ST1,2280 Junction Pl,2018-01-02 08:52:00,MDT,2018-01-02 09:16:00,MDT,0 days 00:24:34,0 days 00:24:19,2.481,1,2018-01-02,46,12,0.0,0.0
3,BOULDER / ALPINE ST1,1275 Alpine Ave,2018-01-03 09:19:00,MDT,2018-01-03 11:14:00,MDT,0 days 01:54:51,0 days 01:54:29,6.947,3,2018-01-03,50,20,0.0,0.0
4,BOULDER / BASELINE ST1,900 Baseline Rd,2018-01-03 14:13:00,MDT,2018-01-03 14:30:00,MDT,0 days 00:16:58,0 days 00:16:44,1.8,4,2018-01-03,50,20,0.0,0.0


In [6]:
# Parse the charging-time strings (e.g. '0 days 02:02:44') into Timedeltas and
# derive the duration in minutes. assign() builds a new frame instead of
# writing columns into `data`, which was created as a column slice of `df`.
# NOTE: despite its name, 'Total_Duration_minutes' is derived from
# 'Charging_Time__hh_mm_ss_', not from 'Total_Duration__hh_mm_ss_'.
charging_time = pd.to_timedelta(data['Charging_Time__hh_mm_ss_'])
data = data.assign(
    Charging_Time__hh_mm_ss_=charging_time,
    Total_Duration_minutes=charging_time.dt.total_seconds() / 60,
)

# Outlier band: median +/- 3 standard deviations of the duration in minutes
median_duration = data['Total_Duration_minutes'].median()
std_duration = data['Total_Duration_minutes'].std()
upper_threshold = median_duration + 3 * std_duration
lower_threshold = median_duration - 3 * std_duration

# Keep the rows inside the band (both bounds inclusive).
# NOTE(review): `filtered_data` is not referenced by any later cell in this
# notebook; the daily aggregation groups `df`, so this filter currently has no
# effect on the exported data. Confirm whether that is intended.
filtered_data = data[data['Total_Duration_minutes'].between(lower_threshold, upper_threshold)]

# Check the percentage of rows removed
percent_removed = (1 - len(filtered_data) / len(data)) * 100
print(f'{percent_removed:.2f}% of outliers were removed.')

2.35% of outliers were removed.


In [7]:
# Narrow `data` to the date, weather and energy columns. This rebinds `data`
# without 'Charging_Time__hh_mm_ss_', so the outlier cell above cannot be
# re-run afterwards unless `data` is re-created first.
data = data[['Date', 'Maximum T', 'Minimum T', 'Precipitation', 'Snow', 'Energy__kWh_']].copy()
data.shape

(69061, 6)

In [8]:
# Preview the narrowed frame (one row per charging session).
data.head()

Unnamed: 0,Date,Maximum T,Minimum T,Precipitation,Snow,Energy__kWh_
0,2018-01-01,30,12,0.0,0.0,6.504
1,2018-01-02,46,12,0.0,0.0,15.046
2,2018-01-02,46,12,0.0,0.0,2.481
3,2018-01-03,50,20,0.0,0.0,6.947
4,2018-01-03,50,20,0.0,0.0,1.8


In [9]:
# Collapse the per-session rows to one row per calendar date by averaging the
# weather and energy columns over all sessions that share a date.
# Note: this reads the raw `df`, not `data` or `filtered_data`.
daily = (
    df.groupby('Date')[
        ['Maximum T', 'Minimum T', 'Precipitation', 'Snow', 'Energy__kWh_']
    ]
    .mean()
    .reset_index()
)

daily.shape

(2155, 6)

In [10]:
# Preview the per-date means.
daily.head()

Unnamed: 0,Date,Maximum T,Minimum T,Precipitation,Snow,Energy__kWh_
0,2018-01-01,30.0,12.0,0.0,0.0,6.504
1,2018-01-02,46.0,12.0,0.0,0.0,8.7635
2,2018-01-03,50.0,20.0,0.0,0.0,3.742333
3,2018-01-04,52.0,24.0,0.0,0.0,8.291333
4,2018-01-05,62.0,25.0,0.0,0.0,1.478


In [11]:
# Ensure 'Date' is in datetime format
daily['Date'] = pd.to_datetime(daily['Date'])

# Every calendar day between the first and the last observed date
full_date_range = pd.date_range(start=daily['Date'].min(), end=daily['Date'].max())

# Days of the full range that have no row in `daily`
missing_dates = full_date_range.difference(daily['Date'])

if missing_dates.empty:
    print("No missing dates in the dataset.")
else:
    print("Missing dates:")
    print(missing_dates)

Missing dates:
DatetimeIndex(['2020-03-19', '2020-03-20', '2020-04-02', '2020-04-13',
               '2021-03-14'],
              dtype='datetime64[ns]', freq=None)


In [12]:
def linear_interpolation(g, g1, g2, d1, d2):
    """
    Linearly interpolate the value at position ``g`` between two known points.

    Parameters:
    - g: Position to estimate (a number, or a Timestamp when interpolating over dates).
    - g1, g2: Positions of the two known points; they must differ.
    - d1, d2: Known values at ``g1`` and ``g2``.

    Returns:
    - ``d1 + (g - g1) / (g2 - g1) * (d2 - d1)``
    """
    # Form the dimensionless fraction first. When the positions are Timestamps,
    # (g - g1) is a Timedelta; multiplying it by the value difference before
    # dividing rounds to whole nanoseconds and can overflow for large values.
    fraction = (g - g1) / (g2 - g1)
    return d1 + fraction * (d2 - d1)

def interpolate_missing_values(df, columns_to_interpolate):
    """
    Fill missing values in the given columns by linear interpolation along the index.

    Each gap is filled on the straight line between the nearest non-missing
    values before and after it, weighted by index distance (elapsed time for a
    DatetimeIndex). The frame is modified in place and also returned.

    Parameters:
    - df: The DataFrame with missing dates filled in; its index must be sorted
      and numeric or datetime-like.
    - columns_to_interpolate: List of columns to apply interpolation.

    Returns:
    - The same DataFrame object, with interior gaps in the listed columns filled.
      Missing values before the first or after the last known value of a column
      have no pair of neighbours to interpolate between and are left as NaN.
    """
    for column in columns_to_interpolate:
        # method='index' weights by the index values; limit_area='inside'
        # restricts filling to gaps surrounded by valid values (no extrapolation).
        df[column] = df[column].interpolate(method='index', limit_area='inside')

    return df

# Index by date and add an all-NaN row for every missing date. set_index() is
# used without inplace=True so `daily` keeps its 'Date' column and this cell
# (and the missing-date check before it) can be re-run.
daily_full = daily.set_index('Date').reindex(full_date_range)

# Interpolate missing values
columns_to_interpolate = ['Maximum T', 'Minimum T', 'Precipitation', 'Snow', 'Energy__kWh_']
daily_interpolated = (
    interpolate_missing_values(daily_full, columns_to_interpolate)
    .rename_axis('Date')   # name the date index so it becomes the 'Date' column
    .reset_index()
)

daily_interpolated.isnull().sum()

Date             0
Maximum T        0
Minimum T        0
Precipitation    0
Snow             0
Energy__kWh_     0
dtype: int64

In [13]:
# Inspect the rows that were added for the missing dates to sanity-check the interpolated values.
daily_interpolated[daily_interpolated['Date'].isin(missing_dates)]

Unnamed: 0,Date,Maximum T,Minimum T,Precipitation,Snow,Energy__kWh_
808,2020-03-19,58.0,24.666667,0.0,0.0,7.746667
809,2020-03-20,55.0,19.333333,0.0,0.0,9.130833
822,2020-04-02,56.0,28.5,0.381,3.81,10.61175
833,2020-04-13,54.5,14.5,7.23773,113.0173,4.539
1168,2021-03-14,40.0,23.0,5.588,40.64,5.121375


In [14]:
def process_charging_data(df):
    """
    Build the modelling columns from a daily energy frame.

    Parameters:
    - df: DataFrame with a 'Date' column and an 'Energy__kWh_' column.
      It is not modified; the work is done on a copy.

    Returns:
    - A new DataFrame in which 'Energy__kWh_' is renamed to 'y' and these
      columns are added:
        'y_t_1'     - 'y' of the previous row (NaN for the first row)
        'NameOfDay' - weekday name of 'Date'
        'dayofweek' - weekday number with Sunday=0, Monday=1, ..., Saturday=6
        'weekend'   - 1 for Saturday/Sunday, otherwise 0
        't'         - day of the month of 'Date'
    """
    # rename() returns a new frame, so the caller's DataFrame keeps its columns.
    out = df.rename(columns={'Energy__kWh_': 'y'})

    # Lagged target: value of the previous row
    out['y_t_1'] = out['y'].shift(1).astype('float64')

    # Calendar features
    out['Date'] = pd.to_datetime(out['Date'])
    out['NameOfDay'] = out['Date'].dt.day_name()
    # pandas numbers Monday=0 ... Sunday=6; shift so that Sunday=0 ... Saturday=6
    out['dayofweek'] = (out['Date'].dt.dayofweek + 1) % 7
    out['weekend'] = out['dayofweek'].isin([0, 6]).astype('int64')
    out['t'] = out['Date'].dt.day

    # Prints the row count plus one; kept so the recorded cell output is unchanged.
    print(len(out) + 1)
    return out

In [15]:
# Derive the modelling columns, then take two views of the result:
# one with the weather columns and one without.
a = process_charging_data(daily_interpolated)

calendar_cols = ['Date', 't', 'NameOfDay', 'dayofweek', 'weekend']
weather_cols = ['Maximum T', 'Minimum T', 'Precipitation', 'Snow']
target_cols = ['y', 'y_t_1']

df_with_weather = a[calendar_cols + weather_cols + target_cols]
df_no_weather = a[calendar_cols + target_cols]

2161


In [16]:
# Write the model inputs to CSV; dropna() removes rows with missing values,
# such as the first row whose lag y_t_1 is NaN.
# A forward slash is used as the separator: '\d' in a normal string literal is
# an invalid escape sequence, and a backslash only separates paths on Windows.
df_no_weather[['t', 'dayofweek', 'weekend', 'y', 'y_t_1']].dropna().to_csv('occ_app_for_chg_dmg_data/df_no_weather.csv', index=False)

In [17]:
# Display the model columns; this frame still contains the first row, whose y_t_1 is NaN.
df_no_weather[['t', 'dayofweek', 'weekend', 'y', 'y_t_1']]

Unnamed: 0,t,dayofweek,weekend,y,y_t_1
0,1,1,0,6.504000,
1,2,2,0,8.763500,6.504000
2,3,3,0,3.742333,8.763500
3,4,4,0,8.291333,3.742333
4,5,5,0,1.478000,8.291333
...,...,...,...,...,...
2155,26,0,1,11.170091,15.062656
2156,27,1,0,12.157439,11.170091
2157,28,2,0,9.745620,12.157439
2158,29,3,0,10.713364,9.745620


In [18]:
def calculate_energy_rate(df):
    """
    Average 'y' per value of 't', separately for weekdays and weekends.

    The rows are split by 'NameOfDay' (Saturday/Sunday vs. the rest). Each
    subset is pivoted into a Date x t table of 'y'; cells with no observation
    are filled with 0 and every 't' column is then averaged over all dates of
    the subset.

    NOTE(review): the zero fill means empty cells count as 0 in the mean. In
    this notebook 't' is the day of the month, so each date fills exactly one
    column and every mean is pulled towards 0 - confirm this is intended.

    Parameters:
    - df: DataFrame with 'Date', 't', 'NameOfDay' and 'y' columns.

    Returns:
    - DataFrame with columns 'weekday' and 'weekend', one row per 't' value in
      ascending order; the 't' values themselves are dropped.
    """
    def _mean_over_dates(subset):
        # Date x t table of y, missing cells as 0, averaged down each t column
        table = subset.pivot(index='Date', columns='t', values='y')
        return table.fillna(0).mean()

    is_weekend = df['NameOfDay'].isin(['Saturday', 'Sunday'])

    combined = pd.DataFrame({
        'weekday': _mean_over_dates(df[~is_weekend]),
        'weekend': _mean_over_dates(df[is_weekend]),
    })

    return combined.reset_index().drop(columns='t')

In [19]:
# Compute the weekday/weekend means per `t` and write them to CSV.
# A forward slash is used as the separator: '\d' in a normal string literal is
# an invalid escape sequence, and a backslash only separates paths on Windows.
df_no_weather_pred = calculate_energy_rate(df_no_weather)
df_no_weather_pred.to_csv('occ_app_for_chg_dmg_data/df_no_weather_pred.csv', index=False)

In [20]:
# Display the weekday/weekend means returned by calculate_energy_rate.
df_no_weather_pred

Unnamed: 0,weekday,weekend
0,0.317912,0.282021
1,0.317002,0.270882
2,0.313248,0.302579
3,0.295502,0.320241
4,0.295303,0.306273
5,0.30927,0.275886
6,0.314899,0.282174
7,0.310315,0.283981
8,0.302006,0.29728
9,0.321703,0.279053


In [21]:
# def calculate_energy_rate_based_on_(df):
#     # Ensure the 'Date' column is a datetime object for proper handling
#     df['Date'] = pd.to_datetime(df['Date'])

#     # Calculate the average energy consumption for each day of the week
#     daily_averages = df.groupby('NameOfDay')['y'].mean()
    
#     # Compute the average energy consumption for weekdays (Monday to Friday)
#     weekday_avg = daily_averages.loc[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']].mean()
    
#     # Compute the average energy consumption for weekends (Saturday and Sunday)
#     weekend_avg = daily_averages.loc[['Saturday', 'Sunday']].mean()

#     # Calculate the energy rate for each day
#     # For weekends, divide the daily energy consumption by the average weekend consumption
#     # For weekdays, divide the daily energy consumption by the average weekday consumption
#     df['energy_rate'] = np.where(df['NameOfDay'].isin(['Saturday', 'Sunday']),
#                                  df['y'] / weekend_avg,    # Normalizing with weekend average
#                                  df['y'] / weekday_avg)    # Normalizing with weekday average

#     return df


# def calculate_daily_rate(df):
#     # Ensure the 'Date' column is a datetime object
#     df['Date'] = pd.to_datetime(df['Date'])
    
#     # Extract the day of the week and add it as a new column
#     df['NameOfDay'] = df['Date'].dt.day_name()
    
#     # Calculate the average y for each day of the week
#     avg_per_day = df.groupby('NameOfDay')['y'].mean().to_dict()
    
#     # Calculate the rate of y for each row
#     df['Rate'] = df.apply(lambda row: row['y'] / avg_per_day[row['NameOfDay']], axis=1)
    
#     return df

# df_with_rate = calculate_daily_rate(df_with_rate)