In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the full charging dataset and show which columns it provides
df = pd.read_csv('data/data_chg_all.csv')
df.columns

Index(['Chargingevent', 'CPID', 'Connector', 'StartDate', 'StartTime',
       'EndDate', 'EndTime', 'StartDate_num', 'StartTime_num', 'EndDate_num',
       'EndTime_num', 'duration', 'TotalkWh', 'Cost', 'Site', 'Group', 'Model',
       'Model1'],
      dtype='object')

In [3]:
# Study window; both bounds are inclusive
start_date = pd.Timestamp('2018-03-05')
end_date = pd.Timestamp('2018-06-04')

# Parse 'StartDate' (month/day/year text) into real timestamps
df['StartDate'] = pd.to_datetime(df['StartDate'], format='%m/%d/%Y')

# Keep only the sessions that started inside the study window
in_window = df['StartDate'].between(start_date, end_date)
df = df[in_window]
print('Number of rows in dataset:', len(df))

# Calendar days spanned by the window
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')
print('Number of days:', len(all_dates))

Number of rows in dataset: 15901
Number of days: 92


In [4]:
# Distinct charge points (CPID) per charger class in 'Model1'
grouped_counts = df.groupby(by=['Model1'])
chargers_per_class = grouped_counts['CPID'].nunique()

print("Charger Counts:", chargers_per_class, sep="\n")

Charger Counts:
Model1
Slow     40
fast      8
rapid     9
Name: CPID, dtype: int64


In [5]:
# Sessions recorded per charger class, and the overall number of sessions
charger_counts = df['Model1'].value_counts()
total_sessions = len(df)

# Share of all sessions handled by each charger class, in percent
charger_percentages = charger_counts.div(total_sessions).mul(100)

print("Charger Percentages:", charger_percentages, sep="\n")

Charger Percentages:
rapid    56.229168
Slow     35.236778
fast      8.534054
Name: Model1, dtype: float64


In [6]:
# Restrict the analysis to sessions on rapid chargers
rapid_chargers_df = df[df['Model1'] == 'rapid']
rapid_duration = rapid_chargers_df['duration']

# Centre (median) and spread (standard deviation) of the rapid-charger durations
median_duration = rapid_duration.median()
std_duration = rapid_duration.std()

# A duration is kept when it lies within three standard deviations of the median
outlier_band = 3 * std_duration
lower_threshold = median_duration - outlier_band
upper_threshold = median_duration + outlier_band

within_band = rapid_duration.between(lower_threshold, upper_threshold)
filtered_df = rapid_chargers_df[within_band]

# Report the share of rapid-charger rows discarded by the filter
kept_fraction = len(filtered_df) / len(rapid_chargers_df)
percent_removed = (1 - kept_fraction) * 100
print(f'{percent_removed:.2f}% of outliers were removed.')

0.79% of outliers were removed.


In [7]:
# Summary statistics of rapid-charger session durations BEFORE the outlier filter
rapid_chargers_df['duration'].describe()

count    8941.000000
mean       28.313723
std        36.412521
min         5.000000
25%        15.000000
50%        24.000000
75%        34.000000
max      1145.000000
Name: duration, dtype: float64

In [8]:
# Summary statistics of rapid-charger session durations AFTER the outlier filter
filtered_df['duration'].describe()

count    8870.000000
mean       25.860203
std        14.171344
min         5.000000
25%        15.000000
50%        24.000000
75%        33.000000
max       127.000000
Name: duration, dtype: float64

### **Create dataset for hybrid LSTM**

In [9]:
# From here on `df` is an independent copy of the outlier-filtered rapid-charger
# sessions, so the column assignments below do not write into `filtered_df`
df = filtered_df.copy()

In [10]:
# Make sure both date columns hold real timestamps before using the .dt accessor
for date_col in ('StartDate', 'EndDate'):
    df[date_col] = pd.to_datetime(df[date_col])

start_day = df['StartDate'].dt

# Weekday name of the day on which the session started
df['NameOfDay'] = start_day.day_name()

# pandas numbers Monday=0 ... Sunday=6; rotate so that Sunday=0, Monday=1, ..., Saturday=6
df['DayOfWeek'] = (start_day.dayofweek + 1) % 7

# Weekend flag: 1 for Sunday (0) and Saturday (6), 0 otherwise
df['IsWeekend'] = df['DayOfWeek'].isin([0, 6]).astype('int64')

In [11]:
# Number of distinct charge points left after filtering, followed by their IDs
print(df['CPID'].nunique())
df['CPID'].unique().tolist()

9


[50692, 50339, 50911, 50349, 50338, 51547, 51550, 51549, 51548]

In [12]:
# Columns that process_charging_data needs from every per-charger frame
columns = ['StartDate', 'StartTime', 'EndDate', 'EndTime', 'StartDate_num', 'StartTime_num', 'EndDate_num', 'EndTime_num', 'NameOfDay', 'DayOfWeek', 'IsWeekend']

# Split the sessions by charge point; each value is that charger's rows without the CPID column
grouped = df.groupby('CPID')
cpid_dict = {cpid: group.drop(columns='CPID') for cpid, group in grouped}

# Publish one frame per charge point as df_1 ... df_N, numbered in the iteration
# order of cpid_dict (i.e. the order in which groupby yields the CPIDs).
# - The loop variable is deliberately NOT called `df`: reusing that name would
#   replace the notebook-level `df` with the last charger's rows.
# - .copy() gives every df_<n> its own data, so later in-place column
#   assignments on it never write into a slice of the shared frame.
for idx, (cpid, cpid_sessions) in enumerate(cpid_dict.items(), start=1):
    globals()[f'df_{idx}'] = cpid_sessions[columns].copy()
    print(len(globals()[f'df_{idx}']))

172
771
897
1389
1669
1113
1322
891
646


In [13]:
def process_charging_data(df_transactions):
    """Build a 10-minute occupancy table for one charger.

    For every distinct ``StartDate`` found in ``df_transactions`` (in order of
    first appearance) the day is cut into 144 consecutive 10-minute intervals.
    An interval is marked occupied (``y == 1``) when at least one transaction
    overlaps it, i.e. the transaction starts before the interval ends and ends
    after the interval starts.

    Parameters
    ----------
    df_transactions : pandas.DataFrame
        Must contain the columns 'StartDate', 'StartTime', 'EndDate' and
        'EndTime'. Each date/time pair is cast to ``str``, joined with a space
        and parsed with ``pd.to_datetime``. The frame is NOT modified: all
        work happens on an internal copy.

    Returns
    -------
    pandas.DataFrame
        One row per interval with the columns 'StartDate', 'IntervalStart',
        'IntervalEnd', 't' (1..144 within the day), 'y', 'y_t_1' (``y`` of the
        previous row, nullable Int64), 'NameOfDay', 'dayofweek' (Sunday=0 ...
        Saturday=6) and 'weekend' (1 for Saturday/Sunday). The very first row
        (no previous interval) and the last 144 rows are dropped.

    Notes
    -----
    * Only dates that occur as a ``StartDate`` get intervals; a calendar day
      without any session start is absent from the result.
    * ``y_t_1`` is taken from the previous row of the concatenated table, so
      at a day boundary it refers to the last interval of the preceding day
      *in the table*.
    * Prints ``len(result) + 1`` as a progress/size indicator.
    """
    # Work on a private copy so the caller's frame keeps its dtypes and columns
    df_transactions = df_transactions.copy()

    # Dates and times are glued together as text before parsing
    for col in ('StartDate', 'StartTime', 'EndDate', 'EndTime'):
        df_transactions[col] = df_transactions[col].astype(str)

    session_start = pd.to_datetime(
        df_transactions['StartDate'] + ' ' + df_transactions['StartTime']
    ).to_numpy()
    session_end = pd.to_datetime(
        df_transactions['EndDate'] + ' ' + df_transactions['EndTime']
    ).to_numpy()

    ten_minutes = pd.Timedelta(minutes=10)
    day_frames = []

    for date in df_transactions['StartDate'].drop_duplicates():
        # 144 x 10 minutes = one full day, starting at midnight of `date`
        interval_start = pd.date_range(start=pd.Timestamp(date), periods=144, freq='10min')
        interval_end = interval_start + ten_minutes

        # (intervals x sessions) overlap matrix for this day; comparisons with
        # a missing timestamp (NaT) are False, so such sessions never count
        overlaps = (
            (session_start[None, :] < interval_end.to_numpy()[:, None])
            & (session_end[None, :] > interval_start.to_numpy()[:, None])
        )

        day_frames.append(pd.DataFrame({
            'StartDate': date,
            'IntervalStart': interval_start,
            'IntervalEnd': interval_end,
            't': np.arange(1, 145),
            'y': overlaps.any(axis=1).astype('int64'),
        }))

    # Stack the per-day tables into one continuous sequence
    data_chg = pd.concat(day_frames).reset_index(drop=True)

    # Occupancy state of the previous interval (missing for the very first row)
    data_chg['y_t_1'] = data_chg['y'].shift(1).astype('Int64')

    # Calendar features derived from the interval's date
    data_chg['StartDate'] = pd.to_datetime(data_chg['StartDate'])
    data_chg['NameOfDay'] = data_chg['StartDate'].dt.day_name()
    # pandas numbers Monday=0 ... Sunday=6; rotate so that Sunday=0 ... Saturday=6
    data_chg['dayofweek'] = (data_chg['StartDate'].dt.dayofweek + 1) % 7
    data_chg['weekend'] = data_chg['dayofweek'].isin([0, 6]).astype('int64')

    # dropna removes the first row (its y_t_1 is missing); the slice then drops
    # the final 144 rows of the table
    data_chg = data_chg.dropna()[:-144]

    # +1 presumably re-counts the row removed by dropna, giving a multiple of 144
    print(len(data_chg) + 1)
    return data_chg

In [14]:
# Columns kept from the interval table of every charger
chg_feature_cols = ['t','StartDate', 'NameOfDay', 'dayofweek', 'weekend', 'y_t_1', 'y']

# Build the 10-minute occupancy table of each charger, in charger order
(
    data_chg_1, data_chg_2, data_chg_3,
    data_chg_4, data_chg_5, data_chg_6,
    data_chg_7, data_chg_8, data_chg_9,
) = [
    process_charging_data(sessions)[chg_feature_cols]
    for sessions in (df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9)
]

8784
13104
12672
12816
13104
8496
8496
8496
8496


In [15]:
# Preview the model-input columns of the first charger's interval table
data_chg_1[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']]

Unnamed: 0,t,dayofweek,weekend,y_t_1,y
1,2,1,0,0,0
2,3,1,0,0,0
3,4,1,0,0,0
4,5,1,0,0,0
5,6,1,0,0,0
...,...,...,...,...,...
8779,140,5,0,0,0
8780,141,5,0,0,0
8781,142,5,0,0,0
8782,143,5,0,0,0


In [16]:
# Load the stored reference interval tables, one CSV per charger
(
    chg_1, chg_2, chg_3,
    chg_4, chg_5, chg_6,
    chg_7, chg_8, chg_9,
) = map(pd.read_csv, [
    'data/data_chg/data_chg_1.csv',
    'data/data_chg/data_chg_2.csv',
    'data/data_chg/data_chg_3.csv',
    'data/data_chg/data_chg_4.csv',
    'data/data_chg/data_chg_5.csv',
    'data/data_chg/data_chg_6.csv',
    'data/data_chg/data_chg_7.csv',
    'data/data_chg/data_chg_8.csv',
    'data/data_chg/data_chg_9.csv',
])

# The second file names its lagged column 'y.1'; rename it and keep only the
# five columns used in the comparison
chg_2 = chg_2.rename(columns={'y.1': 'y_t_1'})[['t','dayofweek', 'weekend', 'y_t_1', 'y']]

In [17]:
def reset_index_and_compare_dfs(df1, df2):
    """Print how many rows differ between two identically shaped DataFrames.

    Both frames are re-indexed from 0 and compared cell by cell; a row counts
    as different when at least one of its cells is unequal. The count is
    printed and nothing is returned.
    """
    # Align purely by position: discard whatever index each frame carried
    left, right = (frame.reset_index(drop=True) for frame in (df1, df2))

    # True for every row that has at least one unequal cell
    differs = (left != right).any(axis=1)

    print('Number of diff rows is:', len(left[differs]))


# Compare each generated interval table with its stored reference, charger by charger
compare_cols = ['t', 'dayofweek', 'weekend', 'y_t_1', 'y']
generated_tables = (
    data_chg_1, data_chg_2, data_chg_3,
    data_chg_4, data_chg_5, data_chg_6,
    data_chg_7, data_chg_8, data_chg_9,
)
reference_tables = (chg_1, chg_2, chg_3, chg_4, chg_5, chg_6, chg_7, chg_8, chg_9)

for generated, reference in zip(generated_tables, reference_tables):
    reset_index_and_compare_dfs(generated[compare_cols], reference)

Number of diff rows is: 8
Number of diff rows is: 1281
Number of diff rows is: 162
Number of diff rows is: 181
Number of diff rows is: 235
Number of diff rows is: 146
Number of diff rows is: 213
Number of diff rows is: 137
Number of diff rows is: 102


In [18]:
def calculate_occupancy_rate(df):
    """Average occupancy per time slot, separately for weekdays and weekends.

    ``df`` needs the columns 'NameOfDay', 'StartDate', 't' and 'y'. For each
    day type the rows are arranged as a date x slot grid of ``y`` values, cells
    that are absent from the data are counted as 0, and every slot is averaged
    over the dates. Returns a DataFrame with one row per slot ``t`` and the
    columns 'weekday' and 'weekend'; the slot number itself is not included.
    """
    is_weekend_day = df['NameOfDay'].isin(['Saturday', 'Sunday'])

    def slot_means(rows):
        # One row per date, one column per slot; absent cells are treated as 0
        grid = rows.pivot(index='StartDate', columns='t', values='y')
        return grid.fillna(0).mean()

    profile = pd.DataFrame({
        'weekday': slot_means(df[~is_weekend_day]),
        'weekend': slot_means(df[is_weekend_day]),
    })

    # Turn the slot index into a column, then discard it
    return profile.reset_index().drop(columns='t')

In [19]:
# Weekday/weekend occupancy profile of every charger, in charger order
(
    data_chg_pred_occ_t_1, data_chg_pred_occ_t_2, data_chg_pred_occ_t_3,
    data_chg_pred_occ_t_4, data_chg_pred_occ_t_5, data_chg_pred_occ_t_6,
    data_chg_pred_occ_t_7, data_chg_pred_occ_t_8, data_chg_pred_occ_t_9,
) = [
    calculate_occupancy_rate(interval_table)
    for interval_table in (
        data_chg_1, data_chg_2, data_chg_3,
        data_chg_4, data_chg_5, data_chg_6,
        data_chg_7, data_chg_8, data_chg_9,
    )
]

In [20]:
# Preview the first charger's occupancy profile (one row per 10-minute slot)
data_chg_pred_occ_t_1

Unnamed: 0,weekday,weekend
0,0.018519,0.142857
1,0.018519,0.000000
2,0.018519,0.000000
3,0.018519,0.142857
4,0.000000,0.142857
...,...,...
139,0.000000,0.142857
140,0.018519,0.142857
141,0.018519,0.142857
142,0.018519,0.285714


In [21]:
# Load the stored reference occupancy profiles, one CSV per charger
(
    chg_pred_occ_t_1, chg_pred_occ_t_2, chg_pred_occ_t_3,
    chg_pred_occ_t_4, chg_pred_occ_t_5, chg_pred_occ_t_6,
    chg_pred_occ_t_7, chg_pred_occ_t_8, chg_pred_occ_t_9,
) = map(pd.read_csv, [
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_1.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_2.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_3.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_4.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_5.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_6.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_7.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_8.csv',
    'data/data_chg_pred_occ_t/data_chg_pred_occ_t_9.csv',
])

In [22]:
# Compare each generated occupancy profile with its stored reference, charger by charger
generated_profiles = (
    data_chg_pred_occ_t_1, data_chg_pred_occ_t_2, data_chg_pred_occ_t_3,
    data_chg_pred_occ_t_4, data_chg_pred_occ_t_5, data_chg_pred_occ_t_6,
    data_chg_pred_occ_t_7, data_chg_pred_occ_t_8, data_chg_pred_occ_t_9,
)
reference_profiles = (
    chg_pred_occ_t_1, chg_pred_occ_t_2, chg_pred_occ_t_3,
    chg_pred_occ_t_4, chg_pred_occ_t_5, chg_pred_occ_t_6,
    chg_pred_occ_t_7, chg_pred_occ_t_8, chg_pred_occ_t_9,
)

for generated, reference in zip(generated_profiles, reference_profiles):
    reset_index_and_compare_dfs(generated, reference)

Number of diff rows is: 103
Number of diff rows is: 144
Number of diff rows is: 144
Number of diff rows is: 143
Number of diff rows is: 143
Number of diff rows is: 144
Number of diff rows is: 144
Number of diff rows is: 144
Number of diff rows is: 144


### **Store the DataFrames in CSV files**

In [24]:
# Obtain the charging data
data_frames = [
    (process_charging_data(df_1)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_1.csv'),
    (process_charging_data(df_2)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_2.csv'),
    (process_charging_data(df_3)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_3.csv'),
    (process_charging_data(df_4)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_4.csv'),
    (process_charging_data(df_5)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_5.csv'),
    (process_charging_data(df_6)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_6.csv'),
    (process_charging_data(df_7)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_7.csv'),
    (process_charging_data(df_8)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_8.csv'),
    (process_charging_data(df_9)[['t', 'dayofweek', 'weekend', 'y_t_1', 'y']], 'data/nikos_data/data_chg_9.csv')
]

for df, filename in data_frames:
    df.to_csv(filename, index=False)  # Save to CSV without row indices

8784
13104
12672
12816
13104
8496
8496
8496
8496


In [25]:
# Calculate the occupancy rate profiles for each DataFrame
data_chg_pred_occ_t_list = [
    (calculate_occupancy_rate(data_chg_1), 'data/nikos_data/data_chg_pred_occ_t_1.csv'),
    (calculate_occupancy_rate(data_chg_2), 'data/nikos_data/data_chg_pred_occ_t_2.csv'),
    (calculate_occupancy_rate(data_chg_3), 'data/nikos_data/data_chg_pred_occ_t_3.csv'),
    (calculate_occupancy_rate(data_chg_4), 'data/nikos_data/data_chg_pred_occ_t_4.csv'),
    (calculate_occupancy_rate(data_chg_5), 'data/nikos_data/data_chg_pred_occ_t_5.csv'),
    (calculate_occupancy_rate(data_chg_6), 'data/nikos_data/data_chg_pred_occ_t_6.csv'),
    (calculate_occupancy_rate(data_chg_7), 'data/nikos_data/data_chg_pred_occ_t_7.csv'),
    (calculate_occupancy_rate(data_chg_8), 'data/nikos_data/data_chg_pred_occ_t_8.csv'),
    (calculate_occupancy_rate(data_chg_9), 'data/nikos_data/data_chg_pred_occ_t_9.csv')
]

for df, filename in data_chg_pred_occ_t_list:
    df.to_csv(filename, index=False)  # Save to CSV without row indices