# Load SOMC_AC

In [1]:
import numpy as np
import pandas as pd

In [19]:
# Read the raw SOMC AC readings from the CSV export into a DataFrame.
# NOTE(review): the trailing "3600" in the file name is presumably the sampling interval in seconds — confirm.
path = "../Raw Data/SOMC_AC_20240505_20250505_3600.csv"
df = pd.read_csv(path)

# Show the first five rows to sanity-check the parsed columns.
df.head(5)

Unnamed: 0,toll_booth,gantry,data_time,ac_a_current,ac_b_current,rs485_temperature,mains_power_220_current,mains_power_l1_voltage,mains_power_l2_voltage,smr_dc_current,smr_dc_voltage
0,泰山,01F-000.5N,2024-06-01 00:00:05,0.1,0.2,16.4,5.6,79.3,155.3,36.0,26.7
1,泰山,01F-000.5S,2024-06-01 00:00:05,0.1,2.0,22.1,7.1,115.0,122.4,35.3,26.7
2,泰山,01F-001.7S,2024-06-01 00:00:05,0.1,1.0,19.6,8.8,115.6,115.7,55.5,26.8
3,泰山,01F-002.9S,2024-06-01 00:00:05,0.1,0.2,21.4,8.7,117.1,117.2,57.7,26.8
4,泰山,01F-006.1S,2024-06-01 00:00:05,1.9,0.2,17.7,11.3,116.7,118.4,66.4,26.8


# Filters the DataFrame to return only rows where the toll_booth matches the given name

In [6]:
def get_toll_booth_data(df, toll_booth_name):
    """Return the rows of ``df`` whose ``toll_booth`` column equals ``toll_booth_name``.

    Args:
        df (pd.DataFrame): Frame containing a ``toll_booth`` column.
        toll_booth_name (str): Value to match exactly.

    Returns:
        pd.DataFrame: The matching rows, keeping their original index labels
        (empty when nothing matches).
    """
    is_requested_booth = df['toll_booth'].eq(toll_booth_name)
    return df.loc[is_requested_booth]

In [20]:
# Example: keep only the readings of the 泰山 toll booth and look at its last five rows.
taishan_df = get_toll_booth_data(df, "泰山")
taishan_df.tail(5)

Unnamed: 0,toll_booth,gantry,data_time,ac_a_current,ac_b_current,rs485_temperature,mains_power_220_current,mains_power_l1_voltage,mains_power_l2_voltage,smr_dc_current,smr_dc_voltage
1703394,泰山,01H-033.3N,2025-05-04 23:00:30,1.7,0.1,19.9,7.1,116.6,116.4,36.8,26.4
1703395,泰山,01H-033.4S,2025-05-04 23:00:30,1.6,0.1,20.6,7.2,115.9,115.9,36.8,26.4
1703396,泰山,01H-044.7N,2025-05-04 23:00:30,1.4,0.1,18.9,6.9,116.2,116.1,38.6,26.8
1703397,泰山,01H-044.7S,2025-05-04 23:00:30,0.2,0.1,22.8,6.0,114.5,114.4,38.9,26.7
1703399,泰山,01H-057.9S,2025-05-04 23:00:32,1.5,0.1,17.6,7.4,115.9,115.6,40.8,26.7


## Aggregate reading by Day

In [13]:
# Parse the timestamp strings (a no-op if the column is already datetime64)
df['data_time'] = pd.to_datetime(df['data_time'])

# Calendar day of each reading. `.dt.normalize()` truncates to midnight and
# keeps the column as datetime64, instead of the object-dtype Python `date`
# values produced by `.dt.date` (slow to build and to group on, and they would
# have to be converted back to datetime later anyway).
df['date'] = df['data_time'].dt.normalize()

# Average every sensor column per toll booth, gantry and day
daily_agg = df.groupby(['toll_booth', 'gantry', 'date']).agg({
    'ac_a_current': 'mean',
    'ac_b_current': 'mean',
    'rs485_temperature': 'mean',
    'mains_power_220_current': 'mean',
    'mains_power_l1_voltage': 'mean',
    'mains_power_l2_voltage': 'mean',
    'smr_dc_current': 'mean',
    'smr_dc_voltage': 'mean'
}).reset_index()

In [25]:
# Preview the first 10 rows of the daily aggregate.
daily_agg.head(10)

Unnamed: 0,toll_booth,gantry,date,ac_a_current,ac_b_current,rs485_temperature,mains_power_220_current,mains_power_l1_voltage,mains_power_l2_voltage,smr_dc_current,smr_dc_voltage
0,后里,01F-104.5N,2024-06-01,0.1,2.095833,18.820833,11.554167,114.704167,115.029167,66.220833,26.758333
1,后里,01F-104.5N,2024-06-02,1.608333,0.1,18.158333,10.9,116.066667,116.4125,66.1375,26.75
2,后里,01F-104.5N,2024-06-03,1.579167,0.1,18.308333,10.7375,116.391667,116.704167,66.120833,26.7625
3,后里,01F-104.5N,2024-06-04,0.204167,1.516667,17.9875,10.845833,114.833333,115.083333,65.708333,26.758333
4,后里,01F-104.5N,2024-06-05,1.604167,0.1,17.9875,10.866667,114.891667,115.208333,65.408333,26.758333
5,后里,01F-104.5N,2024-06-06,0.1,1.6125,17.629167,10.85,115.295833,115.579167,65.804167,26.75
6,后里,01F-104.5N,2024-06-07,1.6875,0.1,18.308333,11.070833,115.004167,115.345833,66.116667,26.754167
7,后里,01F-104.5N,2024-06-08,0.1,2.029167,17.795833,11.508333,114.270833,114.554167,66.591667,26.766667
8,后里,01F-104.5N,2024-06-09,1.854167,0.1,18.6125,11.145833,116.245833,116.608333,66.204167,26.741667
9,后里,01F-104.5N,2024-06-10,2.158333,0.1,19.195833,11.516667,114.854167,115.241667,66.308333,26.733333


# Get all toll booths 

In [27]:
# Distinct toll booth names present in the daily aggregate.
unique_toll_booths = daily_agg['toll_booth'].unique()
print(unique_toll_booths)

['后里' '員林' '新市' '楊梅' '樹林' '泰山' '田寮']


# Toll Booths:

['后里' '員林' '新市' '楊梅' '樹林' '泰山' '田寮']

- 后里
- 員林
- 新市
- 楊梅
- 樹林
- 泰山
- 田寮

# Get the date range covered by the dataset

In [None]:
# Make sure 'date' is datetime64 so the extrema below are Timestamps
daily_agg['date'] = pd.to_datetime(daily_agg['date'])

# First and last day present in the daily aggregate
start_date, end_date = daily_agg['date'].min(), daily_agg['date'].max()

print(f"Dataset covers from {start_date.date()} to {end_date.date()}")


Dataset covers from 2024-06-01 to 2025-05-04


# Export aggregated Reading as csv

In [None]:
# Write the daily aggregate to the clean-data folder; index=False leaves the
# positional row index out of the file.
output_path = "../Clean Data/aggregated_day_SOMC_AC_20240505_20250505_3600.csv"
daily_agg.to_csv(output_path, index=False)

# Aggregate by month

In [56]:
# Start from the daily aggregate that was exported to disk
daily_df = pd.read_csv("../Clean Data/aggregated_day_SOMC_AC_20240505_20250505_3600.csv")

# The CSV round-trip yields strings; parse 'date' back into datetimes
daily_df['date'] = pd.to_datetime(daily_df['date'])

# Calendar-month label for every row (format: YYYY-MM)
daily_df['month'] = daily_df['date'].dt.to_period('M').astype(str)

# Average every sensor column per toll_booth / gantry / month.
# NOTE(review): this is a mean of daily means, so each day carries equal weight
# regardless of how many raw readings it contained — confirm that is intended.
monthly_agg = (
    daily_df
    .groupby(['toll_booth', 'gantry', 'month'])
    .agg(dict.fromkeys(
        [
            'ac_a_current',
            'ac_b_current',
            'rs485_temperature',
            'mains_power_220_current',
            'mains_power_l1_voltage',
            'mains_power_l2_voltage',
            'smr_dc_current',
            'smr_dc_voltage',
        ],
        'mean',
    ))
    .reset_index()
)

In [57]:
# Preview the first rows of the monthly aggregate.
monthly_agg.head()

Unnamed: 0,toll_booth,gantry,month,ac_a_current,ac_b_current,rs485_temperature,mains_power_220_current,mains_power_l1_voltage,mains_power_l2_voltage,smr_dc_current,smr_dc_voltage
0,后里,01F-104.5N,2024-06,1.055139,1.031389,20.154306,15.451111,118.868333,119.134861,65.935972,26.7475
1,后里,01F-104.5N,2024-07,1.060349,0.977554,21.93414,27.169892,129.722715,129.988844,65.906452,26.743414
2,后里,01F-104.5N,2024-08,1.022715,0.869489,22.820565,11.132258,115.007527,114.723118,66.077419,26.752419
3,后里,01F-104.5N,2024-09,1.034167,0.734306,23.279444,13.752083,117.823194,117.107222,66.108194,26.745417
4,后里,01F-104.5N,2024-10,0.761559,0.881989,23.460484,20.084409,124.584409,123.723118,65.858602,26.74422


In [58]:
# Write the monthly aggregate to the clean-data folder, without the row index.
output_path = "../Clean Data/aggregated_month_SOMC_AC_20240505_20250505_3600.csv"
monthly_agg.to_csv(output_path, index=False)

# Electricity Data of the toll station phase 1

In [38]:
# Raw electricity billing export for the toll stations (phase 1).
path = "../Raw Data/Electricity of the toll station-phase 1 simplified.csv"

# skiprows=1 drops the first line of the file, so the second line supplies the
# column names — presumably the first line is a title/banner row; verify.
electricity_toll_station_df = pd.read_csv(path, skiprows=1)

# Preview the first five rows.
electricity_toll_station_df.head(5)

Unnamed: 0,Location ID,Latitude,Longitude,Equipment Lane Count,Lane Count,Electricity Number,Billing Month,Electricity Consumption (kWh),Feeder Line Category,Power Outage Group,...,Lighting Usage Category,Number of Usage Days Last Year (Second Half),Billed kWh - Regular (Peak),Number of Usage Days Same Period Last Year,Average Electricity Consumption,Electricity Usage Same Period Last Year,Electricity Usage Last Year (Second Half),Number of Usage Days This Period,Unnamed: 21,Maintenance Station
0,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,11307,2453,ZQ55,C,...,B21,60,2453,61,2453,2457,2472,59,01F-023.3N,北區泰山
1,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,11309,2672,ZQ55,C,...,B21,61,2672,60,2672,2472,2375,63,01F-023.3N,北區泰山
2,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,11311,2605,ZQ55,C,...,B21,63,2605,61,2605,2375,2573,62,01F-023.3N,北區泰山
3,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,11401,2294,ZQ55,C,...,B21,61,2294,63,2294,2573,2418,58,01F-023.3N,北區泰山
4,01F0256N,25.078067,121.509106,6.0,4.0,00-21-9981-00-1,11305,2645,SA39,A,...,B21,61,2645,56,983,2213,2454,66,01F-025.6N,北區泰山


In [41]:
# Get all unique Maintenance Stations, to see which raw labels need normalizing

unique_stations = electricity_toll_station_df['Maintenance Station'].unique()
print(unique_stations)

['北區泰山' '北區樹林' '北區楊梅' '行政部門' '中區后里' '中區員林' '南區新市' '南區田寮' 'IT機房']


# Normalize the Maintenance Station names

In [43]:
# Canonical station names; these are the toll_booth values used by the SOMC data
valid_stations = ['后里', '員林', '新市', '楊梅', '樹林', '泰山', '田寮']


def clean_station_name(name, stations=None):
    """Map a raw 'Maintenance Station' label to its canonical station name.

    A label matches a station when the station name occurs anywhere inside it
    (e.g. '北區泰山' -> '泰山'). Candidates are tried in list order and the
    first match wins.

    Args:
        name: Raw label. Anything that is not a ``str`` (NaN, None, numbers)
            is treated as unmatched.
        stations: Optional iterable of canonical names to search for;
            defaults to the module-level ``valid_stations``.

    Returns:
        str | None: The matching canonical name, or ``None`` when there is no
        match, so unwanted rows can be removed with ``dropna``.
    """
    if not isinstance(name, str):
        # Missing cells arrive as float NaN / None; `station in name` would
        # raise TypeError on them.
        return None
    candidates = valid_stations if stations is None else stations
    return next((station for station in candidates if station in name), None)

# Add the canonical 'Station' column, then discard rows whose label matched no
# station (clean_station_name returned None, e.g. 行政部門, IT機房)
electricity_toll_station_df = (
    electricity_toll_station_df
    .assign(Station=electricity_toll_station_df['Maintenance Station'].apply(clean_station_name))
    .dropna(subset=['Station'])
)

# Sanity check: the station names that survived the filter
print(electricity_toll_station_df['Station'].unique())


['泰山' '樹林' '楊梅' '后里' '員林' '新市' '田寮']


# Normalize Dates

In [49]:
# Convert a Minguo (ROC) date code (e.g. 11307 or 1130624) to Gregorian YYYY-MM or YYYY-MM-DD
def convert_minguo_date(value):
    """Convert a Minguo-calendar date code to a Gregorian date string.

    Supported inputs (as int, float or numeric string):
      * 5 digits ``YYYMM``   -> ``'YYYY-MM'``     (11307   -> '2024-07')
      * 7 digits ``YYYMMDD`` -> ``'YYYY-MM-DD'``  (1130624 -> '2024-06-24')

    The Gregorian year is the Minguo year plus 1911.

    Strings that already look like ``'YYYY-MM'`` or ``'YYYY-MM-DD'`` are
    returned unchanged, so applying the conversion a second time (e.g. when
    the notebook cell is re-run) does not raise or destroy data.

    Args:
        value: Date code, already-converted string, or a missing value.

    Returns:
        str | None: The Gregorian string, or ``None`` for missing values,
        non-numeric input, unsupported lengths, or an impossible month/day.
    """
    if pd.isna(value):
        return None

    if isinstance(value, str):
        text = value.strip()
        parts = text.split('-')
        looks_gregorian = (
            len(parts) in (2, 3)
            and len(parts[0]) == 4
            and all(len(part) == 2 for part in parts[1:])
            and all(part.isdigit() for part in parts)
        )
        if looks_gregorian:
            return text

    try:
        # float() first so that values such as 11307.0 or "11307.0" are accepted
        digits = str(int(float(value)))
    except (TypeError, ValueError, OverflowError):
        return None
    if not digits.isdigit():  # negative numbers keep a leading '-'
        return None

    if len(digits) == 5:  # e.g., 11307 → YYYY-MM
        year = int(digits[:3]) + 1911
        month = int(digits[3:])
        if not 1 <= month <= 12:
            return None
        return f"{year:04d}-{month:02d}"

    if len(digits) == 7:  # e.g., 1130624 → YYYY-MM-DD
        year = int(digits[:3]) + 1911
        month = int(digits[3:5])
        day = int(digits[5:])
        try:
            # Let pandas reject impossible calendar dates (month 13, Feb 30, ...)
            pd.Timestamp(year=year, month=month, day=day)
        except ValueError:
            return None
        return f"{year:04d}-{month:02d}-{day:02d}"

    return None

In [50]:
# Billing columns that hold Minguo-calendar date codes
minguo_date_columns = [
    'Billing Month',
    'Billing Period (Start)',
    'Billing Period (End)',
    'Billing Date',
]

# Overwrite each of those columns with its Gregorian string form
for col in minguo_date_columns:
    electricity_toll_station_df[col] = electricity_toll_station_df[col].map(convert_minguo_date)

# Show the converted columns for the first rows
print(electricity_toll_station_df[minguo_date_columns].head())

  Billing Month Billing Period (Start) Billing Period (End) Billing Date
0       2024-07             2024-04-29           2024-06-26   2024-07-03
1       2024-09             2024-06-27           2024-08-28   2024-09-04
2       2024-11             2024-08-29           2024-10-29   2024-11-05
3       2025-01             2024-10-30           2024-12-26   2025-01-06
4       2024-05             2024-02-27           2024-05-02   2024-05-09


In [51]:
# Preview the frame after station and date normalization.
electricity_toll_station_df.head()

Unnamed: 0,Location ID,Latitude,Longitude,Equipment Lane Count,Lane Count,Electricity Number,Billing Month,Electricity Consumption (kWh),Feeder Line Category,Power Outage Group,...,Number of Usage Days Last Year (Second Half),Billed kWh - Regular (Peak),Number of Usage Days Same Period Last Year,Average Electricity Consumption,Electricity Usage Same Period Last Year,Electricity Usage Last Year (Second Half),Number of Usage Days This Period,Unnamed: 21,Maintenance Station,Station
0,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,2024-07,2453,ZQ55,C,...,60,2453,61,2453,2457,2472,59,01F-023.3N,北區泰山,泰山
1,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,2024-09,2672,ZQ55,C,...,61,2672,60,2672,2472,2375,63,01F-023.3N,北區泰山,泰山
2,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,2024-11,2605,ZQ55,C,...,63,2605,61,2605,2375,2573,62,01F-023.3N,北區泰山,泰山
3,01F0233N,25.073019,121.530703,5.0,4.0,00-03-9910-00-3,2025-01,2294,ZQ55,C,...,61,2294,63,2294,2573,2418,58,01F-023.3N,北區泰山,泰山
4,01F0256N,25.078067,121.509106,6.0,4.0,00-21-9981-00-1,2024-05,2645,SA39,A,...,61,2645,56,983,2213,2454,66,01F-025.6N,北區泰山,泰山


# Export the cleaned toll-station electricity data

In [54]:
# Write the cleaned billing data to the clean-data folder, without the row index.
output_path = "../Clean Data/cleaned Electricity of the toll station-phase 1.csv"
electricity_toll_station_df.to_csv(output_path, index=False)

# Compare Gantries

In [67]:
# Reload the cleaned billing data from disk.
# NOTE(review): the preview below lists a 'gantry' column, whereas the frame
# exported above displayed that field as 'Unnamed: 21' — presumably the CSV was
# edited outside the notebook; confirm and move that rename into code.
electricity_path = "../Clean Data/cleaned Electricity of the toll station-phase 1.csv"
electricity_df = pd.read_csv(electricity_path)

electricity_df.head()

Unnamed: 0,Latitude,Longitude,Equipment Lane Count,Lane Count,Billing Month,Electricity Consumption (kWh),Feeder Line Category,Power Outage Group,Billing Period (Start),Billing Period (End),Lighting Usage Category,Number of Usage Days Last Year (Second Half),Billed kWh - Regular (Peak),Number of Usage Days Same Period Last Year,Average Electricity Consumption,Electricity Usage Same Period Last Year,Electricity Usage Last Year (Second Half),Number of Usage Days This Period,gantry,Station
0,25.073019,121.530703,5.0,4.0,2024-07,2453,ZQ55,C,2024-04-29,2024-06-26,B21,60,2453,61,2453,2457,2472,59,01F-023.3N,泰山
1,25.073019,121.530703,5.0,4.0,2024-09,2672,ZQ55,C,2024-06-27,2024-08-28,B21,61,2672,60,2672,2472,2375,63,01F-023.3N,泰山
2,25.073019,121.530703,5.0,4.0,2024-11,2605,ZQ55,C,2024-08-29,2024-10-29,B21,63,2605,61,2605,2375,2573,62,01F-023.3N,泰山
3,25.073019,121.530703,5.0,4.0,2025-01,2294,ZQ55,C,2024-10-30,2024-12-26,B21,61,2294,63,2294,2573,2418,58,01F-023.3N,泰山
4,25.078067,121.509106,6.0,4.0,2024-05,2645,SA39,A,2024-02-27,2024-05-02,B21,61,2645,56,983,2213,2454,66,01F-025.6N,泰山


In [62]:
# Reload the monthly SOMC aggregate from disk.
somc_path = "../Clean Data/aggregated_month_SOMC_AC_20240505_20250505_3600.csv"
somc_df = pd.read_csv(somc_path)

somc_df.head()

Unnamed: 0,toll_booth,gantry,month,ac_a_current,ac_b_current,rs485_temperature,mains_power_220_current,mains_power_l1_voltage,mains_power_l2_voltage,smr_dc_current,smr_dc_voltage
0,后里,01F-104.5N,2024-06,1.055139,1.031389,20.154306,15.451111,118.868333,119.134861,65.935972,26.7475
1,后里,01F-104.5N,2024-07,1.060349,0.977554,21.93414,27.169892,129.722715,129.988844,65.906452,26.743414
2,后里,01F-104.5N,2024-08,1.022715,0.869489,22.820565,11.132258,115.007527,114.723118,66.077419,26.752419
3,后里,01F-104.5N,2024-09,1.034167,0.734306,23.279444,13.752083,117.823194,117.107222,66.108194,26.745417
4,后里,01F-104.5N,2024-10,0.761559,0.881989,23.460484,20.084409,124.584409,123.723118,65.858602,26.74422


In [66]:
# Number of leading rows to list from each dataset
to_print = 50

# Gantry IDs as they appear in the billing data ...
print(electricity_df['gantry'].head(to_print))

print("\n\n")

# ... and in the SOMC data, to compare the two ID formats by eye
print(somc_df['gantry'].head(to_print))

0      01F-023.3N
1      01F-023.3N
2      01F-023.3N
3      01F-023.3N
4      01F-025.6N
5      01F-025.6N
6      01F-025.6N
7      01F-025.6N
8      01F-025.6N
9      01H-020.6S
10     01H-020.6S
11     01H-020.6S
12     01H-020.6S
13     01F-021.3N
14     01F-021.3N
15     01F-021.3N
16     01F-021.3N
17     01H-020.0N
18     01H-020.0N
19     01H-020.0N
20     01H-020.0N
21     01H-020.8N
22     01H-020.8N
23     01H-020.8N
24     01H-020.8N
25    03A-001.5SN
26    03A-001.5SN
27    03A-001.5SN
28    03A-001.5SN
29    03F-030.1SN
30    03F-030.1SN
31    03F-030.1SN
32    03F-030.1SN
33    03F-030.1SN
34    03F-030.1SN
35    03F-030.1SN
36    03F-030.1SN
37     03F-033.7S
38     03F-033.7S
39     03F-033.7S
40     03F-033.7S
41     03F-033.8N
42     03F-033.8N
43     03F-033.8N
44     03F-033.8N
45    03F-030.1SN
46    03F-030.1SN
47    03F-030.1SN
48    03F-030.1SN
49    03F-039.4SN
Name: gantry, dtype: object



0     01F-104.5N
1     01F-104.5N
2     01F-104.5N
3     01F-104.5N
4

In [69]:
def get_monthly_entries(toll_booth, gantry, month, electricity_df, somc_df):
    """
    Look up one toll booth / gantry / month combination in both datasets.

    The billing data is filtered on its 'Station', 'gantry' and 'Billing Month'
    columns, the SOMC data on 'toll_booth', 'gantry' and 'month'. Every
    comparison is an exact equality and all three must hold for a row to match.

    Args:
        toll_booth (str): The name of the toll booth (e.g., '泰山')
        gantry (str): The gantry ID (e.g., '01F-023.3N')
        month (str): The month in 'YYYY-MM' format (e.g., '2024-07')
        electricity_df (pd.DataFrame): Cleaned electricity billing data
        somc_df (pd.DataFrame): Aggregated SOMC readings per month

    Returns:
        tuple: (electricity_row(s), somc_row(s)) as DataFrames; either may be
        empty when nothing matches.
    """
    billing_mask = (
        electricity_df['Station'].eq(toll_booth)
        & electricity_df['gantry'].eq(gantry)
        & electricity_df['Billing Month'].eq(month)
    )
    sensor_mask = (
        somc_df['toll_booth'].eq(toll_booth)
        & somc_df['gantry'].eq(gantry)
        & somc_df['month'].eq(month)
    )
    return electricity_df.loc[billing_mask], somc_df.loc[sensor_mask]


In [82]:
# Reload both cleaned CSVs from disk (unconditional: any frames already in
# memory under these names are replaced)
electricity_path = "../Clean Data/cleaned Electricity of the toll station-phase 1.csv"
electricity_df = pd.read_csv(electricity_path)

somc_path = "../Clean Data/aggregated_month_SOMC_AC_20240505_20250505_3600.csv"
somc_df = pd.read_csv(somc_path)

# Query for a specific example: 泰山 / gantry 01F-023.3N / July 2024
elec_row, somc_row = get_monthly_entries("泰山", "01F-023.3N", "2024-07", electricity_df, somc_df)


In [83]:
# Billing row(s) matched by the example query.
elec_row

Unnamed: 0,Latitude,Longitude,Equipment Lane Count,Lane Count,Billing Month,Electricity Consumption (kWh),Feeder Line Category,Power Outage Group,Billing Period (Start),Billing Period (End),Lighting Usage Category,Number of Usage Days Last Year (Second Half),Billed kWh - Regular (Peak),Number of Usage Days Same Period Last Year,Average Electricity Consumption,Electricity Usage Same Period Last Year,Electricity Usage Last Year (Second Half),Number of Usage Days This Period,gantry,Station
0,25.073019,121.530703,5.0,4.0,2024-07,2453,ZQ55,C,2024-04-29,2024-06-26,B21,60,2453,61,2453,2457,2472,59,01F-023.3N,泰山


In [84]:
# SOMC row(s) matched by the example query.
somc_row

Unnamed: 0,toll_booth,gantry,month,ac_a_current,ac_b_current,rs485_temperature,mains_power_220_current,mains_power_l1_voltage,mains_power_l2_voltage,smr_dc_current,smr_dc_voltage
1909,泰山,01F-023.3N,2024-07,0.930511,0.789785,20.134543,13.735349,117.693011,117.785618,44.797043,26.475


In [76]:
# Distinct (toll_booth, gantry) pairs found in the monthly SOMC data, ordered
# by booth and then gantry, and renumbered from 0 for a tidy listing
unique_gantries = (
    somc_df[['toll_booth', 'gantry']]
    .drop_duplicates()
    .sort_values(['toll_booth', 'gantry'])
    .reset_index(drop=True)
)

# Show the pairs, then save them alongside the other cleaned data
print(unique_gantries)
unique_gantries.to_csv("../Clean Data/list_of_gantries_by_toll_booth.csv", index=False)



    toll_booth      gantry
0           后里  01F-104.5N
1           后里  01F-112.3N
2           后里  01F-129.2S
3           后里  01F-138.9S
4           后里  01F-146.5N
..         ...         ...
205         田寮  03F-416.8N
206         田寮  03F-416.8S
207         田寮  03F-423.2S
208         田寮  03F-425.9N
209         田寮  03F-426.3S

[210 rows x 2 columns]


# Reduce dates from YYYY-MM to month only (MM)

In [110]:
# Re-read the monthly SOMC aggregate from disk
somc_path = "../Clean Data/aggregated_month_SOMC_AC_20240505_20250505_3600.csv"
somc_df = pd.read_csv(somc_path)

# NOTE(review): the recorded output shows `month` as a bare month number (6),
# not the 'YYYY-MM' string written by the export cell — presumably the file was
# changed outside this notebook; confirm.
print(somc_df.head(1))

  toll_booth      gantry  month  ac_a_current  ac_b_current  \
0         后里  01F-104.5N      6      1.055139      1.031389   

   rs485_temperature  mains_power_220_current  mains_power_l1_voltage  \
0          20.154306                15.451111              118.868333   

   mains_power_l2_voltage  smr_dc_current  smr_dc_voltage  
0              119.134861       65.935972         26.7475  


In [93]:
# Reload the cleaned billing data so the steps below start from the file on disk.
electricity_path = "../Clean Data/cleaned Electricity of the toll station-phase 1.csv"
electricity_df = pd.read_csv(electricity_path)
electricity_df.head()

Unnamed: 0,Latitude,Longitude,Equipment Lane Count,Lane Count,Billing Month,Electricity Consumption (kWh),Feeder Line Category,Power Outage Group,Billing Period (Start),Billing Period (End),Lighting Usage Category,Number of Usage Days Last Year (Second Half),Billed kWh - Regular (Peak),Number of Usage Days Same Period Last Year,Average Electricity Consumption,Electricity Usage Same Period Last Year,Electricity Usage Last Year (Second Half),Number of Usage Days This Period,gantry,Station
0,25.073019,121.530703,5.0,4.0,2024-07,2453,ZQ55,C,2024-04-29,2024-06-26,B21,60,2453,61,2453,2457,2472,59,01F-023.3N,泰山
1,25.073019,121.530703,5.0,4.0,2024-09,2672,ZQ55,C,2024-06-27,2024-08-28,B21,61,2672,60,2672,2472,2375,63,01F-023.3N,泰山
2,25.073019,121.530703,5.0,4.0,2024-11,2605,ZQ55,C,2024-08-29,2024-10-29,B21,63,2605,61,2605,2375,2573,62,01F-023.3N,泰山
3,25.073019,121.530703,5.0,4.0,2025-01,2294,ZQ55,C,2024-10-30,2024-12-26,B21,61,2294,63,2294,2573,2418,58,01F-023.3N,泰山
4,25.078067,121.509106,6.0,4.0,2024-05,2645,SA39,A,2024-02-27,2024-05-02,B21,61,2645,56,983,2213,2454,66,01F-025.6N,泰山


In [94]:
# Current column order
cols = electricity_df.columns.tolist()

# Identifier / location columns that should lead the frame
front = ['Station', 'gantry', 'Latitude', 'Longitude']

# Everything else, in its existing relative order
remaining = list(filter(lambda name: name not in front, cols))

# Rebuild the frame with the leading columns first
electricity_df = electricity_df.loc[:, front + remaining]


In [96]:
# Parse the 'YYYY-MM' billing-month values into datetimes
electricity_df['Billing Month'] = pd.to_datetime(electricity_df['Billing Month'], format='%Y-%m')

# Earliest and latest billing months present
start_month, end_month = electricity_df['Billing Month'].min(), electricity_df['Billing Month'].max()

print(f"Billing data covers from {start_month.strftime('%Y-%m')} to {end_month.strftime('%Y-%m')}")


Billing data covers from 2024-05 to 2025-01


In [98]:
# Replace 'Billing Month' by its two-digit month ('01'..'12').
# NOTE(review): this discards the year, so the same month of different years
# becomes indistinguishable — confirm that is acceptable for the model.
billing_month = electricity_df['Billing Month']

# Guard against re-running this cell: if every non-missing value is already a
# two-digit string, parsing it as a date again would corrupt the column.
already_month_only = billing_month.dropna().astype(str).str.fullmatch(r'\d{2}').all()

if not already_month_only:
    # Unparseable values become NaT and therefore a missing month
    electricity_df['Billing Month'] = (
        pd.to_datetime(billing_month, errors='coerce').dt.strftime('%m')
    )


In [109]:
# Inspect the first row after reordering columns and reducing the billing month.
print(electricity_df.head(1))

  Station      gantry   Latitude   Longitude  Equipment Lane Count  \
0      泰山  01F-023.3N  25.073019  121.530703                   5.0   

   Lane Count Billing Month  Electricity Consumption (kWh)  \
0         4.0            07                           2453   

  Feeder Line Category Power Outage Group Billing Period (Start)  \
0                 ZQ55                  C             2024-04-29   

  Billing Period (End) Lighting Usage Category  \
0           2024-06-26                     B21   

  Number of Usage Days Last Year (Second Half) Billed kWh - Regular (Peak)  \
0                                           60                        2453   

  Number of Usage Days Same Period Last Year Average Electricity Consumption  \
0                                         61                            2453   

  Electricity Usage Same Period Last Year  \
0                                    2457   

  Electricity Usage Last Year (Second Half)  Number of Usage Days This Period  
0      

In [102]:
# Save the reordered, month-only billing data. index=False keeps the
# positional row index out of the file (as in the other exports in this
# notebook), so re-reading it does not produce a spurious 'Unnamed: 0' column.
electricity_path = "../Clean Data/more cleaned Electricity of the toll station.csv"
electricity_df.to_csv(electricity_path, index=False)

X = latitude, longitude, lane count, month => training features (model input)
Y = power consumption for that month => target (model output)

# Prepare Electricity Dataset for Training

In [None]:
# Load the billing data used for training.
# NOTE(review): this file name ("more more more cleaned ...") is not written by
# any visible cell — the export above writes "more cleaned ..." — so it was
# presumably produced outside this notebook; confirm and capture those steps in code.
electricity_path = "../Clean Data/more more more cleaned Electricity of the toll station.csv"
electricity_df = pd.read_csv(electricity_path)

electricity_df.head()

Unnamed: 0,Station,gantry,Latitude,Longitude,Equipment Lane Count,Lane Count,Billing Month,Electricity Consumption (kWh),Feeder Line Category,Power Outage Group,Billing Period (Start),Billing Period (End),Lighting Usage Category,Number of Usage Days Last Year (Second Half),Billed kWh - Regular (Peak),Number of Usage Days Same Period Last Year,Average Electricity Consumption,Electricity Usage Same Period Last Year,Electricity Usage Last Year (Second Half),Number of Usage Days This Period
0,泰山,01F-023.3N,25.073019,121.530703,5.0,4.0,7,2453,ZQ55,C,2024-04-29,2024-06-26,B21,60,2453,61,2453,2457,2472,59
1,泰山,01F-023.3N,25.073019,121.530703,5.0,4.0,9,2672,ZQ55,C,2024-06-27,2024-08-28,B21,61,2672,60,2672,2472,2375,63
2,泰山,01F-023.3N,25.073019,121.530703,5.0,4.0,11,2605,ZQ55,C,2024-08-29,2024-10-29,B21,63,2605,61,2605,2375,2573,62
3,泰山,01F-023.3N,25.073019,121.530703,5.0,4.0,1,2294,ZQ55,C,2024-10-30,2024-12-26,B21,61,2294,63,2294,2573,2418,58
4,泰山,01F-025.6N,25.078067,121.509106,6.0,4.0,5,2645,SA39,A,2024-02-27,2024-05-02,B21,61,2645,56,983,2213,2454,66
