In [2]:
import numpy as np
import pandas as pd

In [13]:
# Function to extract the month number from a Minguo-encoded date value
def convert_minguo_date(value):
    """Return the month encoded in the last two digits of a 5-digit value.

    The value is assumed to be laid out as YYYMM (three-digit Minguo year
    followed by a two-digit month), e.g. ``11302`` -> ``2``.

    Parameters
    ----------
    value : number or numeric string
        The encoded value. Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    int or None
        The month (1-12), or ``None`` when the value is missing, cannot be
        converted to an integer, is not exactly five digits long, or its
        month part lies outside 1-12.
    """
    if pd.isna(value):
        return None
    try:
        value_str = str(int(value))
    except (ValueError, TypeError, OverflowError):
        # Non-numeric text or infinity: report "no month" instead of raising,
        # the same way convert_money_to_float treats unparseable input.
        return None
    # isdigit() rejects negative numbers, whose "-" sign would otherwise be
    # counted as one of the five characters.
    if len(value_str) != 5 or not value_str.isdigit():
        return None
    month = int(value_str[3:])
    # A month part such as 00 or 13 is not a calendar month.
    if 1 <= month <= 12:
        return month
    return None
# Function to convert from "999.0元" to 999.0
def convert_money_to_float(value):
    """Parse an amount such as ``"999.0元"`` into the float ``999.0``.

    Every "元" character is removed from the text form of ``value``,
    surrounding whitespace is trimmed and the remainder is parsed with
    ``float``.

    Returns
    -------
    float or None
        The parsed amount, or ``None`` when the text cannot be parsed.
    """
    try:
        amount_text = str(value).replace("元", "")
        amount = float(amount_text.strip())
    except (ValueError, TypeError):
        amount = None
    return amount


In [18]:
data_path = "../Raw Data/Electricity data of the toll station-phase 1.csv"
# skiprows=1: the first line of the file is skipped, so the header is read
# from the second line.
raw_data = pd.read_csv(data_path, skiprows=1)

# Extract useful columns
columns = [
    "Latitude",
    "Longitude",
    "Lane Count",
    "Billing Month",
    "Electricity Consumption (kWh)",
    "Total Amount Payable (Before Tax)",
]
data_extracted = raw_data[columns].copy()

# Convert Billing Month (Minguo-encoded value) to a month number;
# values the converter cannot parse become missing.
data_extracted["Billing Month"] = data_extracted["Billing Month"].apply(convert_minguo_date)

# Convert money text such as "999.0元" to float;
# values the converter cannot parse become missing.
data_extracted["Total Amount Payable (Before Tax)"] = data_extracted["Total Amount Payable (Before Tax)"].apply(convert_money_to_float)

# Drop every ROW that has a missing value in any extracted column. This also
# removes the rows the two converters above could not parse. Empty strings are
# not missing values for dropna() and are not removed by this step.
data_extracted = data_extracted.dropna()

# Missing results from the converter can leave this column as float/object.
# Every remaining value is a whole month number, so store it as an integer;
# the exported Month column is then always written as e.g. 3, never 3.0.
data_extracted["Billing Month"] = data_extracted["Billing Month"].astype(int)

# Divide the rows by month because billing is done every two months
data_divided = data_extracted.loc[data_extracted.index.repeat(2)].copy()  # Duplicate each row once

# 0 for the first copy of a bill, 1 for the second copy
month_offset = data_divided.groupby(level=0).cumcount()

# Assign previous months: a bill with Billing Month M yields months M-2 and M-3.
# NOTE(review): if a bill is meant to cover the two months immediately before
# the billing month (M-1 and M-2), the constant below should be 1 — confirm
# against the billing-period definition of the data source.
data_divided['Month'] = data_divided['Billing Month'] - month_offset - 2
# Keep months correctly wrapped around the year: mod 12 maps e.g. -1 -> 11,
# and a result of 0 stands for December.
data_divided['Month'] = data_divided['Month'].mod(12).replace(0, 12)

# Divide the consumption and amount by two (even split across the two months)
data_divided['Estimated Monthly Consumption (kWh)'] = data_divided['Electricity Consumption (kWh)'] / 2
data_divided['Estimated Monthly Amount (Before Tax)'] = data_divided['Total Amount Payable (Before Tax)'] / 2

# Generate final df
final_columns = [
    'Latitude',
    'Longitude',
    'Lane Count',
    'Month',
    'Estimated Monthly Consumption (kWh)',
    'Estimated Monthly Amount (Before Tax)',
]
processed_data = (
    data_divided[final_columns]
    .sort_values(by=['Latitude', 'Longitude', 'Month'])
    .reset_index(drop=True)
)

# Export Data
processed_data.to_csv("../Clean Data/Clean_Electricity_Data_With_Cost.csv", index=False)

## Add Climate Type Predictions

In [3]:
import numpy as np
import pandas as pd
import joblib

# Load model
# The file is a joblib export (presumably a k-nearest-neighbours model, going
# by its file name — confirm against the training notebook).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
climate_model_path = "../Model/Exports/Climate Model/knn_model.joblib"
climate_model = joblib.load(climate_model_path)

# Define function
def predict_climate_score_row(row):
    """Predict the climate value for a single row with ``climate_model``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Latitude', 'Longitude' and 'Month'.

    Returns
    -------
    float
        The first element of the model's prediction for this row.
    """
    feature_values = [row[key] for key in ('Latitude', 'Longitude', 'Month')]
    # The model is queried with a single-row, three-column float array.
    query = np.array([feature_values], dtype=float)
    prediction = climate_model.predict(query)
    return float(prediction[0])

# Predict Climate Type
df = pd.read_csv("../Clean Data/Clean_Electricity_Data_With_Cost.csv")

# Query the model once for all rows instead of once per row: the features are
# the same three columns, in the same order, that the per-row helper sends.
# Assumes climate_model.predict accepts an (n_rows, 3) array and returns one
# prediction per row (scikit-learn convention) — confirm for custom models.
climate_features = df[["Latitude", "Longitude", "Month"]].to_numpy(dtype=float)
climate_predictions = climate_model.predict(climate_features)
# Stored as float, matching the float(...) conversion of the per-row helper.
df["Climate Type"] = np.asarray(climate_predictions, dtype=float).ravel()

# Reorder the DataFrame
column_order = [
    "Latitude",
    "Longitude",
    "Lane Count",
    "Month",
    "Climate Type",
    "Estimated Monthly Consumption (kWh)",
    "Estimated Monthly Amount (Before Tax)"
]
df = df[column_order]

# Export
df.to_csv("../Clean Data/Clean_Regression_Dataset_With_Cost.csv", index=False)