In [22]:
import pandas as pd

# Load the semicolon-separated measurements file.
# '?' entries are read as NaN here; nothing is dropped at this stage.
data = pd.read_csv(
    'household_power_consumption.txt',
    header=0,
    sep=';',
    low_memory=False,
    na_values='?',
)

# Time column cleaning: a value that stops at a trailing ':' is missing its
# seconds, so it gets '00' appended
def fix_time(t):
    """Return ``t`` with '00' appended when it is a string ending in ':'.

    Any other value (non-strings included) is returned unchanged.
    """
    needs_seconds = isinstance(t, str) and t.endswith(':')
    return t + '00' if needs_seconds else t

# Repair truncated time strings element by element
data['Time'] = data['Time'].map(fix_time)

# Build a single Datetime column from Date + Time (day comes first in the date);
# strings that cannot be parsed become NaT and those rows are discarded
data = data.assign(
    Datetime=pd.to_datetime(
        data['Date'] + ' ' + data['Time'],
        dayfirst=True,
        errors='coerce',
    )
)
data = data[data['Datetime'].notna()]

# Keep only the first year of measurements (both bounds included)
start = pd.Timestamp('2006-12-16 17:24:00')
end   = pd.Timestamp('2007-12-16 17:24:00')
data = data[data['Datetime'].between(start, end)]

# Keep the timestamp and the active-power column, under a friendlier name
data = data.loc[:, ['Datetime', 'Global_active_power']].rename(
    columns={'Global_active_power': 'Power Consumption (kW)'}
)

# Calendar / clock features derived from the timestamp
data = data.assign(
    Month=data['Datetime'].dt.month,
    Day=data['Datetime'].dt.day,
    Hour=data['Datetime'].dt.hour,
    Minute=data['Datetime'].dt.minute,
)

# Keep strictly positive readings only (NaN compares False, so it is dropped too)
data = data[data['Power Consumption (kW)'].gt(0)]

# Show first rows
print(data.head())



             Datetime  Power Consumption (kW)  Month  Day  Hour  Minute
0 2006-12-16 17:24:00                   4.216     12   16    17      24
1 2006-12-16 17:25:00                   5.360     12   16    17      25
2 2006-12-16 17:26:00                   5.374     12   16    17      26
3 2006-12-16 17:27:00                   5.388     12   16    17      27
4 2006-12-16 17:28:00                   3.666     12   16    17      28


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor


# Prepare the data for modeling: calendar/clock features -> active power
x = data[['Month', 'Day', 'Hour', 'Minute']]
y = data['Power Consumption (kW)']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Train on the RAW feature values, without standardization.
# - Tree splits are plain threshold comparisons on one feature at a time, so
#   scaling the inputs does not help a random forest.
# - `model` is exported to C with emlearn in a later cell. A forest fitted on
#   standardized inputs would require the device to reproduce the scaler's
#   mean/std, which the notebook never exports; fitted on raw values, the
#   generated code can be fed Month/Day/Hour/Minute directly.
# Plain arrays are passed so the fitted model carries no feature names.
model = RandomForestRegressor(n_estimators=10, max_depth=12, random_state=42)
model.fit(x_train.to_numpy(), y_train)
y_pred = model.predict(x_test.to_numpy())

# Evaluation metrics on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f} | R²: {r2:.2f}")

MSE: 0.59 | R²: 0.57


In [45]:
# Import necessary libraries for data handling and numerical operations
import pandas as pd
import numpy as np


# Read the semicolon-delimited dataset into a DataFrame.
# - na_values='?'     : '?' entries are treated as missing (NaN)
# - low_memory=False  : process the file in one pass rather than in chunks,
#                       which avoids mixed-type column inference
data = pd.read_csv(
    'household_power_consumption.txt',
    header=0,
    sep=';',
    low_memory=False,
    na_values='?',
)


# Helper that repairs time strings left dangling on a colon (e.g., '18:')
# by appending '00'.
# NOTE(review): this duplicates the fix_time already defined in the first cell.
def fix_time(t):
    """Append '00' to ``t`` if it is a string ending in ':'; else return ``t`` as is."""
    if not isinstance(t, str):
        return t
    if not t.endswith(':'):
        return t
    return t + '00'


# Clean the 'Time' column element-wise with fix_time
data['Time'] = data['Time'].map(fix_time)

# Merge 'Date' and 'Time' into one 'Datetime' column.
# - dayfirst=True   : dates are interpreted as day/month/year
# - errors='coerce' : unparseable strings become NaT
data = data.assign(
    Datetime=pd.to_datetime(
        data['Date'] + ' ' + data['Time'],
        dayfirst=True,
        errors='coerce',
    )
)

# Discard the rows whose timestamp could not be parsed (NaT)
data = data[data['Datetime'].notna()]

# Restrict to the window 2006-12-16 17:24 .. 2007-12-16 17:24, bounds included
start = pd.Timestamp('2006-12-16 17:24:00')
end   = pd.Timestamp('2007-12-16 17:24:00')
data = data[data['Datetime'].between(start, end)]

# Keep 'Datetime' and 'Global_active_power' (renamed to 'Power (kW)'),
# with the power column cast to float for the numeric work below
data = (
    data[['Datetime', 'Global_active_power']]
    .rename(columns={'Global_active_power': 'Power (kW)'})
    .astype({'Power (kW)': float})
)

# Hour of day, used to split the readings into day / night / morning
data['Hour'] = data['Datetime'].dt.hour

# Keep strictly positive power readings only
data = data[data['Power (kW)'].gt(0)]


# 'day'   : readings with hour 6..20 (inclusive)
# 'night' : readings with hour < 6 or hour > 20
day = data[(data['Hour'] >= 6) & (data['Hour'] <= 20)]
night = data[(data['Hour'] < 6) | (data['Hour'] > 20)]

# Minimum and maximum load during the day, rounded to 2 decimals
MIN_LOAD = round(day['Power (kW)'].min(), 2)
MAX_LOAD = round(day['Power (kW)'].max(), 2)

# Maximum load during the night
MAX_NIGHT_LOAD = round(night['Power (kW)'].max(), 2)

# 'morning' : readings with hour 6..13 (inclusive)
morning = data[(data['Hour'] >= 6) & (data['Hour'] <= 13)]


def _minute_steps(frame):
    """Return the power change between rows that are exactly one minute apart.

    A plain ``.diff()`` on an hour-filtered frame also subtracts across the
    gaps the filter creates (e.g. 13:59 of one day -> 06:00 of the next) and
    across rows removed during cleaning. Those pairs are not consecutive
    measurements, so they are excluded here. The first row has no predecessor
    and is excluded as well.
    """
    power_delta = frame['Power (kW)'].diff()
    is_adjacent = frame['Datetime'].diff() == pd.Timedelta(minutes=1)
    return power_delta[is_adjacent].dropna()


# Minute-to-minute changes within the morning window
step_day = _minute_steps(morning)

# Average positive increment in the morning, relative to the day range
# (MAX_LOAD - MIN_LOAD)
DAY_STEP = round(step_day[step_day > 0].mean() / (MAX_LOAD - MIN_LOAD), 3)

# Average absolute minute-to-minute change at night, relative to
# (MAX_NIGHT_LOAD - MIN_LOAD); note MIN_LOAD is the daytime minimum
step_night = _minute_steps(night)
NIGHT_STEP = round(step_night.abs().mean() / (MAX_NIGHT_LOAD - MIN_LOAD), 3)


# Print suggested C macro constants derived from the statistics above.
# The load values come from the 'Power (kW)' column, so they are in kilowatts;
# the emitted comments say so (they previously claimed watts).
# MAX values are rounded up and MIN is rounded down to whole kW.
print("Suggested macro values:\n")
print(f"#define MAX_LOAD {int(np.ceil(MAX_LOAD))} // Max load in kW")
print(f"#define MIN_LOAD {int(np.floor(MIN_LOAD))} // Min load in kW")
print(f"#define MAX_NIGHT_LOAD {int(np.ceil(MAX_NIGHT_LOAD))}")
print(f"#define DAY_STEP {DAY_STEP} // Relative increment")
print(f"#define NIGHT_STEP {NIGHT_STEP}")





Suggested macro values:

#define MAX_LOAD 11 // Max load in W
#define MIN_LOAD 0 // Min load in W
#define MAX_NIGHT_LOAD 10
#define DAY_STEP 0.015 // Relative increment
#define NIGHT_STEP 0.008


In [40]:
import emlearn

# Export the trained `model` as an inline C header for use on IoT devices
path = './load_prediction.h'

cmodel = emlearn.convert(model, method='inline')
cmodel.save(name='load_prediction', file=path)

print('Wrote model to', path)

Wrote model to ./load_prediction.h
