In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
# Read both CSV files once; the `og*` frames are the as-loaded originals.
ogTrainData, ogTestData = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Work on a copy so the as-loaded training frame stays available unchanged.
trainData = ogTrainData.copy()

In [67]:
def edit_data(df):
    """Add calendar, consumption, radiation and wind features to ``df``.

    The frame is modified **in place** and also returned, so both
    ``edit_data(df)`` and ``df = edit_data(df)`` work. The function is safe to
    call again on a frame it has already processed (e.g. when a notebook cell
    is re-run): the rename becomes a no-op and every derived column is simply
    recomputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``measurement_time``, the four ``source_N_temperature``
        columns (or their renamed ``heater-*`` / ``hvac-*`` equivalents),
        ``outside_temperature``, ``mean_room_temperature``, the four
        ``sun_radiation_{east,west,south,north}`` columns, ``clouds``,
        ``wind_speed`` and ``wind_direction``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the extra columns added.
    """
    df.rename(columns={
        "source_1_temperature": "heater-1",
        "source_2_temperature": "hvac-1",
        "source_3_temperature": "hvac-2",
        "source_4_temperature": "heater-2",
    }, inplace=True)

    # --- calendar features -------------------------------------------------
    df["measurement_time"] = pd.to_datetime(df["measurement_time"])
    timestamps = df["measurement_time"]

    hour_of_day = timestamps.dt.hour
    if "time" in df.columns:
        # Already processed once: DataFrame.insert would raise ValueError on a
        # duplicate column name, so overwrite the existing column instead.
        df["time"] = hour_of_day
    else:
        df.insert(1, "time", hour_of_day)

    df["day_of_week"] = timestamps.dt.dayofweek
    df["is_weekend"] = df["day_of_week"] >= 5  # dayofweek: Monday=0 ... Sunday=6
    df["month"] = timestamps.dt.month

    # --- consumption / temperature features --------------------------------
    df["total_consumption"] = df[["heater-1", "heater-2", "hvac-1", "hvac-2"]].sum(axis=1)
    df["temp_diff"] = df["outside_temperature"] - df["mean_room_temperature"]
    df["total_hvac_consump"] = df[["hvac-1", "hvac-2"]].sum(axis=1)

    # --- solar radiation features ------------------------------------------
    radiation_cols = ["sun_radiation_east", "sun_radiation_west", "sun_radiation_south", "sun_radiation_north"]
    df["dominant_radiation"] = df[radiation_cols].max(axis=1)
    df["total_solar_radiation"] = df[radiation_cols].sum(axis=1)
    # NOTE(review): assumes `clouds` is a fraction in [0, 1]; if it is stored as
    # a percentage this goes negative — TODO confirm against the data dictionary.
    df["adjusted_radiation"] = df["total_solar_radiation"] * (1 - df["clouds"])

    # --- wind vector components --------------------------------------------
    # NOTE(review): np.radians presumes `wind_direction` is in degrees — confirm.
    df["wind_direction_radians"] = np.radians(df["wind_direction"])
    df["wind_y"] = df["wind_speed"] * np.sin(df["wind_direction_radians"])
    df["wind_x"] = df["wind_speed"] * np.cos(df["wind_direction_radians"])
    return df



# Full column set handed to the model (order determines the column order of X).
features = [
    "time",
    "heater-1", "hvac-1", "hvac-2", "heater-2",
    "mean_room_temperature",
    "sun_radiation_east", "sun_radiation_west",
    "sun_radiation_south", "sun_radiation_north",
    "sun_radiation_perpendicular",
    "outside_temperature",
    "wind_speed", "wind_direction",
    "day_of_week", "is_weekend", "month",
    "total_consumption",
    "temp_diff",
    "total_hvac_consump",
    "dominant_radiation", "total_solar_radiation", "adjusted_radiation",
    "wind_direction_radians", "wind_y", "wind_x",
]

# Columns available before edit_data adds anything (after the source_* rename).
# Unlike `features`, this list includes "clouds".
base_features = [
    "heater-1", "hvac-1", "heater-2", "hvac-2",
    "sun_radiation_south", "sun_radiation_north",
    "mean_room_temperature",
    "sun_radiation_east", "sun_radiation_west",
    "sun_radiation_perpendicular",
    "outside_temperature",
    "wind_speed", "wind_direction",
    "clouds",
]

# Thematic groupings of the columns above.

# Calendar / clock columns
time_features = ["time", "day_of_week", "is_weekend", "month"]

# Heater and HVAC columns
hvac_heater_features = [
    "heater-1", "hvac-1", "hvac-2", "heater-2",
    "total_hvac_consump",
]

# Temperature columns
temperature_features = [
    "mean_room_temperature",
    "outside_temperature",
    "temp_diff",
]

# Solar radiation columns
solar_radiation_features = [
    "sun_radiation_east", "sun_radiation_west",
    "sun_radiation_south", "sun_radiation_north",
    "sun_radiation_perpendicular",
    "total_solar_radiation", "dominant_radiation", "adjusted_radiation",
]

# Wind vector components
wind_features = ["wind_x", "wind_y"]

# Aggregate consumption
consumption_features = ["total_consumption"]

# Always build the engineered frame from a fresh copy of the raw data, so that
# re-running this cell never feeds already-transformed columns back into
# edit_data (which mutates its argument in place).
trainData = edit_data(ogTrainData.copy())

In [None]:
# Model inputs: every column named in `features`; model output: the `target` column.
trainingDataX = trainData.loc[:, features]
trainingDataY = trainData.loc[:, "target"]

# Quick sanity check that rows line up between inputs and target.
print("X train shape:", trainingDataX.shape)
print("Y train shape:", trainingDataY.shape)

X train shape: (7047, 26)
Y train shape: (7047,)


: 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, train_test_split

# Chronological hold-out: with shuffle=False the final 30% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    trainingDataX, trainingDataY, test_size=0.3, random_state=42, shuffle=False
)

# Seeded so that feature rankings and the reported MAE are reproducible.
model = RandomForestRegressor(random_state=42)

tscv = TimeSeriesSplit(n_splits=5)

# Recursive feature elimination, cross-validated with TimeSeriesSplit.
# Fit on the training portion ONLY: selecting features on the full dataset
# would let the held-out rows influence the selection and make the test MAE
# below optimistically biased.
rfecv = RFECV(estimator=model, step=1, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)
rfecv.fit(X_train, y_train)

selected_features = X_train.columns[rfecv.support_]

print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", selected_features.tolist())

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Refit a fresh forest on the selected columns and score it on the untouched hold-out.
final_model = RandomForestRegressor(random_state=42)
final_model.fit(X_train_selected, y_train)

y_pred = final_model.predict(X_test_selected)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error on Test Set:", mae)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# NOTE(review): `X_train_base`, `Tmodel` and `X_train_engineered` are not defined
# anywhere in the visible part of this notebook — presumably they come from a
# cell that was removed or lives elsewhere; confirm this cell survives
# Restart & Run All. Both predictions are scored against `y_train`, i.e. these
# are in-sample metrics.
y_pred = model.predict(X_train_base)
y_pred_2 = Tmodel.predict(X_train_engineered)

# Same three metrics for each model, in the order MSE, MAE, R^2.
mse, mae, r2 = (
    metric(y_train, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
Tmse, Tmae, Tr2 = (
    metric(y_train, y_pred_2)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report: "Control" is `model` on X_train_base, "Test" is `Tmodel` on X_train_engineered.
print("Control: ")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

print("Test: ")
print(f"Mean Squared Error: {Tmse}")
print(f"Mean Absolute Error: {Tmae}")
print(f"R-squared: {Tr2}")

print("")
# Positive value means the second model has the lower MAE.
print(f"Total Improvement: {mae - Tmae}")

Control: 
Mean Squared Error: 2.0080986970638794
Mean Absolute Error: 0.9358442894076047
R-squared: 0.9852680711961668
Test: 
Mean Squared Error: 2.0164810545033665
Mean Absolute Error: 0.9338910774546847
R-squared: 0.9852065760648829

Total Improvement: 0.0019532119529199754
