In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tsfresh.feature_extraction.feature_calculators as tsf

In [156]:
ogTrainData = pd.read_csv('train.csv')
ogTestData = pd.read_csv('test.csv')

trainData = ogTrainData.copy()
ogTestData.isna().sum()

ID                              0
measurement_time                0
source_1_temperature            0
source_2_temperature            0
source_3_temperature            0
source_4_temperature            0
mean_room_temperature           0
sun_radiation_east              0
sun_radiation_west              0
sun_radiation_south             0
sun_radiation_north             0
sun_radiation_perpendicular     0
outside_temperature            13
wind_speed                     13
wind_direction                 13
clouds                         13
dtype: int64

In [157]:
def edit_data(df):
    df.rename(columns={
    "source_1_temperature" : "heater-1",
    "source_2_temperature" : "hvac-1",
    "source_3_temperature" : "hvac-2",
    "source_4_temperature" : "heater-2",
    }, inplace=True)
    
    df["measurement_time"] = pd.to_datetime(df.measurement_time)
    df.insert(1, "time", df.measurement_time.dt.hour.astype(float))
    df["day_of_week"] = df.measurement_time.dt.dayofweek
    df["is_weekend"] = df.day_of_week >= 5
    df["month"] = df.measurement_time.dt.month
    
    df["total_consumption"] = df[["heater-1", "heater-2", "hvac-1", "hvac-2"]].sum(axis=1)
    df["total_hvac_consumption"] = df[["hvac-1", "hvac-2"]].sum(axis=1)
    df["total_heater_consumption"] = df[["heater-1", "heater-2"]].sum(axis=1)
    
    df["temp_diff"] = df["outside_temperature"] - df["mean_room_temperature"]
    
    df["total_hvac_consump"] = df[["hvac-1", "hvac-2"]].sum(axis=1)
    
    radiation_cols = ["sun_radiation_east", "sun_radiation_west", "sun_radiation_south", "sun_radiation_north"]
    
    df["dominant_radiation"] = df[radiation_cols].max(axis=1)
    df["total_solar_radiation"] = df[radiation_cols].sum(axis=1)
    df["adjusted_radiation"] = df["total_solar_radiation"] * (1 - df["clouds"])
    
    df["wind_direction_radians"] = np.radians(df["wind_direction"])
    df["wind_y"] = df["wind_speed"] * np.sin(df["wind_direction_radians"])
    df["wind_x"] = df["wind_speed"] * np.cos(df["wind_direction_radians"])
    
    df["hvac_temp_interaction"] = df["total_hvac_consumption"] * df["mean_room_temperature"]
    df["heater_temp_interaction"] = df["total_heater_consumption"] * df["mean_room_temperature"]
    df["wind_temp_interaction"] = df["wind_speed"] * df["mean_room_temperature"]
    df["solar_temp_interaction"] = df["total_solar_radiation"] * df["mean_room_temperature"]
    
    for feature in lag_features:
        df[f"{feature}_lag_1"] = df[feature].shift(1)

    
    return df



allFeatures = [
'time',
'heater-1',
'hvac-1',
'hvac-2',
'heater-2',
'mean_room_temperature',
'sun_radiation_east',
'sun_radiation_west',
'sun_radiation_south',
'sun_radiation_north',
'sun_radiation_perpendicular',
'outside_temperature',
'wind_speed',
'wind_direction',
'day_of_week',
'is_weekend',
'month',
'total_consumption',
'temp_diff',
'total_hvac_consump',
'dominant_radiation',
'total_solar_radiation',
'adjusted_radiation',
'wind_direction_radians',
'wind_y',
'wind_x',
]

test_features = [
    'temp_diff_lag_1', 'total_consumption_lag_1', 'heater-1_lag_1', 'hvac-1_lag_1', 'heater-2_lag_1', 'hvac-2_lag_1'
]



lag_features = ['total_consumption', 'temp_diff', 'heater-1', 'hvac-1', 'heater-2', 'hvac-2', 'mean_room_temperature', 'outside_temperature']

base_features = [
    'heater-1', 'hvac-1', 'heater-2', 'hvac-2',
    'sun_radiation_south', 'sun_radiation_north',
    'mean_room_temperature', 'sun_radiation_east', 'sun_radiation_west',
    'sun_radiation_perpendicular', 'outside_temperature', 'wind_speed',
    'wind_direction', 'clouds' 
    ]


# Time Features
time_features = [
    'time', 'day_of_week', 'is_weekend', 'month' 
]

# HVAC and Heater Features
hvac_heater_features = [
    'heater-1', 'hvac-1', 'hvac-2', 'heater-2', 'total_hvac_consump', "hvac_temp_interaction", "heater_temp_interaction"]


# Temperature Features
temperature_features = [
    'mean_room_temperature', 'outside_temperature', 'temp_diff' 
]

# Solar Radiation Features
solar_radiation_features = [
    'sun_radiation_east', 'sun_radiation_west', 'sun_radiation_south',
    'sun_radiation_north', 'sun_radiation_perpendicular',
    'total_solar_radiation', 'dominant_radiation', 'adjusted_radiation' 
]

# Wind Features
wind_features = [
    'wind_x', 'wind_y' 
]

# Consumption Features
consumption_features = [
    'total_consumption' 
]

feature_sets = {
    "time_features": time_features,
    "hvac_heater_features": hvac_heater_features,
    "temperature_features": temperature_features,
    "solar_radiation_features": solar_radiation_features,
    "wind_features": wind_features,
    "consumption_features": consumption_features,
    "test_features": test_features
}

features = list(set(test_features + base_features + time_features + hvac_heater_features + temperature_features + solar_radiation_features + wind_features + consumption_features))

trainData = edit_data(trainData)

#show all columns

allFeatures = list(set(allFeatures + test_features + base_features + time_features + hvac_heater_features + temperature_features + solar_radiation_features + wind_features + consumption_features))


features.remove('sun_radiation_south')
features.remove('sun_radiation_north')
features.remove('sun_radiation_east')
features.remove('sun_radiation_west')
features.remove('sun_radiation_perpendicular')
features.remove('clouds')
features.remove('wind_direction')
features.remove('day_of_week')
features.remove('month')
features.remove('total_hvac_consump')
features.remove('adjusted_radiation')
features.remove('wind_x')
features.remove("heater_temp_interaction")
features.remove("hvac_temp_interaction")
features.remove("dominant_radiation")
features.remove("is_weekend")

pd.set_option('display.max_columns', None)
print(allFeatures)

trainData[features]

['heater_temp_interaction', 'hvac-2_lag_1', 'adjusted_radiation', 'temp_diff_lag_1', 'clouds', 'wind_x', 'total_consumption_lag_1', 'total_consumption', 'hvac-1', 'day_of_week', 'wind_direction', 'heater-2_lag_1', 'sun_radiation_west', 'sun_radiation_perpendicular', 'sun_radiation_north', 'sun_radiation_east', 'hvac_temp_interaction', 'total_solar_radiation', 'is_weekend', 'sun_radiation_south', 'hvac-2', 'outside_temperature', 'hvac-1_lag_1', 'total_hvac_consump', 'heater-2', 'wind_direction_radians', 'month', 'dominant_radiation', 'heater-1_lag_1', 'heater-1', 'mean_room_temperature', 'time', 'temp_diff', 'wind_y', 'wind_speed']


Unnamed: 0,hvac-2_lag_1,temp_diff_lag_1,total_consumption_lag_1,heater-2_lag_1,hvac-1,total_consumption,total_solar_radiation,outside_temperature,hvac-2,wind_y,heater-2,heater-1_lag_1,heater-1,mean_room_temperature,time,temp_diff,hvac-1_lag_1,wind_speed
0,,,,,18.799999,87.250000,0.000000,8.97,19.750000,1.324142,21.100000,,27.600000,20.129892,0.0,-11.159892,,2.06
1,19.750000,-11.159892,87.250000,21.100000,18.933333,88.199999,0.000000,9.19,19.833333,1.935767,21.033333,27.600000,28.400000,20.052919,1.0,-10.862919,18.799999,2.06
2,19.833333,-10.862919,88.199999,21.033333,19.000000,89.199999,0.000000,9.42,19.799999,1.651964,21.000000,28.400000,29.400000,19.992375,2.0,-10.572375,18.933333,2.57
3,19.799999,-10.572375,89.199999,21.000000,19.033333,93.666667,0.000000,9.19,19.933333,1.285000,24.600000,29.400000,30.100000,19.941565,3.0,-10.751565,19.000000,2.57
4,19.933333,-10.751565,93.666667,24.600000,19.100000,95.666667,0.000000,9.99,20.000000,0.878992,24.700000,30.100000,31.866666,19.924502,4.0,-9.934502,19.033333,2.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7042,16.066667,-0.517991,73.191667,21.500000,16.666667,73.566668,1409.000000,22.39,16.275000,-0.803991,21.475000,19.200000,19.150001,21.276690,10.0,1.113310,16.425000,4.63
7043,16.275000,1.113310,73.566668,21.475000,16.449999,80.824999,1401.677419,22.59,16.100000,-0.982849,21.525000,19.150001,26.750000,21.392902,11.0,1.197098,16.666667,5.66
7044,16.100000,1.197098,80.824999,21.525000,16.100000,90.191667,1268.709677,23.78,16.075000,-2.110264,19.350000,26.750000,38.666667,21.567946,12.0,2.212054,16.449999,6.17
7045,16.075000,2.212054,90.191667,19.350000,16.050000,89.325000,1229.322581,24.10,16.033333,-2.110264,17.366667,38.666667,39.875000,21.517143,13.0,2.582857,16.100000,6.17


In [158]:
trainingDataX = trainData[features]
trainingDataY = trainData["target"]

print("X train shape:", trainingDataX.shape)
print("Y train shape:", trainingDataY.shape)

X train shape: (7047, 18)
Y train shape: (7047,)


In [159]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [160]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def time_transformer(df):
    df["hour_sin"] = np.sin(df["time"] / 24 * 2 * np.pi)
    df["hour_cos"] = np.cos(df["time"] / 24 * 2 * np.pi)
    return df


In [161]:
X_train, X_test, y_train, y_test = train_test_split(
    trainingDataX, trainingDataY, test_size=0.3, random_state=42, shuffle=False
)

In [162]:
import evalml
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(trainingDataX, trainingDataY, problem_type='regression')

knn = KNNImputer(n_neighbors=5)
X_train = knn.fit_transform(X_train)
X_test = knn.transform(X_test)

X_train = pd.DataFrame(X_train, columns=features)
X_test = pd.DataFrame(X_test, columns=features)

X_train = time_transformer(X_train)
X_test = time_transformer(X_test)

from evalml.automl import AutoMLSearch
automl = AutoMLSearch(X_train=X_train, y_train=y_train, problem_type='regression')
automl.search()


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{1: {'Random Forest Regressor w/ Imputer + RF Regressor Select From Model': 1.3895511627197266,
  'Total time of batch': 1.5059988498687744},
 2: {'Extra Trees Regressor w/ Imputer + Select Columns Transformer': 0.40799832344055176,
  'XGBoost Regressor w/ Imputer + Select Columns Transformer': 0.43700122833251953,
  'LightGBM Regressor w/ Imputer + Select Columns Transformer': 0.2239990234375,
  'Elastic Net Regressor w/ Imputer + Standard Scaler + Select Columns Transformer': 0.3569915294647217,
  'Total time of batch': 1.8880314826965332}}

In [163]:
automl.rankings

Unnamed: 0,id,pipeline_name,search_order,ranking_score,mean_cv_score,standard_deviation_cv_score,percent_better_than_baseline,high_variance_cv,parameters
0,3,XGBoost Regressor w/ Imputer + Select Columns ...,3,0.87641,0.87641,0.00722,315839.018366,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,4,LightGBM Regressor w/ Imputer + Select Columns...,4,0.845108,0.845108,0.010059,304561.976776,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,1,Random Forest Regressor w/ Imputer + RF Regres...,1,0.823064,0.823064,0.004843,296620.232238,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,2,Extra Trees Regressor w/ Imputer + Select Colu...,2,0.708704,0.708704,0.008913,255420.653692,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,5,Elastic Net Regressor w/ Imputer + Standard Sc...,5,0.455548,0.455548,0.023802,164217.598563,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,0,Mean Baseline Regression Pipeline,0,-0.000278,-0.000278,0.000239,0.0,False,{'Baseline Regressor': {'strategy': 'mean'}}


In [164]:
pipeline = automl.best_pipeline
predictions = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, predictions)

print(f"Mean Absolute Error on test data: {mae}")

Mean Absolute Error on test data: 2.6817712011655166


In [165]:
finalData = edit_data(ogTestData.copy())

finalData = time_transformer(finalData)

print("Data types in X_train:")
print(X_train.dtypes)

print("\nData types in finalData[features]:")
print(finalData[features].dtypes)

features += ["hour_sin", "hour_cos"]

finalPred = pipeline.predict(finalData[features])

finalPred = pd.DataFrame(finalPred, columns=["target"])
finalPred.index = ogTestData['ID']
finalPred.count = None
finalPred.to_csv("submission.csv")

Data types in X_train:
hvac-2_lag_1               float64
temp_diff_lag_1            float64
total_consumption_lag_1    float64
heater-2_lag_1             float64
hvac-1                     float64
total_consumption          float64
total_solar_radiation      float64
outside_temperature        float64
hvac-2                     float64
wind_y                     float64
heater-2                   float64
heater-1_lag_1             float64
heater-1                   float64
mean_room_temperature      float64
time                       float64
temp_diff                  float64
hvac-1_lag_1               float64
wind_speed                 float64
hour_sin                   float64
hour_cos                   float64
dtype: object

Data types in finalData[features]:
hvac-2_lag_1               float64
temp_diff_lag_1            float64
total_consumption_lag_1    float64
heater-2_lag_1             float64
hvac-1                     float64
total_consumption          float64
total_solar_radia