This file includes the robustness analysis for the randomly selected sensor

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pvlib
from pvlib.irradiance import aoi

In [None]:
# Load the raw per-site measurement files (one CSV per test site).
df_Østbirk, df_Malaga, df_Monpellier = (
    pd.read_csv(csv_path)
    for csv_path in ("df_Østbirk_random.csv", "Malaga_random.csv", "Data_Monpellier.csv")
)

In [None]:
# Quick size check of each raw site frame (rows, columns).
for site_frame in (df_Østbirk, df_Malaga, df_Monpellier):
    print(site_frame.shape)

In [8]:
# The Montpellier file names this sensor without the position suffix; rename it to the
# 'Sash 15 mm below pane - aPV east' label that the `targets` list further down expects.
df_Monpellier = df_Monpellier.rename(
    columns={'Sash 15 mm below pane': 'Sash 15 mm below pane - aPV east'}
)

In [None]:
# Stack the three sites row-wise into one frame with a fresh 0..n-1 index,
# then discard the two columns that are not used in this analysis.
site_frames = [df_Østbirk, df_Malaga, df_Monpellier]
All_Data = pd.concat(site_frames, axis=0, ignore_index=True)
df_APX = All_Data.drop(columns=['Window_Type', 'Blinds'])

print(df_APX.shape)
print(df_APX.columns)

In [15]:
# Sanity check: list the distinct Window_ID values present after concatenation.
# These are the keys that the Window_ID -> site-name mapping must cover.
df_APX['Window_ID'].unique()

array([1, 2, 8])

In [16]:
# Window_ID -> name of the site where that window is installed.
locations = {
    1: 'Østbirk',
    2: 'Malaga',
    8: 'Montpellier',
}


def location_name(window_ID):
    """Return the site name registered for ``window_ID``.

    Unknown IDs are not an error: the function returns ``None`` for them.
    """
    if window_ID in locations:
        return locations[window_ID]
    return None

# Translate every row's Window_ID into its site name (None when the ID is not registered).
df_APX["Location"] = df_APX["Window_ID"].map(location_name)

In [17]:
# Window_ID is now redundant (replaced by Location); parse the timestamps and
# use them as the index so calendar features can be read off the DatetimeIndex.
df_APX = (
    df_APX
    .drop(columns=["Window_ID"])
    .assign(Time=lambda frame: pd.to_datetime(frame['Time']))
    .set_index('Time')
)


In [None]:
# Calendar features read from the DatetimeIndex of df_APX.
df_APX['Month'] =df_APX.index.month # month number of each timestamp
df_APX["day_of_year"] = df_APX.index.dayofyear # ordinal day within the year (not day of month)

In [18]:

# Cyclical (sin/cos) encoding so that December/January and day 365/day 1 end up close together.
month_angle = 2 * np.pi * df_APX["Month"] / 12
df_APX["Month_sin"] = np.sin(month_angle)
df_APX["Month_cos"] = np.cos(month_angle)

# NOTE(review): the period is fixed at 365, so day 366 of a leap year lands slightly
# past one full turn - confirm this approximation is acceptable.
day_of_year_angle = 2 * np.pi * df_APX["day_of_year"] / 365
df_APX["day_of_year_sin"] = np.sin(day_of_year_angle)
df_APX["day_of_year_cos"] = np.cos(day_of_year_angle)

In [19]:
# Minutes elapsed since midnight for every timestamp in the index.
df_APX["minute_of_day"] = df_APX.index.hour * 60 + df_APX.index.minute

# One full turn per day: 1440 min = 24 hours
time_of_day_angle = 2 * np.pi * df_APX["minute_of_day"] / 1440
df_APX["time_sin"] = np.sin(time_of_day_angle)
df_APX["time_cos"] = np.cos(time_of_day_angle)

In [20]:
# Move the timestamps back into a regular 'Time' column so that every row
# carries its own time value when it is handed to get_solar_position.
df_APX = df_APX.reset_index('Time')


def get_solar_position(row):
    """Compute the solar position for a single observation.

    Parameters
    ----------
    row : mapping with the keys 'Time', 'Latitude' and 'Longitude'
        One row of the measurement frame.

    Returns
    -------
    pd.Series
        Two values: pvlib's 'azimuth' and 'apparent_elevation' for that
        time and place, in this order.
    """
    # NOTE(review): row['Time'] looks timezone-naive here; pvlib documents that naive
    # times are treated as UTC - confirm the logger timestamps really are UTC.
    position = pvlib.solarposition.get_solarposition(
        row['Time'], row['Latitude'], row['Longitude']
    )
    azimuth = position['azimuth'].values[0]
    apparent_elevation = position['apparent_elevation'].values[0]
    return pd.Series([azimuth, apparent_elevation])


In [21]:
# Row-wise solar position lookup: get_solar_position is called once per row and its two
# return values are stored as 'solar_azimuth' and 'solar_elevation'.
# This calls pvlib once per row, so it can be slow on large frames.
df_APX[['solar_azimuth', 'solar_elevation']] = df_APX.apply(get_solar_position, axis=1)

In [22]:
# Encode the azimuth angle (degrees) as sin/cos so that 0 and 360 degrees coincide.
solar_azimuth_rad = np.deg2rad(df_APX['solar_azimuth'])
df_APX['solar_azimuth_sin'] = np.sin(solar_azimuth_rad)
df_APX['solar_azimuth_cos'] = np.cos(solar_azimuth_rad)

In [24]:
# Per-site window orientation. Sites missing from a lookup table fall back to the default
# (tilt 23, azimuth 177), which is what Malaga gets.
# NOTE(review): values are presumably degrees, as passed to pvlib's aoi() later - confirm.
tilt_by_site = {'Østbirk': 45}
azimuth_by_site = {'Montpellier': 150, 'Østbirk': 225}

df_APX['window_tilt'] = df_APX['Location'].map(
    lambda site: tilt_by_site.get(site, 23))

df_APX['window_azimuth'] = df_APX['Location'].map(
    lambda site: azimuth_by_site.get(site, 177))


In [25]:
# Zenith angle is the complement of the elevation angle. 'solar_elevation' holds the
# 'apparent_elevation' value returned by get_solar_position.
df_APX['solar_zenith']=  90 - df_APX['solar_elevation']

In [26]:
#calculates the angle of incidence of the sun
df_APX['aoi'] = aoi(
    surface_tilt=df_APX['window_tilt'],
    surface_azimuth=df_APX['window_azimuth'],
    solar_zenith= df_APX['solar_zenith'],
    solar_azimuth=df_APX['solar_azimuth']
)

In [28]:
# Cosine of the angle of incidence; 'aoi' is converted from degrees to radians first.
df_APX['cos_aoi'] = np.cos(np.radians(df_APX['aoi']))

In [29]:
# 'aoi' and 'solar_zenith' were only intermediates for 'cos_aoi'; remove them again.
df_APX=df_APX.drop(columns=['aoi','solar_zenith'])

In [31]:
# Put every site on a regular 30-minute grid. asfreq inserts all-NaN rows for missing
# timestamps, so 'Location' is written back afterwards to label those rows as well.
df_APX = df_APX.set_index('Time')
df_list = [
    site_rows.asfreq('30min').assign(Location=site)
    for site, site_rows in df_APX.groupby("Location")
]

df_adjusted = pd.concat(df_list)
print(df_adjusted.shape)

(29000, 76)


In [32]:
# Bring 'Time' back as a column and move every Østbirk timestamp to the year 2024
# (month, day and time of day are kept as they are).
df_adjusted = df_adjusted.reset_index()
is_ostbirk = df_adjusted["Location"] == 'Østbirk'
df_adjusted.loc[is_ostbirk, "Time"] = (
    df_adjusted.loc[is_ostbirk, "Time"].apply(lambda stamp: stamp.replace(year=2024))
)

In [None]:
# Checkpoint: keep an in-memory copy and write the adjusted frame to disk (without the
# row index) so later cells can restart from 'df_adjusted3.csv'.
df_adjusted3=df_adjusted.copy()
df_adjusted.to_csv('df_adjusted3.csv',index=False)

In [4]:
# Restart point: reload the checkpoint and restore 'Time' to datetimes
# (CSV stores the timestamps as plain text).
df_adjusted = pd.read_csv('df_adjusted3.csv').assign(
    Time=lambda frame: pd.to_datetime(frame['Time'])
)

In [None]:
# Names of the sensor columns; each becomes its own time series in the long-form dataset.
targets=['Cladding uPV east corner',
       'Sash 15 mm below pane - bottom center', 'Gip uPV east',
       'Outer pane spacer - top center', 'Gip uPV west corner',
       'Outer frame lower - top center', 'Sash behind pane - aPV east',
       'Inner frame uPV east', 'Inner pane spacer - bottom center',
       'Sash gasket level - bottom center', 
       'Gip aPV west', 'Outer frame upper - uPV west', 'Gip uPV east corner',
       'Sash 15 mm below pane - uPV east', 'Clading uPV west',
       'Inner pane spacer - uPV east', 'Outer frame upper - uPV east',
       'Sash gasket level - uPV east', 'Outer frame lower - uPV east',
       'Gip Bottom Center', 'Gip Top Center', 'Clading aPV east',
       'Sash 15 mm below pane - aPV east', 'Sash gasket level - aPV east',
       'Outer frame upper - aPV west', 'Top frame aPV west',
       'Top cladding - top center', 'Blind C room']
dfs = []

# Reshape from wide (one column per sensor) to long form: for every location and every
# sensor, keep the shared columns plus that sensor's readings as 'sensor_temp', and tag
# the rows with 'sensor_ID' and a "<location>, <sensor>" 'timeserie_ID'.
#
# Group by the scalar key "Location" rather than the one-element list ["Location"]:
# with a list, pandas >= 2.0 yields 1-tuples as group keys while older versions yield the
# plain value, so indexing the key with [0] returned only the first character of the
# site name on older versions. A scalar key gives the site name on every version.
for location, group in df_adjusted.groupby("Location", dropna=False):
    # Row count of this location (identical for every sensor, so printed once).
    print(len(group))
    for target in targets:
        other_targets = [t for t in targets if t != target]
        # drop() and rename() both return new frames, so `group` itself is never modified.
        df_target = group.drop(columns=other_targets).rename(columns={target: "sensor_temp"})
        df_target["sensor_ID"] = target
        df_target["timeserie_ID"] =f"{location}, {target}"
        dfs.append(df_target)

# Combine everything into a single dataframe
df_global_long_form3 = pd.concat(dfs, ignore_index=True)

In [45]:
# Keep only rows that actually have a sensor reading (removes the NaN 'sensor_temp' rows,
# e.g. for sensors that a site does not have or timestamps without a measurement).
df_global_long_form3 = df_global_long_form3.dropna(subset='sensor_temp')

In [48]:
# Checkpoint the long-form dataset. index=True writes the row index as the first CSV column.
df_global_long_form3.to_csv("global_dataset3.csv",index=True)

In [50]:
# Reload the long-form checkpoint (first CSV column is the saved row index),
# parse the timestamps and index the frame by time.
df_global_long_form = pd.read_csv("global_dataset3.csv", index_col=0)
df_global_long_form['Time'] = pd.to_datetime(df_global_long_form["Time"])
df_global_long_form = df_global_long_form.set_index('Time')

# Exclude Montpellier rows whose timestamp is later than 2024-12-17 00:00;
# rows from the other sites are kept regardless of date.
is_late_montpellier = (
    (df_global_long_form['Location'] == 'Montpellier')
    & (df_global_long_form.index > '2024-12-17')
)
df_filtered_timeserie3 = df_global_long_form[~is_late_montpellier]

df_filtered_timeserie3.to_csv("df_filtered_timeserie3.csv", index=True)

In [5]:
# Restart point: reload the filtered long-form data. The first CSV column becomes the
# index and is converted to datetimes so it can be used as a DatetimeIndex.
df_filtered_timeserie= pd.read_csv("df_filtered_timeserie3.csv",index_col=0)
df_filtered_timeserie.index= pd.to_datetime(df_filtered_timeserie.index)


In [7]:
 def split_data_by_half_months(df):
     """Split a time-indexed frame into expanding folds, a full training set and a test set.

     Rows are selected purely by the calendar month and day of ``df.index``
     (a DatetimeIndex); the year is not taken into account.

     Returns
     -------
     tuple ``(folds, train_all, test)``
         folds     : list of three ``(train, valid)`` pairs. Fold k trains on May up to
                     and including day 15 of the validation month and validates on
                     days 16+ of that month (August, September, October).
         train_all : May through November 15.
         test      : November 16 onwards plus all of December and January.

     Rows from February, March and April end up in none of the returned frames.
     """
     month = df.index.month
     day = df.index.day

     def first_half(month_number):
         # Days 1-15 of the given month.
         return (month == month_number) & (day <= 15)

     def second_half(month_number):
         # Days 16 to month end of the given month.
         return (month == month_number) & (day > 15)

     def whole_months_from_may(until_month):
         # Complete months May .. until_month - 1.
         return month.isin(list(range(5, until_month)))

     folds = []
     for valid_month in (8, 9, 10):
         train_part = df[whole_months_from_may(valid_month) | first_half(valid_month)]
         valid_part = df[second_half(valid_month)]
         folds.append((train_part, valid_part))

     train_all = df[whole_months_from_may(11) | first_half(11)]
     test = df[month.isin([12, 1]) | second_half(11)]

     return folds, train_all, test

In [8]:
#ensures uniform time frequency
resampled_groups = []
for ts_id, group in df_filtered_timeserie.groupby('timeserie_ID'):

    group_asfreq = group.asfreq("30min")

    group_asfreq['timeserie_ID'] = ts_id

    resampled_groups.append(group_asfreq)

df_resampled = pd.concat(resampled_groups)
df_filtered_timeserie=df_resampled.copy()

In [9]:
# Work on a copy so that df_filtered_timeserie stays untouched by the feature engineering below.
df= df_filtered_timeserie.copy()

In [10]:
# Window area per site: Montpellier uses 118*78, every other row (including rows whose
# Location is missing) uses 140*78.
# NOTE(review): presumably width x height in cm - confirm the unit.
montpellier_area = 118 * 78
default_area = 140 * 78
df['window_area'] = df['Location'].map(
    lambda site: montpellier_area if site == 'Montpellier' else default_area)

In [11]:
def create_features_exogenes(dataframe, eksogene_variables, lag_sizes):
    """Append lagged copies of the exogenous columns and drop incomplete rows.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; it is not modified.
    eksogene_variables : iterable of column names
        Columns of ``dataframe`` to lag.
    lag_sizes : iterable of int
        Number of rows to shift by (passed to ``Series.shift``).

    Returns
    -------
    pd.DataFrame
        A new frame with one extra column per (variable, lag) pair, ordered variable by
        variable, with every row that contains a NaN in ANY column removed (not only
        the rows made incomplete by the shift).
    """
    # NOTE(review): the f-string formats the tuple (lag, col), so the columns are named
    # like "lag_(1, 'Temp Out')". Kept as is because downstream data uses these names.
    lagged = {
        f'lag_{lag,col}': dataframe[col].shift(lag)
        for col in eksogene_variables
        for lag in lag_sizes
    }
    return dataframe.assign(**lagged).dropna()

In [None]:
# Columns that are removed before modelling.
unused_columns = [
    'Location', 'Month', 'ET', 'day_of_year', 'minute_of_day', 'THW Index', 'In Dew',
    'UV Dose', 'Wind Run', 'Bar', 'Heat D-D', 'Cool D-D', 'In Hum',
    'Wind Dir', 'Hi Dir', 'window_azimuth', 'window_tilt', 'Rain', 'Rain Rate',
    'Hi Temp', 'Low Temp', 'Hi Solar Rad.',
    'Hi Speed', 'solar_azimuth', 'Wind Chill', 'Hi UV', 'Heat Index', 'In Heat',
    'UV Index', 'THSW Index',
]
df_global_long_form_subset = df.drop(columns=unused_columns)

# Exogenous variables for which lagged copies are added.
numerical = [
    'Temp Out', 'Solar Rad.',
    'In Temp', 'cos_aoi', 'Out Hum',
    'Wind Speed', 'Dew Pt.',
]

# Lags are built separately inside each time series so that a shift never pulls values
# across the boundary between two different series.
df_list = []
for ts, group in df_global_long_form_subset.groupby(["timeserie_ID"], dropna=False):
    print(group.shape)
    lagged_group = create_features_exogenes(
        dataframe=group,
        eksogene_variables=numerical,
        lag_sizes=[1, 2, 3, 4, 5],
    )
    df_list.append(lagged_group)

df_window = pd.concat(df_list)

print(df_window.shape)

In [20]:
# Hyperparameters passed to XGBRegressor for the final model.
# NOTE(review): presumably the result of an earlier tuning run that is not part of this
# notebook - confirm where these values come from.
best_params = dict(
    n_estimators=1251,
    learning_rate=0.02112537357198592,
    max_depth=8,
    min_child_weight=4,
    subsample=0.6328513694770547,
    colsample_bytree=0.6953877485019768,
    reg_alpha=5.413621183560153,
    reg_lambda=0.5884116397695999,
    gamma=3.3650674471559303,
)


In [None]:

# Final model: fit on the complete training period and evaluate on the held-out test
# period, overall and separately for every individual time series.
expanding_splits, train_all, test = split_data_by_half_months(df_window)

final_model = XGBRegressor(**best_params, random_state=42, enable_categorical=True)
final_train_set = train_all

target = ['sensor_temp']
# Columns excluded from the feature matrix: the label and the series identifier.
# NOTE(review): this rebinds `targets`, which an earlier cell uses for the list of
# sensor column names - re-running that earlier cell after this one needs care.
targets = ['sensor_temp', 'timeserie_ID']

X_train_final, y_train_final = final_train_set.drop(columns=targets), final_train_set[target]
X_test, y_test = test.drop(columns=targets), test[target]

# Encode sensor_ID with ONE categorical dtype shared by train and test. Converting each
# frame separately with astype('category') derives the integer codes from whatever
# sensors happen to be present in that frame, so the same sensor could receive different
# codes in train and test. The categories are the sorted training sensors, which is the
# same coding astype('category') produced for the training frame; a sensor that only
# occurs in the test set becomes a missing value.
sensor_dtype = pd.CategoricalDtype(
    categories=sorted(X_train_final['sensor_ID'].dropna().unique())
)
X_train_final['sensor_ID'] = X_train_final['sensor_ID'].astype(sensor_dtype)
X_test['sensor_ID'] = X_test['sensor_ID'].astype(sensor_dtype)


def _regression_metrics(y_true, y_pred):
    """Return (mse, rmse, mae) for the given true and predicted values."""
    mse_value = mean_squared_error(y_true, y_pred)
    return mse_value, np.sqrt(mse_value), mean_absolute_error(y_true, y_pred)


final_model.fit(X_train_final, y_train_final)

# In-sample error, for comparison against the test error below.
y_train_pred = final_model.predict(X_train_final)
mse, rmse, mae = _regression_metrics(y_train_final, y_train_pred)

print("Train mse:",mse)
print("Train rmse:",rmse)
print("Train mae:", mae)

# Error on the held-out test period, pooled over all time series.
y_test_pred = final_model.predict(X_test)
mse, rmse, mae = _regression_metrics(y_test, y_test_pred)

print("Test mse:",mse)
print("Test rmse:",rmse)
print("Test mae:", mae)

# Per-series metrics and one true-vs-predicted plot per time series.
results_df = []
for ts in test["timeserie_ID"].unique():
    # Positional boolean mask: the time index repeats across series, and the prediction
    # array has no index at all, so rows are selected by position rather than by label.
    mask = (test["timeserie_ID"] == ts).to_numpy()
    y_true = y_test[mask]
    y_pred = y_test_pred[mask]

    mse, rmse, mae = _regression_metrics(y_true, y_pred)
    results_df.append({"Timeserie": ts, "MSE": mse, "RMSE": rmse, "MAE": mae})

    fig, ax = plt.subplots(figsize=(20, 5))
    ax.plot(y_true.index, y_true, label="True values", linestyle='-')
    ax.plot(y_true.index, y_pred, label="Preicted Values", linestyle='-', alpha=0.7)

    ax.set_xlabel("Time")
    ax.set_ylabel(f"Temperature")
    ax.set_title(f"True value vs. predicted for {ts}")
    ax.legend()
    ax.grid(True)
    # Show and release each figure right away; otherwise one figure per time series
    # stays open until the end of the cell.
    plt.show()
    plt.close(fig)

results_df = pd.DataFrame(results_df)
print(results_df)
