# Models with Splitting by Sensor

## Pre-Processing

In [1]:
# Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import pickle
import math
import os

from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMinMax

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, pairwise_distances_argmin_min

from itertools import chain, combinations
from joblib import Parallel, delayed
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')

In [2]:
def calculate_distortion(X_normalized, cluster_centers):
    """Return the distortion (within-cluster sum of squares) of a clustering.

    The distortion is the sum, over all samples, of the squared Euclidean
    distance between the sample and its nearest cluster centre.

    Parameters
    ----------
    X_normalized : array-like, shape (n_samples, ...)
        Samples; every trailing dimension is flattened into one feature axis.
    cluster_centers : array-like, shape (n_clusters, ...)
        Cluster centres; flattened the same way as the samples.

    Returns
    -------
    float
        Sum of squared nearest-centre distances.
    """
    samples = np.asarray(X_normalized, dtype=float)
    samples = samples.reshape((samples.shape[0], -1))
    centers = np.asarray(cluster_centers, dtype=float)
    centers = centers.reshape((centers.shape[0], -1))

    # (n_samples, n_clusters) matrix of squared Euclidean distances.
    squared = ((samples[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)

    # Each sample contributes the squared distance to its closest centre;
    # the square is applied per sample, before summing.
    return float(squared.min(axis=1).sum())


def calculate_aic(y_true, y_pred, num_features):
    """Akaike information criterion computed from the residual sum of squares.

    Evaluates ``n * log(SSE / n) + 2 * num_features`` where ``n`` is
    ``len(y_true)`` and ``SSE`` is the sum of squared residuals.
    """
    n_obs = len(y_true)
    residuals = y_true - y_pred
    sum_sq_err = np.sum(residuals ** 2)
    return n_obs * np.log(sum_sq_err / n_obs) + 2 * num_features


def calculate_bic(y_true, y_pred, num_features):
    """Bayesian information criterion computed from the residual sum of squares.

    Evaluates ``n * log(SSE / n) + num_features * log(n)`` where ``n`` is
    ``len(y_true)`` and ``SSE`` is the sum of squared residuals.
    """
    n_obs = len(y_true)
    sum_sq_err = np.sum((y_true - y_pred) ** 2)
    goodness_of_fit = n_obs * np.log(sum_sq_err / n_obs)
    return goodness_of_fit + num_features * np.log(n_obs)


def forward_step(X_train, y_train, selected_features, remaining_features, criterion_func):
    """Find the candidate feature whose addition yields the lowest criterion.

    For every feature in ``remaining_features`` an OLS model (with an added
    constant column) is fitted on ``selected_features`` plus that feature, and
    scored with ``criterion_func(y_train, fitted_values, n_params)``.

    Returns
    -------
    tuple
        ``(best_criterion, best_feature)``; ``(np.inf, None)`` when no
        candidate scores below infinity.
    """
    best_criterion, best_feature = np.inf, None

    for candidate in remaining_features:
        trial_features = selected_features + [candidate]
        design = sm.add_constant(X_train[trial_features])
        fitted = sm.OLS(y_train, design).fit()
        score = criterion_func(y_train, fitted.predict(design), len(fitted.params))

        if score < best_criterion:
            best_criterion, best_feature = score, candidate

    return best_criterion, best_feature

def backward_step(X_train, y_train, selected_features, criterion_func):
    """Find the selected feature whose removal yields the lowest criterion.

    For every feature in ``selected_features`` an OLS model (with an added
    constant column) is fitted on the remaining selected features and scored
    with ``criterion_func(y_train, fitted_values, n_params)``.

    Returns
    -------
    tuple
        ``(best_criterion, best_feature)``; ``(np.inf, None)`` when
        ``selected_features`` is empty.
    """
    best_criterion, best_feature = np.inf, None

    for candidate in selected_features:
        # Work on a copy so the caller's list is left untouched.
        trial_features = list(selected_features)
        trial_features.remove(candidate)

        design = sm.add_constant(X_train[trial_features])
        fitted = sm.OLS(y_train, design).fit()
        score = criterion_func(y_train, fitted.predict(design), len(fitted.params))

        if score < best_criterion:
            best_criterion, best_feature = score, candidate

    return best_criterion, best_feature

def stepwise_bidirectional_selection(X_train, y_train, method='aic'):
    """Select OLS predictors by bidirectional stepwise search.

    At every iteration the best single addition (``forward_step``) and the
    best single removal (``backward_step``) are scored. Whichever move has the
    lower criterion is applied, provided it also improves on the best
    criterion seen so far; otherwise the search stops.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate predictors, one column per feature.
    y_train : array-like
        Response passed to the OLS fits.
    method : {'aic', 'bic'}
        Information criterion to minimise.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'aic'`` nor ``'bic'``.
    """
    criteria = {'aic': calculate_aic, 'bic': calculate_bic}
    if method not in criteria:
        raise ValueError("Invalid method. Use 'aic' or 'bic'.")
    criterion_func = criteria[method]

    features = list(X_train.columns)
    selected_features = []
    best_criterion = np.inf

    while len(features) > 0:
        # Only features that are not already in the model are candidates for
        # addition; re-offering a selected feature would fit a design matrix
        # with a duplicated column.
        remaining = [name for name in features if name not in selected_features]

        forward_criterion, forward_feature = forward_step(
            X_train, y_train, selected_features, remaining, criterion_func)
        backward_criterion, backward_feature = backward_step(
            X_train, y_train, selected_features, criterion_func)

        if forward_criterion < backward_criterion and forward_criterion < best_criterion:
            selected_features.append(forward_feature)
            best_criterion = forward_criterion
        elif backward_criterion < forward_criterion and backward_criterion < best_criterion:
            selected_features.remove(backward_feature)
            best_criterion = backward_criterion
        else:
            # Neither move improves the criterion (or they tie): stop.
            break

    return selected_features


def r_squared(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    ``SS_res`` is the sum of squared prediction errors and ``SS_tot`` the sum
    of squared deviations of ``y_true`` from its own mean.
    """
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ss_res / ss_tot)

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Observations whose true value is exactly zero are excluded, because the
    percentage error is undefined for them.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values of equal length (lists, NumPy arrays or
        pandas objects); values are paired by position.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the non-zero
        targets, or ``nan`` when every target is zero.
    """
    # Convert before comparing: `list != 0` is a single bool, not a mask.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    absolute_percentage_errors = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
    return np.mean(absolute_percentage_errors) * 100

def mae(y_true, y_pred):
    """Mean absolute error, delegated to sklearn's ``mean_absolute_error``."""
    error = mean_absolute_error(y_true, y_pred)
    return error

def rmse(y_true, y_pred):
    """Root mean squared error: square root of sklearn's ``mean_squared_error``."""
    mean_sq_err = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)


def evaluate_subset(subset, data, sensor):
    """Score one training subset of sensors against a held-out sensor.

    An OLS model is fitted on the rows of ``data`` whose ``sensor_id`` is in
    ``subset`` (predictors chosen by AIC stepwise selection) and evaluated on
    the rows of ``sensor``.

    Parameters
    ----------
    subset : iterable of str
        Sensor ids used for training.
    data : pandas.DataFrame
        Frame with a ``sensor_id`` column, the ``avg_tens`` target and the
        predictor columns.
    sensor : str
        Id of the held-out sensor.

    Returns
    -------
    tuple
        ``(subset, rmse)`` with the hold-out RMSE rounded to three decimals.
    """
    df_train = data[data['sensor_id'].isin(subset)]
    df_test = data[data['sensor_id'] == sensor]

    non_features = ['sensor_id', 'avg_tens']
    X_train, y_train = df_train.drop(non_features, axis=1), df_train['avg_tens']
    X_test, y_test = df_test.drop(non_features, axis=1), df_test['avg_tens']

    selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

    # has_constant='add' always inserts the intercept column. With the default
    # ('skip') a feature that happens to be constant in one of the two frames
    # suppresses it there, leaving train and test designs with different
    # columns and making predict() fail.
    design_train = sm.add_constant(X_train[selected_features], has_constant='add')
    design_test = sm.add_constant(X_test[selected_features], has_constant='add')

    model = sm.OLS(y_train, design_train).fit()
    pred_sensor = model.predict(design_test)

    rmse_subset = round(rmse(y_test, pred_sensor), 3)

    return subset, rmse_subset


def optimize_ncomp_pls(X, y, n_comp_range, ylabel, objective):
    """Choose the number of PLS components by 10-fold cross-validation.

    For every component count from 1 to ``n_comp_range`` a ``PLSRegression``
    is cross-validated and the RMSE of its out-of-fold predictions is
    recorded. The error curve is plotted with the chosen point highlighted.

    Parameters
    ----------
    X, y : array-like
        Predictors and response passed to ``cross_val_predict``.
    n_comp_range : int
        Largest number of components to try.
    ylabel : str
        Label of the plot's y axis.
    objective : str
        ``'min'`` picks the lowest error; any other value picks the highest.

    Returns
    -------
    int
        The selected number of components.
    """
    xticks = np.arange(1, n_comp_range + 1)

    errors = []
    for n_comp in xticks:
        pls = PLSRegression(n_components=n_comp)
        y_cv = cross_val_predict(pls, X, y, cv=10)
        errors.append(rmse(y, y_cv))
    errors = np.array(errors)

    # Decide once which point is "best"; used for both the marker and the result.
    best_idx = np.argmin(errors) if objective == 'min' else np.argmax(errors)

    with plt.style.context('ggplot'):
        plt.plot(xticks, errors, 'o-', color='blue', mfc='blue')
        plt.plot(xticks[best_idx], errors[best_idx], 'P', ms=10, mfc='red')

        plt.xlabel('Number of PLS components')
        # Call plt.xticks; assigning to it would replace the pyplot function
        # for the rest of the session.
        plt.xticks(xticks)
        plt.ylabel(ylabel)
        plt.title('PLS')
        plt.grid(True, linestyle='dashed', color='gray')
        plt.gca().set_facecolor('white')

    plt.show()

    return xticks[best_idx]

In [6]:
import sys
import os
import warnings

# Make the local `ml_models` directory importable and silence library warnings.
sys.path.append(os.path.abspath('ml_models'))
warnings.filterwarnings('ignore')

# Raw export: list each distinct (group, sensor, description) combination once,
# ordered by group, and save the listing next to the notebook.
df = pd.read_csv('../data/row_data_rovere.csv')
unique_pairs = (
    df[['group', 'sensor_id', 'description']]
    .drop_duplicates()
    .sort_values(by='group')
    .reset_index(drop=True)
)
unique_pairs.to_csv('unique_pairs.csv', index=False)
unique_pairs

In [4]:
# Load the sensor readings and rename `group` to `group_id`.
df = pd.read_csv('data_sensors_rovere.csv')
df = df.rename(columns={'group': 'group_id'})

# Take an explicit copy: the assignments below must modify an independent
# frame, not a slice of `df`.
df_rovere = df[['reading_id', 'timestamp', 'sensor_id', 'value', 'description', 'group_id']].copy()

# Normalise dtypes; timestamps are truncated to calendar dates.
df_rovere['reading_id'] = df_rovere['reading_id'].astype(str)
df_rovere['timestamp'] = pd.to_datetime(df_rovere['timestamp']).dt.floor('D').dt.date
df_rovere['sensor_id'] = df_rovere['sensor_id'].astype(str)
df_rovere['value'] = df_rovere['value'].astype(float)
df_rovere['description'] = df_rovere['description'].astype(str)
df_rovere['group_id'] = df_rovere['group_id'].astype(str)

# Relabel descriptions: the two tensiometer sensor sets get a common label
# each, and lower-case 'irrigation' is capitalised.
condition_30 = df_rovere['sensor_id'].isin(['72', '76', '73', '74', '61', '63', '67', '65'])
condition_60 = df_rovere['sensor_id'].isin(['71', '69', '75', '70', '62', '64', '68', '66'])
condition_irrigation = df_rovere['description'] == 'irrigation'

df_rovere.loc[condition_30, 'description'] = 'Tensiometer 30'
df_rovere.loc[condition_60, 'description'] = 'Tensiometer 60'
df_rovere.loc[condition_irrigation, 'description'] = 'Irrigation'

print('Shape:', df_rovere.shape)
print('Types:\n', df_rovere.dtypes)
df_rovere.head(10)

In [5]:
# Readings that do not come from a tensiometer are duplicated so that every
# group exists twice: once under its own id and once under '<id>_1'.
condition_not_in_list = ~df_rovere['sensor_id'].isin(['72', '76', '73', '74', '61', '63', '67', '65', '71', '69', '75', '70', '62', '64', '68', '66'])
# Explicit copy: a new column is added below, which must not be written to a
# slice of `df_rovere`.
df_to_duplicate = df_rovere[condition_not_in_list].copy()
df_to_duplicate['group_id_1'] = df_to_duplicate['group_id'] + '_1'

df_rovere = pd.concat([df_rovere, df_to_duplicate], ignore_index=True)
df_rovere.sort_values(by=['group_id', 'timestamp'], inplace=True)
df_rovere.reset_index(drop=True, inplace=True)

# Only the duplicated rows carry a non-null `group_id_1`; move it into
# `group_id` and discard the helper column.
condition_group_id_1 = df_rovere['group_id_1'].notnull()
df_rovere.loc[condition_group_id_1, 'group_id'] = df_rovere.loc[condition_group_id_1, 'group_id_1']
df_rovere.drop(columns=['group_id_1'], inplace=True)

# The second tensiometer set joins the '<id>_1' groups as well.
condition_update_group_id = df_rovere['sensor_id'].isin(['71', '69', '75', '70', '62', '64', '68', '66'])
df_rovere.loc[condition_update_group_id, 'group_id'] = df_rovere.loc[condition_update_group_id, 'group_id'] + '_1'


# Daily statistics (min, max, mean, median, sum) of `value` for every
# (timestamp, description, sensor_id, group_id) combination.
df_group = df_rovere.groupby(['timestamp', 'description', 'sensor_id', 'group_id']).agg({'value': ['min', 'max', 'mean', 'median', 'sum']}).reset_index()
df_group.columns = ['timestamp', 'description', 'sensor_id', 'group_id', 'val_min', 'val_max', 'val_avg', 'val_med', 'val_sum']

# One row per (timestamp, group_id); one column per (statistic, description).
df_pivot = df_group.pivot(index=['timestamp', 'group_id'], columns='description', values=['val_min', 'val_max', 'val_avg', 'val_med', 'val_sum']).reset_index()
df_pivot.columns.name = None
# NOTE(review): the flat names below are assigned purely by position. They
# assume the pivot yields, within each statistic, the description columns in
# the order humidity, temperature, solar, wind, irrigation, rain,
# tensiometer 30, tensiometer 60 -- confirm against the actual labels.
df_pivot.columns = ['date', 'group_id', 'min_hum', 'min_temp', 'min_solar', 'min_wind', 'min_irr', 'min_rain', 'min_tens30', 'min_tens60',
                    'max_hum', 'max_temp', 'max_solar', 'max_wind', 'max_irr', 'max_rain', 'max_tens30', 'max_tens60',
                    'avg_hum', 'avg_temp', 'avg_solar', 'avg_wind', 'avg_irr', 'avg_rain', 'avg_tens30', 'avg_tens60',
                    'med_hum', 'med_temp', 'med_solar', 'med_wind', 'med_irr', 'med_rain', 'med_tens30', 'med_tens60',
                    'sum_hum', 'sum_temp', 'sum_solar', 'sum_wind', 'sum_irr', 'sum_rain', 'sum_tens30', 'sum_tens60']

# For each statistic, merge the two tensiometer columns into one: take the
# tens30 value and fall back to tens60 where tens30 is missing.
df_pivot['min_tens'] = df_pivot['min_tens30'].combine_first(df_pivot['min_tens60'])
df_pivot.drop(columns=['min_tens30', 'min_tens60'], inplace=True)


df_pivot['max_tens'] = df_pivot['max_tens30'].combine_first(df_pivot['max_tens60'])
df_pivot.drop(columns=['max_tens30', 'max_tens60'], inplace=True)


df_pivot['avg_tens'] = df_pivot['avg_tens30'].combine_first(df_pivot['avg_tens60'])
df_pivot.drop(columns=['avg_tens30', 'avg_tens60'], inplace=True)


df_pivot['med_tens'] = df_pivot['med_tens30'].combine_first(df_pivot['med_tens60'])
df_pivot.drop(columns=['med_tens30', 'med_tens60'], inplace=True)


df_pivot['sum_tens'] = df_pivot['sum_tens30'].combine_first(df_pivot['sum_tens60'])
df_pivot.drop(columns=['sum_tens30', 'sum_tens60'], inplace=True)

df_pivot = df_pivot.reset_index(drop=True)

# From here on `df` names the pivoted frame (it previously held the raw CSV).
df = df_pivot

# Statistics that are not used as predictors downstream.
columns_to_drop = ['min_irr', 'max_irr', 'avg_irr', 'med_irr', 'min_rain', 'avg_rain', 'sum_hum', 'sum_temp', 'sum_solar', 'min_wind', 'max_wind', 'avg_wind', 'sum_wind', 'med_wind']
df = df.drop(columns=columns_to_drop).reset_index(drop=True)


# Each group id stands for exactly one tensiometer: plain ids map to the first
# sensor set, '<id>_1' ids to the second.
group_id_mapping = {
    '1': '72', '2': '76', '3': '73', '4': '74',
    '5': '61', '6': '63', '7': '67', '8': '65',
    '1_1': '71', '2_1': '69', '3_1': '75', '4_1': '70',
    '5_1': '62', '6_1': '64', '7_1': '68', '8_1': '66',
}

# Replace group ids by sensor ids, move the key columns to the front and
# order the rows by sensor and date.
df = (
    df.assign(group_id=df['group_id'].replace(group_id_mapping))
    .rename(columns={'group_id': 'sensor_id'})
    .pipe(lambda frame: frame[
        ['sensor_id', 'date', 'avg_tens']
        + [name for name in frame.columns if name not in ['sensor_id', 'date', 'avg_tens']]
    ])
    .sort_values(by=['sensor_id', 'date'])
    .reset_index(drop=True)
)

# Discard the readings of 2023-10-15.
is_excluded_day = df['date'].isin([datetime.date(2023, 10, 15)])
df = df[~is_excluded_day].reset_index(drop=True)

df

In [6]:
# Number of rows of `df` that contain at least one missing value.
missing_values = df.isna().any(axis='columns').sum()
print('Number of Missing Values:', missing_values)

In [7]:
# Fill gaps in every float column by linear interpolation, in both directions.
# NOTE(review): interpolation runs down the whole column of a frame that is
# sorted by sensor and date, so a gap at a sensor boundary is filled using the
# neighbouring sensor's rows -- confirm this is acceptable.
float_columns = df.select_dtypes(include=['float']).columns
interpolated = df[float_columns].interpolate(method='linear', limit_direction='both')
df[float_columns] = interpolated

# Re-count the rows that still contain a missing value.
missing_values = df.isna().any(axis='columns').sum()
print('Number of Missing Values:', missing_values)

In [8]:
ids = df['sensor_id']
dates = df['date']

# The first three dates have no complete lag history and are discarded.
dates_to_remove = [datetime.date(2023, 4, day) for day in (28, 29, 30)]
keep_rows = ~dates.isin(dates_to_remove)

# Lag-1, lag-2 and lag-3 copies of every numeric column, side by side.
X = df.drop(columns=['date', 'sensor_id'])
X = pd.concat([X.shift(lag).add_suffix(f'_lag{lag}') for lag in (1, 2, 3)], axis=1)
X = X[keep_rows].reset_index(drop=True)

# Identifier, date and target for the same rows.
y = df.loc[keep_rows, ['sensor_id', 'date', 'avg_tens']].reset_index(drop=True)

df_merged = pd.concat([y, X], axis=1)
df_merged

## Exploratory Data Analysis

In [9]:
# The two tensiometer sensor sets.
tens_30 = ['72', '76', '73', '74', '61', '63', '67', '65']
tens_60 = ['71', '69', '75', '70', '62', '64', '68', '66']

# All tensiometer ids without duplicates, then ordered by numeric value.
tens_combined = list(set(tens_30 + tens_60))
tens_ordered = sorted(tens_combined, key=int)

In [10]:
# Reshape the daily mean tension into one time series per sensor:
# after the transpose and the removal of the 'date' row, each row of
# `data_array` is a sensor and each column a date.
df_cluster = df[['date', 'sensor_id', 'avg_tens']]
df_transformed = pd.pivot_table(df_cluster, values='avg_tens', index='date', columns='sensor_id', aggfunc='mean').reset_index()
data_array = np.array(df_transformed.T.drop('date').values)

# Min-max scale every series, and keep a 2-D (series, time) view for the
# distortion computation.
X_normalized = TimeSeriesScalerMinMax().fit_transform(data_array)
X_flattened = X_normalized.reshape((X_normalized.shape[0], -1))

# Elbow analysis: for each cluster count, average the distortion over
# several randomly seeded runs.
n_clusters_range = range(2, 10)
num_executions = 10
np.random.seed(0)

average_distortions = []

for n_clusters in n_clusters_range:
    distortions_for_cluster = []

    for _ in range(num_executions):

        # Draw a seed from the global RNG and immediately reseed with it, so
        # the whole sequence of runs is reproducible from np.random.seed(0).
        # NOTE(review): no random_state is passed to TimeSeriesKMeans, so this
        # presumably relies on it using NumPy's global RNG -- confirm.
        seed = np.random.randint(0, 1000)
        np.random.seed(seed)

        # Clustering uses DTW, while calculate_distortion measures Euclidean
        # distances on the flattened series.
        model = TimeSeriesKMeans(n_clusters=n_clusters, metric='dtw', max_iter=10)
        model.fit(X_normalized)
        cluster_centers = model.cluster_centers_
        distortion = calculate_distortion(X_flattened, cluster_centers)
        distortions_for_cluster.append(distortion)

    average_distortion = np.mean(distortions_for_cluster)
    average_distortions.append(average_distortion)


# Average distortion against the number of clusters.
plt.plot(n_clusters_range, average_distortions, marker='o', label='DTW Clustering')
plt.xlabel('Number of Clusters')
plt.ylabel('Distance from Centroids')
plt.title('Select the Optimal Number of Clusters')
plt.legend()
plt.show()

In [11]:
# Two DTW k-means clusterings of the sensor series: 3 and 6 clusters.
# NOTE(review): both are fitted on the raw `data_array`, whereas the elbow
# analysis used the min-max scaled series -- confirm this is intended.
model_1 = TimeSeriesKMeans(n_clusters=3, metric='dtw', max_iter=100)
model_1.fit(data_array)
clusters_1 = model_1.predict(data_array)

model_2 = TimeSeriesKMeans(n_clusters=6, metric='dtw', max_iter=100)
model_2.fit(data_array)
clusters_2 = model_2.predict(data_array)

# Cluster label of every sensor under both clusterings.
cluster_columns = {'Sensor_ID': tens_ordered, 'Cluster 1': clusters_1, 'Cluster 2': clusters_2}
df_cluster = pd.DataFrame(cluster_columns)
df_cluster

## Regression

In [12]:
# Per-sensor regression of the target on the lagged exogenous predictors
# (the target's own lags are excluded); the residuals are then lagged and
# used as additional predictors downstream.
sensor_ids = ['72', '76', '73', '74', '61', '63', '67', '65', '71', '69', '75', '70', '62', '64', '68', '66']
sensor_ids.sort()

residuals = []

for sensor in sensor_ids:

    subset = df_merged[df_merged['sensor_id'] == sensor].copy()
    X = subset.drop(['sensor_id', 'date', 'avg_tens', 'avg_tens_lag1', 'avg_tens_lag2', 'avg_tens_lag3'], axis=1)
    y = subset['avg_tens']

    selected_features = stepwise_bidirectional_selection(X, y, method='aic')
    model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()

    # `params` is indexed by column label, so the first coefficient must be
    # read by position with .iloc (plain [0] is a label lookup).
    intercept = model.params.iloc[0]
    # `df` holds three more rows per sensor than `df_merged` (the dates
    # removed when the lags were built); they are predicted by the intercept.
    intercept_vector = np.full(3, intercept)

    pred_sensor = model.predict(sm.add_constant(X[selected_features]))
    combined_vector = np.concatenate((intercept_vector, pred_sensor))

    selected_values = df.loc[df['sensor_id'] == sensor, 'avg_tens']
    values = selected_values - combined_vector
    res = list(zip(values))

    residuals.extend(res)


df_residuals = pd.DataFrame(residuals, columns=['residuals'])

# Lag-1 to lag-3 residuals, aligned with the rows of `df`.
df_residuals = df_residuals.shift(1).add_suffix('_lag1').join(df_residuals.shift(2).add_suffix('_lag2')).join(df_residuals.shift(3).add_suffix('_lag3'))
df_residuals = pd.concat([df['date'], df_residuals], axis=1)

# Drop the same three dates that were removed from `df_merged`.
dates_to_remove = [datetime.date(2023, 4, 28), datetime.date(2023, 4, 29), datetime.date(2023, 4, 30)]
df_residuals = df_residuals[~df_residuals['date'].isin(dates_to_remove)].reset_index(drop=True)
df_residuals = df_residuals.drop(columns='date')

df_residuals

In [13]:
# Modelling table: lagged predictors plus lagged residuals, without the date.
data = pd.concat([df_merged, df_residuals], axis=1).drop(columns='date')
data

## Group 1

### Sensor 72

In [14]:
# Candidate training sensors for target 72 and every non-empty combination
# of them, ordered by size.
all_sensors = ['61', '62', '63', '67', '68', '71', '74']
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [15]:
best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 72, in parallel.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '72') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [16]:
# Fit on the sensors of the winning subset and hold out sensor 72 for scoring.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '72']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [17]:
# Training pool for the PLS projection: all candidate sensors for this target.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
ids_train, y_train_pls = (
    X_train_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Held-out rows of sensor 72, split the same way.
X_test_pls = data.loc[data['sensor_id'] == '72']
ids_test, y_test_pls = (
    X_test_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count between 1 and 50 with the lowest cross-validated RMSE.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, n_comp_range=50, ylabel='RMSE', objective='min')

In [18]:
# Fit the PLS model with the chosen number of components.
pls_labels = [f'PC{idx}' for idx in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# Replace the original predictors by their PLS scores.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach sensor id and target, then stack training and held-out rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], axis=0, ignore_index=True)

best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 72 on the PLS scores.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '72') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [83]:
# Fit on the PLS scores of the winning subset; hold out sensor 72 for scoring.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '72']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 71

In [84]:
# Candidate training sensors for target 71 and every non-empty combination
# of them, ordered by size.
all_sensors = ['61', '64', '70', '72', '75']
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [85]:
best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 71, in parallel.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '71') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [86]:
# Fit on the sensors of the winning subset and hold out sensor 71 for scoring.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '71']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [87]:
# Training pool for the PLS projection: all candidate sensors for this target.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
ids_train, y_train_pls = (
    X_train_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Held-out rows of sensor 71, split the same way.
X_test_pls = data.loc[data['sensor_id'] == '71']
ids_test, y_test_pls = (
    X_test_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count between 1 and 50 with the lowest cross-validated RMSE.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, n_comp_range=50, ylabel='RMSE', objective='min')

In [88]:
# Fit the PLS model with the chosen number of components.
pls_labels = [f'PC{idx}' for idx in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# Replace the original predictors by their PLS scores.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach sensor id and target, then stack training and held-out rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], axis=0, ignore_index=True)

best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 71 on the PLS scores.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '71') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [89]:
# Fit on the PLS scores of the winning subset; hold out sensor 71 for scoring.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '71']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

## Group 2

### Sensor 76

In [90]:
# Candidate training sensors for target 76 and every non-empty combination
# of them, ordered by size.
all_sensors = ['65', '66', '69', '73']
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [91]:
best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 76, in parallel.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '76') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [92]:
# Fit on the sensors of the winning subset and hold out sensor 76 for scoring.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '76']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [93]:
# Training pool for the PLS projection: all candidate sensors for this target.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
ids_train, y_train_pls = (
    X_train_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Held-out rows of sensor 76, split the same way.
X_test_pls = data.loc[data['sensor_id'] == '76']
ids_test, y_test_pls = (
    X_test_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count between 1 and 50 with the lowest cross-validated RMSE.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, n_comp_range=50, ylabel='RMSE', objective='min')

In [94]:
# Fit the PLS model with the chosen number of components.
pls_labels = [f'PC{idx}' for idx in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# Replace the original predictors by their PLS scores.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach sensor id and target, then stack training and held-out rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], axis=0, ignore_index=True)

best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 76 on the PLS scores.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '76') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [95]:
# Fit on the PLS scores of the winning subset; hold out sensor 76 for scoring.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '76']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 69

In [98]:
# Candidate training sensors for target 69 and every non-empty combination
# of them, ordered by size.
all_sensors = ['65', '66', '73', '76']
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [99]:
best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 69, in parallel.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '69') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [100]:
# Fit on the sensors of the winning subset and hold out sensor 69 for scoring.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '69']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [101]:
# Training pool for the PLS projection: all candidate sensors for this target.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
ids_train, y_train_pls = (
    X_train_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Held-out rows of sensor 69, split the same way.
X_test_pls = data.loc[data['sensor_id'] == '69']
ids_test, y_test_pls = (
    X_test_pls[[column]].reset_index(drop=True) for column in ('sensor_id', 'avg_tens')
)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count between 1 and 50 with the lowest cross-validated RMSE.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, n_comp_range=50, ylabel='RMSE', objective='min')

In [103]:
# Fit the PLS model with the chosen number of components.
pls_labels = [f'PC{idx}' for idx in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# Replace the original predictors by their PLS scores.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach sensor id and target, then stack training and held-out rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], axis=0, ignore_index=True)

best_subset, best_rmse = None, float('inf')

# Score every candidate training subset against sensor 69 on the PLS scores.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '69') for subset in all_subsets
)

# Keep the first subset that reaches the lowest hold-out RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [104]:
# Fit on the PLS scores of the winning subset; hold out sensor 69 for scoring.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '69']

# Predictors are every column except the sensor id and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Stepwise AIC selection on the training rows, then OLS with a constant column.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Hold-out metrics, each rounded to three decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# Spread of the per-row absolute errors (square root of each squared error).
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Group 3

### Sensor 73

In [105]:
# Candidate training sensors for target sensor 73.
all_sensors = ['65', '66', '69', '75', '76']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [106]:
# Exhaustive search over candidate training subsets for target sensor 73,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '73')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [107]:
# Train on the selected subset and evaluate on held-out target sensor 73.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '73']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [108]:
# PLS inputs for target sensor 73: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('73')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [109]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 73 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '73')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [110]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 73.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '73']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 75

In [267]:
# Candidate training sensors for target sensor 75.
all_sensors = ['62', '64', '67', '68', '73']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [268]:
# Exhaustive search over candidate training subsets for target sensor 75,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '75')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [269]:
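# Manual override, currently disabled; uncommenting it replaces the searched best_subset.
# NOTE(review): '76' is not in this section's all_sensors list — confirm before re-enabling.
# best_subset = ['68', '73', '76']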

In [270]:
# Train on the selected subset and evaluate on held-out target sensor 75.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '75']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [162]:
# PLS inputs for target sensor 75: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('75')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [163]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 75 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '75')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [164]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 75.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '75']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Group 4

### Sensor 74

In [280]:
# Candidate training sensors for target sensor 74.
all_sensors = ['62', '63', '67', '68', '70', '72']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [281]:
# Exhaustive search over candidate training subsets for target sensor 74,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '74')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [282]:
# Train on the selected subset and evaluate on held-out target sensor 74.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '74']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [128]:
# PLS inputs for target sensor 74: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('74')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [129]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 74 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '74')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [130]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 74.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '74']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 70

In [300]:
# Candidate training sensors for target sensor 70.
all_sensors = ['61', '64', '71', '74', '75']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [301]:
# Exhaustive search over candidate training subsets for target sensor 70,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '70')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [None]:
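# Manual override, currently disabled; uncommenting it replaces the searched best_subset.
# NOTE(review): '65', '68' and '69' are not in this section's all_sensors list — confirm before re-enabling.
# best_subset = ['64', '65', '68', '69', '71']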

In [302]:
# Train on the selected subset and evaluate on held-out target sensor 70.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '70']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [134]:
# PLS inputs for target sensor 70: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('70')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [135]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 70 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '70')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [136]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 70.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '70']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Group 5

### Sensor 61

In [315]:
# Candidate training sensors for target sensor 61.
all_sensors = ['62', '63', '64', '70', '71', '72', '75']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [316]:
# Exhaustive search over candidate training subsets for target sensor 61,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '61')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [317]:
# Train on the selected subset and evaluate on held-out target sensor 61.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '61']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# 'avg_tens_lag1' values of the test rows; not used further in this cell.
y_baseline = df_test['avg_tens_lag1'].reset_index(drop=True)

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [311]:
# PLS inputs for target sensor 61: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('61')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [145]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 61 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '61')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [146]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 61.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '61']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 62

In [331]:
# Candidate training sensors for target sensor 62.
all_sensors = ['61', '64', '67', '68', '75']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [332]:
# Exhaustive search over candidate training subsets for target sensor 62,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '62')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [326]:
# Train on the selected subset and evaluate on held-out target sensor 62.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '62']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [149]:
# PLS inputs for target sensor 62: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('62')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [150]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 62 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '62')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [151]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 62.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '62']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Group 6

### Sensor 63

In [350]:
# Candidate training sensors for target sensor 63.
all_sensors = ['61', '64', '67', '72']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [351]:
# Exhaustive search over candidate training subsets for target sensor 63,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '63')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [335]:
# Train on the selected subset and evaluate on held-out target sensor 63.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '63']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [339]:
# PLS inputs for target sensor 63: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('63')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [340]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 63 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '63')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [341]:
# Train on the selected subset of PLS-score rows and evaluate on held-out target sensor 63.
df_train = data_pls[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls[data_pls['sensor_id'] == '63']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 64

In [390]:
# Candidate training sensors for target sensor 64.
all_sensors = ['62', '63', '68', '70', '75']

# Every non-empty combination of the candidates, ordered by subset size.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [391]:
# Exhaustive search over candidate training subsets for target sensor 64,
# scored in parallel (n_jobs=-1) on `data`.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data, '64')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [392]:
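# Manual override, currently disabled; uncommenting it replaces the searched best_subset.
# NOTE(review): none of these ids are in this section's all_sensors list and '69' appears twice — confirm before re-enabling.
# best_subset = ['65', '66', '69', '74', '69']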

In [393]:
# Train on the selected subset and evaluate on held-out target sensor 64.
df_train = data[data['sensor_id'].isin(list(best_subset))]
df_test = data[data['sensor_id'] == '64']

# Predictors are every column except the sensor id and the target.
X_train = df_train.drop(['sensor_id', 'avg_tens'], axis=1)
y_train = df_train['avg_tens']

X_test = df_test.drop(['sensor_id', 'avg_tens'], axis=1)
y_test = df_test['avg_tens']

# Features are chosen from the training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

# has_constant='add' always inserts the intercept column. The default ('skip')
# omits it when a selected column is constant within the frame (e.g. a
# zero-variance feature or a single-row frame), which leaves the train and test
# design matrices with different widths and breaks predict().
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))


r_squared_arimax = round(r_squared(y_test, pred_sensor), 3)
mape_arimax = round(mape(y_test, pred_sensor), 3)
mae_arimax = round(mae(y_test, pred_sensor), 3)
rmse_arimax = round(rmse(y_test, pred_sensor), 3)

# Standard deviation of the per-row absolute errors.
rmse_std = round(np.std(np.abs(y_test - pred_sensor)), 3)

# NOTE(review): the column is labelled 'ARIMAX' but the model above is sm.OLS — confirm the label.
table = [
    ['Metric', 'ARIMAX'],
    ['R-squared', r_squared_arimax],
    ['MAPE', mape_arimax],
    ['MAE', mae_arimax],
    ['RMSE', rmse_arimax],
    ['RMSE_std', rmse_std]
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [355]:
# PLS inputs for target sensor 64: rows of the candidate sensors form the
# training split, rows of the target sensor form the held-out split.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
X_test_pls = data.loc[data['sensor_id'].eq('64')]

# Ids and target are kept as single-column frames with a fresh 0..n-1 index.
ids_train = X_train_pls[['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls[['avg_tens']].reset_index(drop=True)
ids_test = X_test_pls[['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls[['avg_tens']].reset_index(drop=True)

# Only predictor columns remain after removing the id and the target.
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component count is chosen from the training split only.
# NOTE(review): 50 / 'RMSE' / 'min' presumably mean max components, metric and
# direction — confirm against optimize_ncomp_pls.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [356]:
# One label per retained component: PC1 ... PC<best_n_comp>.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]

# The PLS projection is fitted on the candidate-sensor rows only.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: the predictor frames are overwritten with their PLS scores, so this
# cell is not idempotent; rebuild X_train_pls / X_test_pls before re-running it.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach id and target column-wise, then stack train above test.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)

# Exhaustive subset search for target sensor 64 on the PLS-score table.
best_subset, best_rmse = None, float('inf')

results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(subset, data_pls, '64')
    for subset in all_subsets
)

# Each result unpacks to (subset, score); the strictly lowest score wins,
# so the earliest subset is kept on ties.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse, best_subset = rmse_subset, subset

print(f"Best Subset: {best_subset}")

In [357]:
# Hold-out evaluation for sensor '64' on the PLS score table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '64'.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '64']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [32]:
# Re-evaluate sensor '64' on the raw feature table with a manually fixed
# training subset instead of the result of a subset search.
best_subset = ['63', '65', '66', '70', '71', '74']

df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '64']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Group 7

### Sensor 67

In [408]:
# Candidate training sensors for target sensor 67.
all_sensors = ['61', '63', '68', '72']
# Every non-empty combination of the candidates, ordered by size (1 .. all of
# them) and, within a size, in the order itertools.combinations yields them.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [409]:
# Exhaustive search on the raw feature table for the training-sensor subset
# with the lowest RMSE when sensor '67' is the evaluation target.
best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data, '67') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [410]:
# Hold-out evaluation for sensor '67' on the raw feature table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '67'.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '67']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [397]:
# Training pool for the PLS fit: every row that belongs to a candidate sensor.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
# Identifier and target are kept as single-column frames with a fresh 0..n-1
# index so they can later be concatenated column-wise with the PLS scores.
ids_train = X_train_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Same split for the held-out sensor '67'.
X_test_pls = data.loc[data['sensor_id'] == '67']
ids_test = X_test_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component-count search is delegated to optimize_ncomp_pls (defined elsewhere).
# Presumably 50 is the upper bound on components and 'RMSE'/'min' the criterion
# and its direction -- TODO confirm against the helper.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [398]:
# Project the candidate-training and sensor-'67' rows onto best_n_comp PLS
# components, then search all sensor subsets for the lowest RMSE on sensor '67'.
pls_labels = [f'PC{component_no}' for component_no in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp).fit(X_train_pls, y_train_pls)

# NOTE: X_train_pls / X_test_pls are overwritten with the PLS scores here, so the
# preparation cell must be re-run before this cell is executed again.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach identifier and target column-wise, then stack train rows over test rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis='columns')
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis='columns')
data_pls = pd.concat((train_pls, test_pls), ignore_index=True)

best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data_pls, '67') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [399]:
# Hold-out evaluation for sensor '67' on the PLS score table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '67'.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '67']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 68

In [432]:
# Candidate training sensors for target sensor 68.
all_sensors = ['62', '64', '67', '70', '75']
# Every non-empty combination of the candidates, ordered by size (1 .. all of
# them) and, within a size, in the order itertools.combinations yields them.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [433]:
# Exhaustive search on the raw feature table for the training-sensor subset
# with the lowest RMSE when sensor '68' is the evaluation target.
best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data, '68') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [434]:
# Hold-out evaluation for sensor '68' on the raw feature table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '68'.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '68']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [414]:
# Training pool for the PLS fit: every row that belongs to a candidate sensor.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
# Identifier and target are kept as single-column frames with a fresh 0..n-1
# index so they can later be concatenated column-wise with the PLS scores.
ids_train = X_train_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Same split for the held-out sensor '68'.
X_test_pls = data.loc[data['sensor_id'] == '68']
ids_test = X_test_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component-count search is delegated to optimize_ncomp_pls (defined elsewhere).
# Presumably 50 is the upper bound on components and 'RMSE'/'min' the criterion
# and its direction -- TODO confirm against the helper.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [415]:
# Project the candidate-training and sensor-'68' rows onto best_n_comp PLS
# components, then search all sensor subsets for the lowest RMSE on sensor '68'.
pls_labels = [f'PC{component_no}' for component_no in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp).fit(X_train_pls, y_train_pls)

# NOTE: X_train_pls / X_test_pls are overwritten with the PLS scores here, so the
# preparation cell must be re-run before this cell is executed again.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach identifier and target column-wise, then stack train rows over test rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis='columns')
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis='columns')
data_pls = pd.concat((train_pls, test_pls), ignore_index=True)

best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data_pls, '68') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [416]:
# Hold-out evaluation for sensor '68' on the PLS score table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '68'.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '68']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Group 8

### Sensor 65

In [None]:
# Candidate training sensors for target sensor 65.
all_sensors = ['66', '69', '73', '76']
# Every non-empty combination of the candidates, ordered by size (1 .. all of
# them) and, within a size, in the order itertools.combinations yields them.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [146]:
# Exhaustive search on the raw feature table for the training-sensor subset
# with the lowest RMSE when sensor '65' is the evaluation target.
best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data, '65') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [147]:
# Hold-out evaluation for sensor '65' on the raw feature table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '65'.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '65']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [152]:
# Training pool for the PLS fit: every row that belongs to a candidate sensor.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
# Identifier and target are kept as single-column frames with a fresh 0..n-1
# index so they can later be concatenated column-wise with the PLS scores.
ids_train = X_train_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Same split for the held-out sensor '65'.
X_test_pls = data.loc[data['sensor_id'] == '65']
ids_test = X_test_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component-count search is delegated to optimize_ncomp_pls (defined elsewhere).
# Presumably 50 is the upper bound on components and 'RMSE'/'min' the criterion
# and its direction -- TODO confirm against the helper.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [153]:
# Project the candidate-training and sensor-'65' rows onto PLS components, then
# search all sensor subsets for the lowest RMSE on sensor '65'.
pls_labels = [f'PC{i}' for i in range(1, best_n_comp + 1)]
# The component count must equal len(pls_labels); otherwise the DataFrame
# construction below fails on a column-count mismatch. Use the tuned value
# rather than a fixed number.
pls_model = PLSRegression(n_components=best_n_comp)
pls_model.fit(X_train_pls, y_train_pls)

# NOTE: X_train_pls / X_test_pls are overwritten with the PLS scores here, so the
# preparation cell must be re-run before this cell is executed again.
X_train_pls = pls_model.transform(X_train_pls)
X_test_pls = pls_model.transform(X_test_pls)
X_train_pls = pd.DataFrame(X_train_pls, columns=pls_labels)
X_test_pls = pd.DataFrame(X_test_pls, columns=pls_labels)

# Re-attach identifier and target column-wise (all pieces share a 0..n-1 index),
# then stack train rows over test rows into one table.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis=1)
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis=1)
data_pls = pd.concat([train_pls, test_pls], ignore_index=True)


best_subset = None
best_rmse = float('inf')


# Score every candidate subset against this section's held-out sensor '65'
# (the sensor the subsequent evaluation cell tests on). Sensor '73' is one of the
# training candidates in all_sensors, so it must not be used as the target here.
results = Parallel(n_jobs=-1)(delayed(evaluate_subset)(subset, data_pls, '65') for subset in all_subsets)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if rmse_subset < best_rmse:
        best_rmse = rmse_subset
        best_subset = subset

print(f"Best Subset: {best_subset}")

In [154]:
# Hold-out evaluation for sensor '65' on the PLS score table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '65'.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '65']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

### Sensor 66

In [None]:
# Candidate training sensors for target sensor 66.
all_sensors = ['65', '69', '73', '76']
# Every non-empty combination of the candidates, ordered by size (1 .. all of
# them) and, within a size, in the order itertools.combinations yields them.
all_subsets = [
    combo
    for size in range(1, len(all_sensors) + 1)
    for combo in combinations(all_sensors, size)
]

In [155]:
# Exhaustive search on the raw feature table for the training-sensor subset
# with the lowest RMSE when sensor '66' is the evaluation target.
best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data, '66') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [156]:
# Hold-out evaluation for sensor '66' on the raw feature table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '66'.
df_train = data.loc[data['sensor_id'].isin(list(best_subset))]
df_test = data.loc[data['sensor_id'] == '66']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [161]:
# Training pool for the PLS fit: every row that belongs to a candidate sensor.
X_train_pls = data.loc[data['sensor_id'].isin(list(all_sensors))]
# Identifier and target are kept as single-column frames with a fresh 0..n-1
# index so they can later be concatenated column-wise with the PLS scores.
ids_train = X_train_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_train_pls = X_train_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_train_pls = X_train_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Same split for the held-out sensor '66'.
X_test_pls = data.loc[data['sensor_id'] == '66']
ids_test = X_test_pls.loc[:, ['sensor_id']].reset_index(drop=True)
y_test_pls = X_test_pls.loc[:, ['avg_tens']].reset_index(drop=True)
X_test_pls = X_test_pls.drop(['sensor_id', 'avg_tens'], axis=1)

# Component-count search is delegated to optimize_ncomp_pls (defined elsewhere).
# Presumably 50 is the upper bound on components and 'RMSE'/'min' the criterion
# and its direction -- TODO confirm against the helper.
best_n_comp = optimize_ncomp_pls(X_train_pls, y_train_pls, 50, 'RMSE', 'min')

In [162]:
# Project the candidate-training and sensor-'66' rows onto best_n_comp PLS
# components, then search all sensor subsets for the lowest RMSE on sensor '66'.
pls_labels = [f'PC{component_no}' for component_no in range(1, best_n_comp + 1)]
pls_model = PLSRegression(n_components=best_n_comp).fit(X_train_pls, y_train_pls)

# NOTE: X_train_pls / X_test_pls are overwritten with the PLS scores here, so the
# preparation cell must be re-run before this cell is executed again.
X_train_pls = pd.DataFrame(pls_model.transform(X_train_pls), columns=pls_labels)
X_test_pls = pd.DataFrame(pls_model.transform(X_test_pls), columns=pls_labels)

# Re-attach identifier and target column-wise, then stack train rows over test rows.
train_pls = pd.concat([ids_train, y_train_pls, X_train_pls], axis='columns')
test_pls = pd.concat([ids_test, y_test_pls, X_test_pls], axis='columns')
data_pls = pd.concat((train_pls, test_pls), ignore_index=True)

best_subset, best_rmse = None, float('inf')

# One evaluate_subset call per candidate subset, dispatched through joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(evaluate_subset)(candidate, data_pls, '66') for candidate in all_subsets
)

# Strict '<' keeps the first subset encountered when several share the minimum RMSE.
for subset, rmse_subset in results:
    if not rmse_subset < best_rmse:
        continue
    best_subset, best_rmse = subset, rmse_subset

print(f"Best Subset: {best_subset}")

In [163]:
# Hold-out evaluation for sensor '66' on the PLS score table: fit OLS on the
# rows of the sensors in best_subset and score the predictions on sensor '66'.
df_train = data_pls.loc[data_pls['sensor_id'].isin(list(best_subset))]
df_test = data_pls.loc[data_pls['sensor_id'] == '66']

# Predictors are every column except the sensor identifier and the target.
X_train, y_train = df_train.drop(columns=['sensor_id', 'avg_tens']), df_train['avg_tens']
X_test, y_test = df_test.drop(columns=['sensor_id', 'avg_tens']), df_test['avg_tens']

# Feature selection (helper defined elsewhere in the notebook) sees training rows only.
selected_features = stepwise_bidirectional_selection(X_train, y_train, method='aic')

model = sm.OLS(y_train, sm.add_constant(X_train[selected_features])).fit()
pred_sensor = model.predict(sm.add_constant(X_test[selected_features]))

# Each metric helper is called as metric(y_test, pred_sensor) and rounded to 3 decimals.
r_squared_arimax, mape_arimax, mae_arimax, rmse_arimax = (
    round(metric(y_test, pred_sensor), 3) for metric in (r_squared, mape, mae, rmse)
)

# sqrt(residual ** 2) is the per-row absolute error; this is the spread of those errors.
rmse_std = round(np.std(np.sqrt((y_test - pred_sensor) ** 2)), 3)

# NOTE: the value column is labelled 'ARIMAX' although the model fitted above is sm.OLS.
table = [['Metric', 'ARIMAX']] + [
    [label, value]
    for label, value in (
        ('R-squared', r_squared_arimax),
        ('MAPE', mape_arimax),
        ('MAE', mae_arimax),
        ('RMSE', rmse_arimax),
        ('RMSE_std', rmse_std),
    )
]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))