In [8]:
# Libraries

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import openmeteo_requests
import requests_cache
import pickle

from fuzzy_expert.variable import FuzzyVariable
from fuzzy_expert.rule import FuzzyRule
from fuzzy_expert.inference import DecompositionalInference
from retry_requests import retry
from datetime import date, timedelta, datetime
from scipy.stats import linregress
from ipywidgets import interact, widgets

import warnings

warnings.filterwarnings("ignore")

In [9]:
def pre_processing(data):
    """Turn raw sensor readings into a daily, per-tensiometer table with lagged features.

    Steps, in order:
      1. keep the six columns used below, cast ids to ``str``, truncate
         ``timestamp`` to a calendar date and cast ``value`` to ``float``;
      2. duplicate every row that does not belong to a ``tens_30`` sensor under
         a ``<group_id>_dup`` group, and drop ``tens_60`` sensors from the
         original groups, so each group ends up with a single tensiometer;
      3. aggregate ``value`` per (date, description, sensor, group) and pivot
         the descriptions into columns;
      4. replace the group id by the id of the tensiometer it contains;
      5. linearly interpolate missing floats and append the values of the
         previous 1, 2 and 3 rows as ``*_lag1`` / ``*_lag2`` / ``*_lag3``;
      6. drop the three hard-coded warm-up dates and keep ``tens_30`` sensors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``reading_id``, ``timestamp``, ``sensor_id``,
        ``value``, ``description`` and ``group_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sensor_id``, ``date``, ``avg_tens`` followed by the lagged
        features. The index is not reset after the final sensor filter.

    Raises
    ------
    ValueError
        If the pivot does not produce exactly seven description columns per
        aggregation, because the positional renaming would be wrong.
    """

    # Columns Selection and Formatting

    id_columns = ['reading_id', 'sensor_id', 'description', 'group_id']
    df_rovere = data[['reading_id', 'timestamp', 'sensor_id', 'value', 'description', 'group_id']].copy()
    df_rovere[id_columns] = df_rovere[id_columns].astype(str)
    df_rovere['timestamp'] = pd.to_datetime(df_rovere['timestamp']).dt.floor('D').dt.date
    df_rovere['value'] = df_rovere['value'].astype(float)

    # Tensiometer sensor ids; the i-th entry of each list belongs to group i+1.
    # presumably the suffix is the installation depth (30 / 60) - TODO confirm
    tens_30 = ['72', '76', '73', '74', '61', '63', '67', '65']
    tens_60 = ['71', '69', '75', '70', '62', '64', '68', '66']

    # Capitalising these two labels changes where they sort among the pivoted
    # description columns, which the positional renaming below depends on.
    df_rovere.loc[df_rovere['description'] == 'tensiometer', 'description'] = 'Tensiometer'
    df_rovere.loc[df_rovere['description'] == 'irrigation', 'description'] = 'Irrigation'

    # Duplication

    # Explicit copy: assigning to a filtered slice triggers pandas'
    # SettingWithCopyWarning (previously hidden by the global warnings filter).
    df_dup = df_rovere[~df_rovere['sensor_id'].isin(tens_30)].copy()
    df_dup['group_id'] = df_dup['group_id'] + '_dup'

    df_rovere = df_rovere[~df_rovere['sensor_id'].isin(tens_60)]
    df_rovere = pd.concat([df_rovere, df_dup], ignore_index=True)
    df_rovere = df_rovere.sort_values(by=['group_id', 'timestamp']).reset_index(drop=True)

    # Grouping and Creation of Summary Values

    df_group = df_rovere.groupby(['timestamp', 'description', 'sensor_id', 'group_id']).agg({'value': ['min', 'max', 'mean', 'median', 'sum']}).reset_index()
    df_group.columns = ['timestamp', 'description', 'sensor_id', 'group_id', 'val_min', 'val_max', 'val_avg', 'val_med', 'val_sum']

    # Pivoting

    df_pivot = df_group.pivot(index=['timestamp', 'group_id'], columns='description', values=['val_min', 'val_max', 'val_avg', 'val_med', 'val_sum']).reset_index()

    # The pivoted columns are renamed by position, so the number of distinct
    # descriptions must match the number of feature labels.
    # NOTE(review): this also assumes the sorted descriptions line up with the
    # label order below - verify against the data.
    aggregations = ['min', 'max', 'avg', 'med', 'sum']
    feature_labels = ['hum', 'temp', 'solar', 'wind', 'irr', 'rain', 'tens']
    expected_width = 2 + len(aggregations) * len(feature_labels)
    if df_pivot.shape[1] != expected_width:
        raise ValueError(
            f"pivot produced {df_pivot.shape[1]} columns, expected {expected_width} "
            f"({len(feature_labels)} descriptions x {len(aggregations)} aggregations + 2 keys)"
        )
    df_pivot.columns = ['date', 'group_id'] + [f"{agg}_{feature}" for agg in aggregations for feature in feature_labels]

    df = df_pivot.reset_index(drop=True)

    # Sensor ID Mapping

    # Group "i" now holds only its tens_30 sensor and group "i_dup" only its
    # tens_60 sensor, so the group id can be swapped for that sensor id.
    group_id_mapping = {str(i): str(j) for i, j in zip(range(1, 9), tens_30)}
    group_id_mapping.update({str(i) + '_dup': str(j) for i, j in zip(range(1, 9), tens_60)})
    df['group_id'] = df['group_id'].replace(group_id_mapping)
    df = df.rename(columns={'group_id': 'sensor_id'})

    df = df[['sensor_id', 'date', 'avg_tens', 'max_temp', 'avg_hum', 'avg_solar', 'sum_rain', 'sum_irr']]
    df = df.sort_values(by=['sensor_id', 'date']).reset_index(drop=True)

    # Imputation of Missing Values

    # NOTE(review): interpolation runs over the whole frame, so a gap at the
    # start or end of one sensor's rows is filled using the neighbouring sensor.
    float_columns = df.select_dtypes(include=['float']).columns
    df[float_columns] = df[float_columns].interpolate(method='linear', limit_direction='both')

    # Shifting Values using the previous 3 Days

    # The shift is positional (previous rows), not per sensor: the first three
    # rows of each sensor receive lags from the preceding sensor. Those rows
    # are removed through the hard-coded dates below.
    # NOTE(review): this assumes every sensor starts on 2023-04-28 with no
    # missing days - confirm against the data.
    features = df.drop(columns=['date', 'sensor_id'])
    lagged = pd.concat(
        [features.shift(lag).add_suffix(f'_lag{lag}') for lag in (1, 2, 3)],
        axis=1,
    )

    dates_to_remove = [date(2023, 4, 28), date(2023, 4, 29), date(2023, 4, 30)]
    keep = ~df['date'].isin(dates_to_remove)

    X = lagged[keep].reset_index(drop=True)
    y = df.loc[keep, ['sensor_id', 'date', 'avg_tens']].reset_index(drop=True)

    df_merged = pd.concat([y, X], axis=1)

    # Only tens_30 sensors are returned; a copy avoids handing callers a
    # view-like slice that warns on later assignment.
    return df_merged[df_merged['sensor_id'].isin(tens_30)].copy()

In [10]:
# Read the raw readings from disk and feed them straight into pre_processing;
# the processed frame is the cell's displayed output.
df = pre_processing(pd.read_json('row_data_rovere.json'))
df

In [12]:
# Hold out every 10th row of df (positions 0, 10, 20, ...) as df_test and keep
# the remaining rows in df.
#
# A positional boolean mask replaces the previous row-by-row loop, which
#   * appended to an empty frame with .loc, turning every df_test column into
#     object dtype, and
#   * called df.drop once per held-out row, which is quadratic in len(df).
# NOTE: this cell still rebinds df, so running it twice splits df again.
holdout_mask = np.arange(len(df)) % 10 == 0

df_test = df[holdout_mask].reset_index(drop=True)

# Reset the index so both frames are labelled 0..n-1.
df = df[~holdout_mask].reset_index(drop=True)

In [16]:
# Inspect the current contents of df.
df

In [17]:
# Inspect the current contents of the held-out frame df_test.
df_test

In [15]:
# Remove the identifier columns so only the target and the lagged features
# remain. Only columns that are still present are dropped, so re-running the
# cell is a no-op instead of raising KeyError.
id_columns = ['sensor_id', 'date']

# Drop the identifier columns from df
df = df.drop(columns=[col for col in id_columns if col in df.columns])

# Drop the identifier columns from df_test
df_test = df_test.drop(columns=[col for col in id_columns if col in df_test.columns])

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Separate predictors and target once, then carve out a 20% validation split.
features = df.drop(columns=['avg_tens'])
target = df['avg_tens']
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions and mean squared error on the training and validation splits.
train_predictions = model.predict(X_train)
val_predictions = model.predict(X_val)
train_error = mean_squared_error(y_train, train_predictions)
val_error = mean_squared_error(y_val, val_predictions)

print("Train Error:", train_error)
print("Validation Error:", val_error)

# Score the same fitted model on the held-out df_test rows.
y_test = df_test['avg_tens']
X_test = df_test.drop(columns=['avg_tens'])
test_predictions = model.predict(X_test)
test_error = mean_squared_error(y_test, test_predictions)

print("Test Error on df_test:", test_error)

In [19]:
from sklearn.metrics import mean_squared_error
import numpy as np


def _rmse(actual, predicted):
    """Root of the mean squared error between actual and predicted values."""
    return np.sqrt(mean_squared_error(actual, predicted))


# RMSE of the predictions already held in train_predictions, val_predictions
# and test_predictions.
train_rmse = _rmse(y_train, train_predictions)
val_rmse = _rmse(y_val, val_predictions)
test_rmse = _rmse(y_test, test_predictions)

print("Train RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)
print("Test RMSE on df_test:", test_rmse)

In [20]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the existing predictions, one value per split.
train_mae, val_mae, test_mae = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in (
        (y_train, train_predictions),
        (y_val, val_predictions),
        (y_test, test_predictions),
    )
)

print("Train MAE:", train_mae)
print("Validation MAE:", val_mae)
print("Test MAE on df_test:", test_mae)


In [21]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Predictors / target, computed once instead of re-dropping the column in
# every fold.
features = df.drop(columns=['avg_tens'])
target = df['avg_tens']

# Linear regression model
model = LinearRegression()

# Number of folds for the cross-validation
num_folds = 5

# Shuffled K-fold splitter with a fixed seed so the folds are reproducible
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-fold mean squared errors
train_errors = []
val_errors = []

for train_index, val_index in kf.split(features):
    X_train, X_val = features.iloc[train_index], features.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # Fit on the fold's training part
    model.fit(X_train, y_train)

    # Error on the fold's training part
    train_predictions = model.predict(X_train)
    train_error = mean_squared_error(y_train, train_predictions)
    train_errors.append(train_error)

    # Error on the fold's validation part
    val_predictions = model.predict(X_val)
    val_error = mean_squared_error(y_val, val_predictions)
    val_errors.append(val_error)

# Average the per-fold errors
mean_train_error = np.mean(train_errors)
mean_val_error = np.mean(val_errors)

# Cross-validation results
print("Mean Train Error:", mean_train_error)
print("Mean Validation Error:", mean_val_error)

# After the loop `model` holds the fit of the last fold only. Refit on all of
# df so the test score does not depend on which fold happened to run last.
model.fit(features, target)

# Error on the held-out df_test rows
test_predictions = model.predict(df_test.drop(columns=['avg_tens']))
test_error = mean_squared_error(df_test['avg_tens'], test_predictions)

print("Test Error on df_test:", test_error)


In [22]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Predictors / target, computed once.
features = df.drop(columns=['avg_tens'])
target = df['avg_tens']

# Linear regression model
model = LinearRegression()

# Number of folds for the cross-validation
num_folds = 5

# Shuffled K-fold splitter with a fixed seed so the folds are reproducible
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-fold scores. RMSE and MAE are collected in one pass: with a fixed
# random_state the splits are identical on every iteration of kf.split, so a
# second loop would only refit the same models.
train_rmse_scores, val_rmse_scores = [], []
train_mae_scores, val_mae_scores = [], []

for train_index, val_index in kf.split(features):
    X_train, X_val = features.iloc[train_index], features.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # Fit on the fold's training part
    model.fit(X_train, y_train)

    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    train_rmse_scores.append(np.sqrt(mean_squared_error(y_train, train_predictions)))
    val_rmse_scores.append(np.sqrt(mean_squared_error(y_val, val_predictions)))
    train_mae_scores.append(mean_absolute_error(y_train, train_predictions))
    val_mae_scores.append(mean_absolute_error(y_val, val_predictions))

# Average the per-fold RMSE
mean_train_rmse = np.mean(train_rmse_scores)
mean_val_rmse = np.mean(val_rmse_scores)

print("Mean Train RMSE:", mean_train_rmse)
print("Mean Validation RMSE:", mean_val_rmse)

# Average the per-fold MAE
mean_train_mae = np.mean(train_mae_scores)
mean_val_mae = np.mean(val_mae_scores)

print("Mean Train MAE:", mean_train_mae)
print("Mean Validation MAE:", mean_val_mae)

# After the loop `model` holds the fit of the last fold only. Refit on all of
# df so the test scores do not depend on which fold happened to run last.
model.fit(features, target)

# RMSE on the held-out df_test rows
test_predictions = model.predict(df_test.drop(columns=['avg_tens']))
test_error = np.sqrt(mean_squared_error(df_test['avg_tens'], test_predictions))  # RMSE

print("Test RMSE on df_test:", test_error)

# MAE on the held-out df_test rows
test_error_mae = mean_absolute_error(df_test['avg_tens'], test_predictions)  # MAE

print("Test MAE on df_test:", test_error_mae)


In [34]:
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Predictors / target used for the search.
features = df.drop(columns=['avg_tens'])
target = df['avg_tens']

# XGBoost regressor with default settings; the grid below overrides three of them
model = XGBRegressor()

# Hyper-parameter grid to explore (3 x 3 x 3 = 27 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Number of folds for the cross-validation.
# NOTE(review): 80 folds x 27 candidates = 2160 fits, and each validation fold
# is small; confirm this value is intentional.
num_folds = 80

# Shuffled K-fold splitter with a fixed seed so the folds are reproducible
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Multi-metric search. Candidates are still ranked by negative MSE (refit='mse'),
# while RMSE and MAE are tracked so they can be reported for the selected
# candidate. Previously only MSE was scored, and the value printed as MAE was
# in fact the mean MSE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring={
        'mse': 'neg_mean_squared_error',
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
    },
    refit='mse',
    cv=kf,
)

# Run the hyper-parameter search
grid_result = grid_search.fit(features, target)

# Best hyper-parameters found
print("Best Parameters:", grid_result.best_params_)

# With refit enabled, best_estimator_ is already refitted on all of
# (features, target), so no extra fit call is needed.
best_model = grid_result.best_estimator_

# Cross-validated scores of the selected candidate only. Averaging over every
# row of cv_results_ would mix in the candidates that were rejected.
cv_results = grid_result.cv_results_
best_index = grid_result.best_index_
mean_val_rmse = -cv_results['mean_test_rmse'][best_index]  # mean of per-fold RMSE
mean_val_mae = -cv_results['mean_test_mae'][best_index]  # mean of per-fold MAE

print("Mean Validation RMSE:", mean_val_rmse)
print("Mean Validation MAE:", mean_val_mae)

# Evaluate on df_test. The explicit float cast keeps this cell independent of
# any separate dtype-fixing cell; it is a no-op when the columns are already
# float64.
test_features = df_test.drop(columns=['avg_tens']).astype('float64')
test_target = df_test['avg_tens'].astype('float64')

test_predictions = best_model.predict(test_features)
test_error_rmse = np.sqrt(mean_squared_error(test_target, test_predictions))  # RMSE
test_error_mae = mean_absolute_error(test_target, test_predictions)  # MAE

print("Test RMSE on df_test:", test_error_rmse)
print("Test MAE on df_test:", test_error_mae)


In [31]:
# Cast every column of df_test to float64.
# NOTE(review): presumably needed because df_test was filled row by row and
# ended up with object-dtype columns - confirm.
df_test = df_test.astype('float64')


In [35]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Predictors must not contain the target. The previous version passed the whole
# frame (including 'avg_tens') as X, so the model could read the answer straight
# from its inputs and the reported errors were meaningless.
features = df.drop(columns=['avg_tens'])
target = df['avg_tens']

# Number of splits for the cross-validation
n_splits = 5

# Linear regression model
model = LinearRegression()

# TimeSeriesSplit trains on the first rows and validates on the rows that
# follow, with a training window that grows from split to split.
# NOTE(review): the splitter works on row order. It is only a genuine
# time-series validation if the rows of df are in chronological order -
# confirm.
tscv = TimeSeriesSplit(n_splits=n_splits)

# Per-split mean squared errors
train_errors = []
val_errors = []

# Run the cross-validation
for train_index, val_index in tscv.split(features):
    X_train, X_val = features.iloc[train_index], features.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # Fit on the split's training part
    model.fit(X_train, y_train)

    # Predictions on both parts
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    # Errors on both parts
    train_error = mean_squared_error(y_train, train_predictions)
    val_error = mean_squared_error(y_val, val_predictions)

    train_errors.append(train_error)
    val_errors.append(val_error)

# Average the per-split errors
mean_train_error = np.mean(train_errors)
mean_val_error = np.mean(val_errors)

print("Mean Train Error:", mean_train_error)
print("Mean Validation Error:", mean_val_error)
