### Import Required Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split


### Set Params (execute only once when performing model re-trainings; for other scenarios, re-run this cell to reset the lists to empty)


In [2]:
# Accumulators for the samples gathered by each collection strategy. Later
# cells append dicts of the form {"X": ..., "y": ...} to these lists.
explore_exploit_data = []
random_data = []
fixed_interval_data = []
# Set to True when model retraining is performed; the training and evaluation
# cells branch on this flag to include the samples stored in the lists above.
sampling_flag = False

### Model Training (start after the first week of data collection)

In [None]:
# Global variables
num_samples = 2000  # Number of rows in the GP training window
num_users = 10
file_path = r'D:\AQ\archive\CitieSHealth_BCN_DATA_PanelStudy_20220414.csv'
train_start_idx = 0  # First row of the GP training window
N = 42 # Samples per user
exploit_thres = 0.3  # Fraction of N used as exploitation budget: int(exploit_thres * N)


# Load and filter CSV data
def load_and_filter_csv(file_path):
    """Read the CSV at *file_path* and return the cleaned modelling columns.

    Only the four feature columns and the 'bienestar' target are kept, rows
    with a missing value in any of them are dropped, and 'bienestar' is
    multiplied by 10 and converted to int.
    """
    wanted = ['pm25bcn', 'tmean_24h', 'humi_24h', 'pressure_24h', 'bienestar']
    frame = pd.read_csv(file_path)[wanted].dropna()
    scaled_target = frame['bienestar'] * 10
    frame['bienestar'] = scaled_target.astype(int)
    return frame

# Preprocess the data
def preprocess_data(df):
    """Split *df* into standardized features and the target vector.

    Returns a tuple (X_scaled, y, scaler): the four feature columns after
    StandardScaler.fit_transform, the 'bienestar' values (mood score), and the
    fitted scaler.
    """
    feature_columns = ['pm25bcn', 'tmean_24h', 'humi_24h', 'pressure_24h']
    features = df[feature_columns].values
    target = df['bienestar'].values

    # The scaler is fitted on every row of df that is passed in
    scaler = StandardScaler()
    return scaler.fit_transform(features), target, scaler
    
def train_gaussian_process(X_train, y_train):
    """Fit a Gaussian-process regressor and return it with its training predictions.

    The training window is rows ``train_start_idx`` up to
    ``train_start_idx + num_samples`` of the arrays passed in (previously the
    arguments were ignored and the globals ``X_scaled`` / ``y`` were read
    instead). When the module-level ``sampling_flag`` is set, the samples
    stored in ``explore_exploit_data`` are placed in front of that window so
    that re-trainings also see previously collected data.

    Args:
        X_train: feature matrix, one row per sample.
        y_train: target values aligned with ``X_train``.

    Returns:
        Tuple ``(gpr, y_pred, sigma)``: the fitted regressor, and the
        predicted mean and predictive standard deviation for the rows the
        model was fitted on.
    """
    window = slice(train_start_idx, train_start_idx + num_samples)
    X_fit = X_train[window]
    y_fit = y_train[window]

    if sampling_flag:
        # Re-training: prepend the samples collected so far. An empty list is
        # skipped because a zero-length 1-D array cannot be concatenated with
        # the 2-D feature matrix.
        if explore_exploit_data:
            past_X = np.array([item["X"] for item in explore_exploit_data])
            past_y = np.array([item["y"] for item in explore_exploit_data])
            X_fit = np.concatenate((past_X, X_fit), axis=0)
            y_fit = np.concatenate((past_y, y_fit), axis=0)
    else:
        # Other scenarios (e.g. changing the number of samples or the exploit
        # threshold): train on the window only. Diagnostic output kept as is.
        print("False")

    # Kernel: scaled RBF + scaled Matern(nu=1.5) + white-noise term
    kernel = (C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) +
              C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, nu=1.5, length_scale_bounds=(1e-2, 1e2)) +
              WhiteKernel(noise_level=1e-1, noise_level_bounds=(1e-5, 1e1)))
    gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-10, n_restarts_optimizer=10, normalize_y=True)

    # Fit the model, then predict on the same rows to obtain their uncertainty
    gpr.fit(X_fit, y_fit)
    y_pred, sigma = gpr.predict(X_fit, return_std=True)
    return gpr, y_pred, sigma

# Function to select exploitation samples based on uncertainty threshold
def select_exploitation_samples(sigma, exploit_thres, N):
    """Derive the exploitation budget and the uncertainty threshold.

    The budget is ``int(exploit_thres * N)``. The threshold is the budget-th
    largest value in ``sigma``; every sample whose uncertainty is at least
    that threshold is selected.

    Args:
        sigma: numpy array of predictive standard deviations.
        exploit_thres: fraction of ``N`` to spend on exploitation.
        N: number of samples per user.

    Returns:
        Tuple ``(exploitation_count, exploitation_samples,
        exploitation_threshold)``: the budget, the indices into ``sigma`` of
        the selected samples, and the threshold value.

        With a budget of zero (or an empty ``sigma``) nothing is selected and
        the threshold is ``inf``, so no later ``sigma >= threshold`` check can
        pass. A budget larger than ``len(sigma)`` uses the smallest
        uncertainty as threshold, i.e. selects every sample.
    """
    print(len(sigma))
    # Calculate the number of samples to exploit based on theta and N
    exploitation_count = int(exploit_thres * N)

    # Guard: index `exploitation_count - 1` would wrap to the smallest sigma
    # for a zero budget and wrongly select every sample.
    if exploitation_count <= 0 or len(sigma) == 0:
        return exploitation_count, np.array([], dtype=np.intp), np.inf

    # Sort the uncertainty (sigma) values in descending order
    sorted_sigma = np.sort(sigma)[::-1]
    # Clamp the rank so a budget above the number of samples cannot overflow
    rank = min(exploitation_count, len(sorted_sigma))
    exploitation_threshold = sorted_sigma[rank - 1]

    # Identify the samples that have uncertainty at or above the threshold
    exploitation_samples = np.where(sigma >= exploitation_threshold)[0]

    return exploitation_count, exploitation_samples, exploitation_threshold
    
# Main execution
# Load the CSV, scale the features, fit the Gaussian process and derive the
# exploitation budget and uncertainty threshold. The names defined here
# (X_scaled, y, gpr, exploitation_count, exploitation_threshold, ...) are
# read by later cells.
# NOTE(review): the scaler is fitted on every row of the file, including rows
# that later cells treat as live data -- confirm this is intended.
data = load_and_filter_csv(file_path)
X_scaled, y, scaler = preprocess_data(data)
gpr, y_pred, sigma =  train_gaussian_process(X_scaled, y)
exploitation_count, exploitation_samples, exploitation_threshold = select_exploitation_samples(sigma, exploit_thres, N)

# Print the selected exploitation samples and threshold
print("Exploitation Samples Indices:", exploitation_samples)
print("Exploitation Threshold:", exploitation_threshold)


False


### Consider this as real-time data for num_users (10)

In [None]:
# Number of simulated users and how many consecutive rows each one receives
num_users = 10
num_records_per_user = 100

# Row range owned by each user; the first user starts at row 2000 and the
# ranges follow each other without gaps.
user_slices = {
    f'User {k + 1}': slice(2000 + k * num_records_per_user,
                           2000 + (k + 1) * num_records_per_user)
    for k in range(num_users)
}

# Per-user store: 'User k' -> {'X': feature rows, 'y': matching targets}
Data_For_Sampling = {
    user: {'X': X_scaled[rows], 'y': y[rows]}
    for user, rows in user_slices.items()
}

print(len(Data_For_Sampling))


### Logic that collects samples from the real-time data by exploration or exploitation. 'max_time_gap' sets the minimum gap required between samples. (The current value is set for testing purposes.)

In [None]:


# Per-user bookkeeping: number of samples taken in each mode and the time of
# the most recent sample of each kind.
exploite_explore_counts = {
    f'User {i+1}': {'exploitation_count': 0, 'exploration_count': 0, 'last_exploitation': None, 'last_exploration': None}
    for i in range(len(Data_For_Sampling))
}

# True target values of the collected samples. They are taken from the
# available dataset at collection time; in real time they would arrive later.
true_y = []

# Minimum interval (seconds) between consecutive messages for each user
max_time_gap = 0.01
# Stop once this many samples are stored (adjust to the sample requirement)
max_collected_samples = 250
# Per-user exploration budget: whatever is left of N after exploitation
exploration_budget = N - exploitation_count

num_records = len(next(iter(Data_For_Sampling.values()))['X'])
print(num_records)
for record_idx in range(num_records):  # Loop through all records for each user
    if len(explore_exploit_data) >= max_collected_samples:
        # Target reached: leave the outer loop too instead of iterating over
        # the remaining records without doing any work.
        break
    current_time = datetime.now()  # One timestamp shared by all users for this record
    for user, data in Data_For_Sampling.items():
        if len(explore_exploit_data) >= max_collected_samples:
            break
        record_X = data['X'][record_idx]  # Get the X data for this record
        counts = exploite_explore_counts[user]

        # Times of this user's most recent samples
        last_exploitation = counts['last_exploitation']
        last_exploration = counts['last_exploration']

        # Predictive uncertainty of the GP for this record. A separate name is
        # used so the training-time `sigma` array is not overwritten.
        _, record_sigma = gpr.predict([record_X], return_std=True)
        true_value = int(data['y'][record_idx])

        # A user may be sampled only if both of their previous samples are at
        # least max_time_gap seconds old.
        gap_elapsed = (
            (last_exploitation is None
             or (current_time - last_exploitation).total_seconds() >= max_time_gap)
            and (last_exploration is None
                 or (current_time - last_exploration).total_seconds() >= max_time_gap)
        )

        if gap_elapsed:
            # Strict '<' keeps each user within the budgets (exploitation_count
            # exploitation + N - exploitation_count exploration = N samples);
            # '<=' allowed one extra sample of each kind.
            if counts['exploitation_count'] < exploitation_count and record_sigma[0] >= exploitation_threshold:
                # Store true values
                true_y.append(true_value)
                print(f"Live data {record_idx+1}: {user} - Sigma: {record_sigma}")

                # Update exploitation count and time
                counts['exploitation_count'] += 1
                counts['last_exploitation'] = current_time

                explore_exploit_data.append({
                    "X": record_X,
                    "y": true_value
                })
            elif counts['exploration_count'] < exploration_budget:
                # Store true values
                true_y.append(true_value)
                counts['exploration_count'] += 1
                counts['last_exploration'] = current_time
                print("Explore")
                explore_exploit_data.append({
                    "X": record_X,
                    "y": true_value,
                })
        else:
            print("Do nothing")
        time.sleep(0.1)  # Pause between users

### Testing the accuracy (Mean-Absolute Error) for the collected samples (Explore-Exploit)

In [None]:

# Build the training arrays from the samples collected by the explore/exploit loop
X_train = np.array([data['X'] for data in explore_exploit_data])
y_train = np.array([data['y'] for data in explore_exploit_data])

# Initialize the MLP Regressor with the requested configuration
model = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict the values
# NOTE(review): predictions are made on the same samples the model was fitted
# on, so the MAE below is a training error, not a held-out error.
y_pred = model.predict(X_train)

# Calculate MAE (Mean Absolute Error)
mae = mean_absolute_error(y_train, y_pred)

print(f"Mean Absolute Error: {mae}")

### The model is trained on the initial samples, and the live samples are collected either randomly or at a fixed interval (i.e. SMS sent randomly or at fixed intervals)

In [None]:


def load_and_filter_csv(file_path):
    """Load the study CSV and keep only complete rows of the modelling columns.

    'bienestar' is multiplied by 10 and converted to int before the frame is
    returned.
    """
    feature_and_target = ['pm25bcn', 'tmean_24h', 'humi_24h', 'pressure_24h', 'bienestar']
    raw = pd.read_csv(file_path)
    cleaned = raw[feature_and_target].dropna()  # rows with any missing value are discarded
    tenfold = cleaned['bienestar'] * 10
    cleaned['bienestar'] = tenfold.astype(int)
    return cleaned

# Preprocess the data
def preprocess_data(df):
    """Return (X_scaled, y) for *df*.

    X_scaled holds the four feature columns after StandardScaler.fit_transform;
    y holds the 'bienestar' target values. This variant does not return the
    fitted scaler.
    """
    inputs = df[['pm25bcn', 'tmean_24h', 'humi_24h', 'pressure_24h']].values
    target = df['bienestar'].values
    standardized = StandardScaler().fit_transform(inputs)
    return standardized, target

# Train the model
def train_model(X_train, y_train):
    """Fit an MLP regressor on the given data and return the fitted model."""
    # Two hidden layers of 64 units, at most 500 iterations, fixed seed
    settings = {'hidden_layer_sizes': (64, 64), 'max_iter': 500, 'random_state': 42}
    regressor = MLPRegressor(**settings)
    regressor.fit(X_train, y_train)
    return regressor

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Score *model* on (X_test, y_test) and return (rmse, mae, r2)."""
    predictions = model.predict(X_test)
    return (
        np.sqrt(mean_squared_error(y_test, predictions)),  # RMSE
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

file_path = r'D:\AQ\archive\CitieSHealth_BCN_DATA_PanelStudy_20220414.csv'
data = load_and_filter_csv(file_path)

if sampling_flag:
    # Retraining round: combine the samples stored in random_data /
    # fixed_interval_data with rows of the dataset. X_scaled and y are not
    # recomputed in this branch; they must already exist from an earlier cell.

    # Use sampled data for training and evaluation separately
    train_data_random = np.array([entry["X"] for entry in random_data])
    train_labels_random = np.array([entry["y"] for entry in random_data])

    train_data_fixed = np.array([entry["X"] for entry in fixed_interval_data])
    train_labels_fixed = np.array([entry["y"] for entry in fixed_interval_data])

    # NOTE(review): the draw is unseeded, and the model below is evaluated on
    # rows that are also part of its training set -- confirm this is intended.
    random_indices = np.random.choice(range(2000, 3000), size=1000, replace=False)
    X_train_random, y_train_random = X_scaled[random_indices], y[random_indices]

    # Train and evaluate for random selection
    X_combined = np.concatenate([X_train_random, train_data_random], axis=0)
    y_combined = np.concatenate([y_train_random, train_labels_random], axis=0)
    model_random = train_model(X_combined, y_combined)
    rmse_random, mae_random, r2_random = evaluate_model(model_random, X_train_random, y_train_random)
    print(f"Random Data Evaluation:\nRMSE: {rmse_random}, MAE: {mae_random}, R²: {r2_random}")

    # NOTE(review): these indices cover rows 0-999 although the random variant
    # draws from rows 2000-2999 (an earlier version used
    # np.arange(2000, 3000, 2)) -- confirm which range is intended.
    interval_indices = np.linspace(0, 1000 - 1, 1000, dtype=int)
    X_train_interval, y_train_interval = X_scaled[interval_indices], y[interval_indices]

    # Train and evaluate for fixed interval selection
    model_interval = train_model(np.concatenate([X_train_interval, train_data_fixed], axis=0),
                                 np.concatenate([y_train_interval, train_labels_fixed], axis=0))
    rmse_interval, mae_interval, r2_interval = evaluate_model(model_interval, X_train_interval, y_train_interval)
    # Fixed: the MAE field previously printed rmse_interval
    print(f"Fixed Interval Data Evaluation:\nRMSE: {rmse_interval}, MAE: {mae_interval}, R²: {r2_interval}")

else:
    # Initial phase: train on the first 2000 rows and store the selected
    # samples for later retraining rounds.
    X_scaled, y = preprocess_data(data)

    # Step 1: Train with the first 2000 records
    train_data = X_scaled[:2000]
    train_labels = y[:2000]

    # Step 2: Select random samples from rows 2000-2999 for testing (unseeded draw)
    random_indices = np.random.choice(range(2000, 3000), size=250, replace=False)
    print(len(random_indices))
    X_train_random, y_train_random = X_scaled[random_indices], y[random_indices]
    random_data.extend(
        {"X": features, "y": label}
        for features, label in zip(X_train_random, y_train_random)
    )

    # Step 3: Select samples with a fixed interval for testing
    # NOTE(review): these indices cover rows 0-999, which are part of the
    # training rows above (an earlier version used rows 2000-2999) -- confirm
    # which range is intended.
    interval_indices = np.linspace(0, 1000 - 1, 250, dtype=int)

    print(len(interval_indices))
    X_train_interval, y_train_interval = X_scaled[interval_indices], y[interval_indices]
    fixed_interval_data.extend(
        {"X": features, "y": label}
        for features, label in zip(X_train_interval, y_train_interval)
    )

    # Train the model and evaluate for random selection
    model_random = train_model(train_data, train_labels)
    rmse_random, mae_random, r2_random = evaluate_model(model_random, X_train_random, y_train_random)
    print(f"Random Split Evaluation:\nRMSE: {rmse_random}, MAE: {mae_random}, R²: {r2_random}")

    # Train the model and evaluate for fixed interval selection
    model_interval = train_model(train_data, train_labels)
    rmse_interval, mae_interval, r2_interval = evaluate_model(model_interval, X_train_interval, y_train_interval)
    # Fixed: the MAE field previously printed rmse_interval
    print(f"Fixed Interval Split Evaluation:\nRMSE: {rmse_interval}, MAE: {mae_interval}, R²: {r2_interval}")

In [None]:
# Set True when re-training is required: cells that branch on sampling_flag
# then also use the samples stored in explore_exploit_data, random_data and
# fixed_interval_data.
sampling_flag = True

### Testing the MAE by setting different 'exploit_thres' param in Model Training

In [None]:

# Data for the graph: one MAE value per explore/exploit ratio
x = ['10%' , '30%', '50%', '70%', '90%']
accuracies = {
    'E&E': [12.76, 11.38, 12.98, 12.62, 12.35]
}

# Plotting the data
plt.figure(figsize=(4, 4))
for values in accuracies.values():
    plt.plot(x, values, marker='o')

# Adding titles and labels
plt.title("Explore-Exploit Distribution", fontsize=16)
plt.xlabel("Explore/Exploit Ratios", fontsize=12)
plt.ylabel("MAE Values", fontsize=12)
plt.ylim(11.25, 13.00)
plt.tick_params(axis='both', labelsize=11)

plt.grid(True)

# Apply the layout adjustment BEFORE saving; previously it ran after savefig,
# so the PDF was written without it.
plt.tight_layout()

# Save the plot as PDF
plt.savefig(r'D:\AQ\E&E_Distribution.pdf', format='pdf')

# Display the plot
plt.show()


### Testing the MAE for Model retrainings across all Algorithms

In [None]:
# Data for the graph: one MAE value per retraining round and algorithm
algorithms = ['E&E', 'Random', 'Fixed Interval']
x = ['Initial Phase', 'Round 1', 'Round 2', 'Round 3']
# NOTE: despite the name, the plot labels these values as MAE
rmse_values = {
    'E&E': [12.13, 11.38, 11.13, 9.94],
    'Random': [13.01, 12.51, 11.83, 10.96],
    'Fixed': [16.02, 15.37, 14.27, 12.43]
}

# Plotting the data: one line per algorithm
plt.figure(figsize=(4, 4))
for algo, values in rmse_values.items():
    plt.plot(x, values, marker='o', label=algo)

# Adding titles and labels
plt.title("MAE Comparison Across Algorithms")
plt.xlabel("Model Retrain Rounds")
plt.ylabel("MAE Values")
plt.ylim(8, 18)

plt.legend(title="Algorithms")
plt.grid(True)

# Apply the layout adjustment BEFORE saving; previously it ran after savefig,
# so the PDF was written without it.
plt.tight_layout()

plt.savefig(r'D:\AQ\MAE_ChangesOverTime.pdf', format='pdf')

# Display the plot
plt.show()


### Testing the MAE by changing count in 'explore_exploit_data', 'random_data' and 'fixed_interval_data'

In [None]:
# Data for the graph: one MAE value per initial sample count and algorithm
algorithms = ['E&E', 'Random', 'Fixed Interval']
x = ['250', '500', '750', '2000']
# NOTE: despite the name, the plot labels these values as MAE
rmse_values = {
    'E&E': [12.13, 11.82, 11.75, 11.44],
    'Random': [13.04, 12.81, 12.51, 11.44],
    'Fixed': [16.02, 16.02, 15.53, 11.44]
}

# Plotting the data: one line per algorithm
plt.figure(figsize=(4, 4))
for algo, values in rmse_values.items():
    plt.plot(x, values, marker='o', label=algo)

# Adding titles and labels
plt.title("MAE Comparison Across Algorithms")
plt.xlabel("Initial Number of Samples (w/o Model Retraining)")
plt.ylabel("MAE Values")
plt.ylim(10, 18)

plt.legend(title="Algorithms")
plt.grid(True)

# Apply the layout adjustment BEFORE saving; previously it ran after savefig,
# so the PDF was written without it.
plt.tight_layout()

plt.savefig(r'D:\AQ\MAE_SampleSizes.pdf', format='pdf')

# Display the plot
plt.show()


In [None]:
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Load and filter CSV data
def load_and_filter_csv(file_path):
    """Return the modelling columns of the CSV at *file_path*.

    Rows with a missing value in any kept column are removed, and the
    'bienestar' column is multiplied by 10 and converted to int.
    """
    keep = ['pm25bcn', 'tmean_24h', 'humi_24h', 'pressure_24h', 'bienestar']
    table = pd.read_csv(file_path)
    table = table[keep]
    table = table.dropna()
    table['bienestar'] = (10 * table['bienestar']).astype(int)
    return table

# Preprocess the data
def preprocess_data(df):
    """Return (X_scaled, y, scaler) for *df*.

    The four feature columns are rescaled with MinMaxScaler.fit_transform
    (min-max scaling, not standardization); y holds the 'bienestar' values
    (mood score) and scaler is the fitted MinMaxScaler.
    """
    feature_names = ['pm25bcn', 'tmean_24h', 'humi_24h', 'pressure_24h']
    raw_features = df[feature_names].values
    mood = df['bienestar'].values

    scaler = MinMaxScaler()
    rescaled = scaler.fit_transform(raw_features)
    return rescaled, mood, scaler

# Train and test the MLPRegressor
def train_and_evaluate_model(X_train, y_train, X_test, y_test):
    """Fit an MLP regressor on the training data and score it on the test data.

    Returns a tuple (model, mae): the fitted MLPRegressor and its mean
    absolute error on (X_test, y_test).
    """
    # Two hidden layers of 64 units, at most 500 iterations, fixed seed
    regressor = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)
    regressor.fit(X_train, y_train)

    # Score the held-out predictions
    test_error = mean_absolute_error(y_test, regressor.predict(X_test))
    return regressor, test_error

# Main execution: baseline MLP trained on the first 2000 rows and scored on
# the following 1000 rows.
file_path = r'D:\AQ\archive\CitieSHealth_BCN_DATA_PanelStudy_20220414.csv'
df = load_and_filter_csv(file_path)

# Split the data
# NOTE(review): the scaler inside preprocess_data is fitted on all rows before
# the split below, so test rows influence the scaling -- confirm this is intended.
X, y, scaler = preprocess_data(df)
X_train, y_train = X[:2000], y[:2000]  # First 2000 records for training
X_test, y_test = X[2000:3000], y[2000:3000]  # Records 2000-2999 for testing

# Train and evaluate the model
# NOTE(review): mae is computed here but not printed or otherwise displayed.
model, mae = train_and_evaluate_model(X_train, y_train, X_test, y_test)

