In [6]:
import os
import json
import warnings
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from source.general_functions import (
    prepare_for_dump,
    extend_results_dict,
    create_dir_if_not_exists,
)
from source.experiment_functions import ( 
    data2train_f,
    get_res_table,
    plot_mean_conf,
    data2model_params,
    prepare_train_funct_fl,
    get_stats_several_trials,
)

# Suppress all warnings for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

### DO NOT CHANGE ###
# Shared experiment settings used by the training cells below.
# N_TRIALS and TEST_SIZE are forwarded to get_stats_several_trials
# (presumably the number of repeated runs and the held-out fraction — confirm
# in source.experiment_functions).
N_TRIALS = 10
TEST_SIZE = 0.2
# Metric callable handed to get_stats_several_trials; the summary table labels it 'MSE'.
METRIC_F = mean_squared_error
# Passed whole to data2model_params and as growing prefixes P_SCALE_LIST[:nps]
# to data2train_f; the prefix length is what gets recorded as `n_features`.
P_SCALE_LIST = [10, 2, 128, 25, 64, 600, 2000, 1024]

# Directory holding the datasets; each one is read from DATA_DIR / '<name>.csv'.
DATA_DIR = Path('./data')
### DO NOT CHANGE ###

### Prepare Airline Data:

You have to download the Airline dataset. Check out the following resources:
- J. Hensman, N. Fusi, and N. D. Lawrence. Gaussian processes for Big data. In Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence, UAI’13, pages 282–290, Arlington, Virginia, USA, Aug. 2013. AUAI Press.
- J. Hensman, N. Durrande, and A. Solin. Variational Fourier features for Gaussian processes. The Journal of Machine Learning Research, 18(1):5537–5588, Jan. 2017. ISSN 1532-4435.

Once you have obtained the dataset, fill in the path to the raw Airline file in the next cell and run it. This is a preprocessing step.

Note: If you skip this step, you will not get results for the Airline dataset!

In [None]:
from load_data import load_airline

# Path to the raw Airline file. Leave it empty to skip the optional Airline
# preprocessing; the Airline results are then simply not produced.
path_raw_airline = '' # Fill in the path to the raw Airline data 

# Only call the loader when a path was actually supplied, so that
# "Restart & Run All" does not invoke it on the unfilled placeholder.
if path_raw_airline:
    load_airline(path_raw_airline)
else:
    print('No raw Airline path given: skipping Airline preprocessing.')

### Different Optimization Problems (Regularization for $\lambda$):

In [7]:
### DO NOT CHANGE ###
# Datasets covered by the regularization study; each is read from
# DATA_DIR / '<name>.csv' by the training cell.
DATA_NAMES = ['energy', 'yacht', 'concrete', 'airfoil', 'wine']
# JSON file the regularization study writes its results to (TRAIN = True)
# or reads them back from (TRAIN = False).
# NOTE(review): create_dir_if_not_exists must return a Path-like object since
# it is combined with `/`; presumably it also creates the directory — confirm
# in source.general_functions.
RES_PATH = (
    create_dir_if_not_exists(f'./artifacts/Regularization_Study') 
    / 'results.json'
)
### DO NOT CHANGE ###

In [None]:
### Instructions: ###
# Set TRAIN = False to reuse the results saved by an earlier run
# instead of training everything again.
TRAIN = True

# All (reg_type, l_pos) combinations together with the key each one is stored
# under ('<reg_type>_pos' when l_pos is True, plain '<reg_type>' otherwise),
# listed in evaluation order.
reg_modes = [
    (reg_type, l_pos, (reg_type + '_pos') if l_pos else reg_type)
    for reg_type in ['l1', 'l2', 'fixed_norm']
    for l_pos in [True, False]
]

if not TRAIN:
    # Reload the results written by a previous training run.
    with open(RES_PATH, 'r') as f:
        final_res = json.load(f)
else:
    # final_res[data_name][mode] -> output of get_stats_several_trials
    final_res = defaultdict(dict)
    for data_name in DATA_NAMES:
        print(data_name)
        DATA_PATH = DATA_DIR / f'{data_name}.csv'
        for reg_type, l_pos, mode in reg_modes:
            model_params = data2model_params(
                data_name, 'FL_Model', P_SCALE_LIST, reg_type, l_pos
            )
            train_model_f = prepare_train_funct_fl(model_params)
            final_res[data_name][mode] = get_stats_several_trials(
                DATA_PATH, train_model_f, METRIC_F, N_TRIALS, TEST_SIZE
            )
    # Everything is written once, after all datasets have finished.
    with open(RES_PATH, 'w') as outfile:
        json.dump(final_res, outfile)

# Long-format summary: one row per (dataset, regularization mode) holding the
# mean of the stored 'metric' and 'train_time' entries.
dict_final_res = defaultdict(list)
for data_name in DATA_NAMES:
    for reg_type, l_pos, mode in reg_modes:
        mode_stats = final_res[data_name][mode]
        dict_final_res['data'].append(data_name)
        dict_final_res['Regularization'].append(mode)
        dict_final_res['MSE'].append(np.mean(mode_stats['metric']))
        dict_final_res['Time'].append(np.mean(mode_stats['train_time']))

# Wide table: one row per regularization mode, one column group per dataset.
summary_df = pd.DataFrame(dict_final_res)
display(
    summary_df.pivot(columns=['data'], index=['Regularization']).reset_index()
)

### FL Model Compared to Cross-Validation:

#### Dynamical Behavior:

In [9]:
### DO NOT CHANGE ###
DATA_NAMES = ['energy', 'yacht', 'concrete', 'airfoil', 'wine']

# Airline is optional. Include it only when the exact file the training cell
# reads (DATA_DIR / 'airline.csv') exists. A substring test over the directory
# listing would also accept names such as 'airline_raw.csv' and then fail
# later when the real file is opened.
if (DATA_DIR / 'airline.csv').is_file():
    DATA_NAMES.append('airline')
else:
    print('There will be no Airline data results!')
### DO NOT CHANGE ###

In [None]:
### Instructions: ###
# Keep TRAIN = True unless results for every dataset are already saved.
TRAIN = True

if TRAIN:
    for data_name in DATA_NAMES:
        print(data_name)
        DATA_PATH = DATA_DIR / f'{data_name}.csv'
        RES_PATH = create_dir_if_not_exists(f'./artifacts/FLvsCV/{data_name}') / 'results.json'

        # Results of both models are accumulated in one dict of lists.
        res = defaultdict(list)
        for model_name in ['FL_Model', 'CV_Model']:
            # Each run is (p_scale_list, n_trials, test_size).
            if data_name == 'airline':
                # Airline: a single run with its own scale list, trial count and test size.
                runs = [([10, 2, 128, 25, 64, 1024], 5, 0.3)]
            else:
                # Other datasets: one run per growing prefix of P_SCALE_LIST.
                runs = [
                    (P_SCALE_LIST[:nps], N_TRIALS, TEST_SIZE)
                    for nps in range(1, len(P_SCALE_LIST) + 1)
                ]

            for p_scale_list, n_trials, test_size in runs:
                train_model_f = data2train_f(data_name, model_name, p_scale_list)
                _res = get_stats_several_trials(
                    DATA_PATH, train_model_f, METRIC_F, n_trials, test_size
                )
                # Tag every trial of this run with the number of scales used.
                extend_results_dict(
                    res, n_features=[len(p_scale_list)] * n_trials, **_res
                )

        # One results file per dataset, written after both models have run.
        res = prepare_for_dump(res)
        with open(RES_PATH, 'w') as outfile:
            json.dump(res, outfile)

#### Dynamical Behavior of Models:

In [None]:
### Instructions: ###
# Pick the dataset whose saved results should be plotted:
# DATA_NAME = 'yacht' / 'airfoil' / 'energy' / 'concrete' / 'wine'
DATA_NAME = 'yacht'

# Load the per-trial results saved by the FL vs CV training cell.
RES_DIR = create_dir_if_not_exists(f'./artifacts/FLvsCV')
RES_PATH = RES_DIR / f'{DATA_NAME}/results.json'
with open(RES_PATH, 'r') as results_file:
    df_res = pd.DataFrame(json.load(results_file))

show_plot = True
y_scale_mse = 'linear'
_title = f'({DATA_NAME.capitalize()} dataset)'
mse_label = f'{y_scale_mse}(MSE)' if y_scale_mse == 'log' else 'MSE'

# One entry per figure: which column to plot and how to label/scale its y axis.
params_list = [
    {
        'y_col': 'train_time',
        'y_label': 'Train Time (sec.)',
        'title': f'Training Time vs #Features {_title}',
        'y_scale': 'linear',
    },
    {
        'y_col': 'metric',
        'y_label': mse_label,
        'title': f'Prediction quality vs #Features {_title}',
        'y_scale': y_scale_mse,
    },
]

x_col = 'n_features'
for params in params_list:
    y_col = params['y_col']
    y_label = params['y_label']
    y_scale = params['y_scale']
    # `title` is looked up but not forwarded: the plot_mean_conf call below
    # passes an empty string literal instead.
    # NOTE(review): presumably '' fills the title argument — confirm against
    # the plot_mean_conf signature.
    title = params['title']

    # Mean and standard deviation of y_col per (n_features, model) pair; the
    # mean column takes the name of y_col and the '_Model' suffix is stripped
    # from the model names.
    stats = df_res.groupby([x_col, 'model'])[y_col].agg(['mean', 'std']).reset_index()
    grouped_df = stats.rename(columns={'mean': y_col}).assign(
        model=lambda frame: frame['model'].str.removesuffix('_Model')
    )

    save_path = RES_DIR / f'{DATA_NAME}/{y_col}.png'
    x_values = grouped_df[x_col].unique()

    plot_mean_conf(
        grouped_df, x_col, y_col, 'std', 'model', '#Features (P)', y_label,
        '', save_path, show_plot, y_scale, x_values,
    )

#### Table: Comparison of FL Model and CV for Small/Large-Scale Data: 

In [None]:
# Build the FL vs CV comparison table from the saved results and render it.
res_table = get_res_table(DATA_NAMES, len(P_SCALE_LIST), DATA_DIR)
display(pd.DataFrame(res_table))