# Sample LASSO task functions

In [None]:
import os, sys
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# set path for jupyter notebook
if os.path.exists(os.path.abspath('..')):
    sys.path.append(os.path.abspath('..'))

from src.feat_build import main
from src.model_build import train

In [None]:
# Working directory of the notebook and the project root one level above it.
inv_dir = Path.cwd()
proj_dir = inv_dir.parent

# Inputs handed to the feature builder: the sample GUID parquet file and the
# list of source directories to build features from.
sample_guids_parquet = 'sample_guid_10000_china_us.parquet'
directories = ["frgnd_backgrnd_apps_v4_hist", "web_cat_usage_v2","power_acdc_usage_v4_hist","os_c_state", "hw_pack_run_avg_pwr"]

# Only (re)generate features when the cached parquet is missing.
# The path is tested directly rather than listing `out/`, so a checkout where
# `out/` does not exist yet triggers generation instead of a FileNotFoundError.
feat_path = inv_dir / 'out' / 'feat.parquet'
if not feat_path.exists():
    main.generate_features(sample_guids_parquet, inv_dir, directories)
else:
    print('Features already generated')

In [None]:
# Generates Synthetic Data
# main.generate_synthetic_data(proj_dir / 'dummy_data')
# syn_feat = pd.read_parquet(proj_dir / "dummy_data" / "synthetic_data.parquet")
# syn_feat.shape

Read in Data

In [None]:
# Load the generated feature table (path is relative to the working directory)
# and show its (rows, columns) shape as the cell output.
feat_parquet = os.path.join('out', 'feat.parquet')
feat = pd.read_parquet(feat_parquet)
feat.shape

In [None]:
# feat['cpu_suffix_Core-U'].value_counts()

# We are only interested in U series CPUs: keep the rows whose
# 'cpu_suffix_Core-U' indicator equals 1.
is_u_series = feat['cpu_suffix_Core-U'] == 1
feat = feat[is_u_series]

In [None]:
# Feature subset used by every model below.
# Selected with sklearn lasso (alpha: 0.01) on research data for testing.
best_feats = [
    # software categories
    'sw_category_Development & Programming (IDEs, Text Editors, Version Control)',
    'sw_category_Gaming (Casual, Online & Offline)',
    'sw_category_Multimedia Editing (Audio & Video)',
    'sw_category_Other',
    'sw_category_Simulation & Virtual Reality',
    'sw_category_System & Utilities',
    'sw_category_Web Browsers & Communication',
    # software events and temperature
    'sw_event_name_DC_DISPLAY_OFF',
    'sw_event_name_DC_DISPLAY_ON',
    'temp_avg',
    # web parent categories
    'web_parent_category_content creation',
    'web_parent_category_education',
    'web_parent_category_entertainment',
    'web_parent_category_games',
    'web_parent_category_news',
    'web_parent_category_private',
    'web_parent_category_reference',
    # web sub categories
    'web_sub_category_communication',
    'web_sub_category_music / audio streaming',
    'web_sub_category_news',
    'web_sub_category_presentations',
    'web_sub_category_reference',
    'web_sub_category_spreadsheets',
    'web_sub_category_video games',
    'web_sub_category_word processing',
    # usage
    'cpu_norm_usage',
    'nrs',
    # country and vendor
    'countryname_normalized_China',
    'modelvendor_normalized_Apple',
    'modelvendor_normalized_Dell',
    'modelvendor_normalized_LG',
    'modelvendor_normalized_Lenovo',
    'modelvendor_normalized_Other',
    'modelvendor_normalized_Razer',
    'modelvendor_normalized_Timi',
    # operating system and graphics
    'os_Win10',
    'os_Win11',
    'graphicsmanuf_AMD',
    'graphicsmanuf_Intel',
    'graphicsmanuf_Nvidia',
    'graphicsmanuf_Other',
    # cpu
    'cpu_family_Core i3',
    'cpu_family_Core i5',
    'cpu_suffix_Core-H',
    'cpu_suffix_Core-U',
    # personas
    'persona_Casual User',
    'persona_Communication',
    'persona_Gamer',
    'persona_Office/Productivity',
    'persona_Win Store App User',
    # device and calendar
    'age_category',
    '#ofcores',
    'screensize_category',
    'day_of_week',
    'month_of_year',
]

In [None]:
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LassoCV
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_squared_error
# from sklearn.pipeline import make_pipeline

# # Assuming you have your data in X and y
# X, y = feat.drop("power_mean", axis=1), feat["power_mean"]

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create a pipeline with StandardScaler and LassoCV
# lasso_pipeline = make_pipeline(
#     #StandardScaler(),
#     LassoCV(cv=20, alphas = np.logspace(-2, 0, 50), random_state=42, max_iter=10000)
# )

# # Fit the model
# lasso_pipeline.fit(X_train, y_train)

# # Make predictions
# y_train_pred = lasso_pipeline.predict(X_train)
# y_test_pred = lasso_pipeline.predict(X_test)

# # Calculate MSE for training and test sets
# train_mse = mean_squared_error(y_train, y_train_pred)
# test_mse = mean_squared_error(y_test, y_test_pred)

# print(f"Training MSE: {train_mse:.4f}")
# print(f"Test MSE: {test_mse:.4f}")

# # Get the best alpha value
# best_alpha = lasso_pipeline.named_steps['lassocv'].alpha_
# print(f"Best alpha: {best_alpha:.4f}")

# # Get the number of features selected
# n_features = np.sum(lasso_pipeline.named_steps['lassocv'].coef_ != 0)
# print(f"Number of features selected: {n_features}")
# best_feats = lasso_pipeline.named_steps['lassocv'].coef_
# best_feats = X.columns[[i for i in range(len(best_feats)) if best_feats[i] != 0]]

Trivial Results

In [None]:
# Evaluate the trivial reference model on the selected features.
# `train.trivial` returns four values; the MSE, R^2 and similarity are reused
# by later cells as gray reference lines and in the max-RMSE scale.
trivial_mse, trivial_model, trivial_r2, trivial_sim = train.trivial(feat, best_feats)
# Show the three scalar metrics as the cell output.
trivial_mse, trivial_r2, trivial_sim

Run baseline non-private case with no regularization:

In [None]:
# Non-private baseline: the "lasso" model with l=0 (no regularization).
# The MSE, R^2 and similarity are reused by later cells as red reference lines
# and in the max-RMSE scale.
baseline_mse, baseline_feat_dict, baseline_r2, baseline_sim = train.train(feat, best_feats,"lasso", tol=1e-4, l=0)
# Show the three scalar metrics as the cell output.
baseline_mse, baseline_r2, baseline_sim

Compare multiple methods with varying regularization: $l$

In [None]:
# compare methods
# Builds a table with one row per regularization value and one column per
# model; each cell holds that model's test MSE (first value returned by
# `train.train`). The other returned values are overwritten on every call.
method_columns = ["coord_desc", "fw_lap", "fw_exp"]
results = pd.DataFrame(columns=method_columns)
for l in [0.25, 0.5, .9, 1, 5, 10, 25, 100]:
    # print("parameter: ", l)
    # The "lasso" model is given the reciprocal 1/l, the two "fw-*" models get l itself.
    test_mse1, feat_dict, r2, similarity = train.train(
        feat, best_feats, "lasso", tol=1e-4, l=1/l
    )
    test_mse2, feat_dict, r2, similarity = train.train(
        feat, best_feats, "fw-lasso-lap", tol=1e-4, l=l, max_iter=500
    )
    # Only the exponential variant is run with normalize=True.
    test_mse3, feat_dict, r2, similarity = train.train(
        feat, best_feats, "fw-lasso-exp", tol=1e-4, l=l, max_iter=500, normalize=True
    )
    results.loc[l] = [test_mse1, test_mse2, test_mse3]

In [None]:
# Test MSE per regularization value (index) and model (columns).
results

In [None]:
# One line per model against the regularization values in the index
# (log-scaled x axis); the y axis is clipped to 0-25 and the trailing ';'
# suppresses the Axes repr in the cell output.
results.plot(kind='line', logx=True, marker='.', ylim=(0, 25));

Test many epsilon values at regularization $l=5$

In [None]:
# Sweep epsilon for the two "fw-lasso-*" models at fixed l=5 and record each
# model's test MSE, one row per epsilon.
epss = [0.25, 0.5, 1, 5, 10, 100, 10_000]
epsresults = pd.DataFrame(columns=["fw_lap", "fw_exp"])
for eps in epss:
    print("parameter: ", eps)
    test_mse1, feat_dict, r2, similarity = train.train(
        feat, best_feats,"fw-lasso-lap", tol=1e-8, l=5, epsilon=eps, max_iter=500
    )
    test_mse2, feat_dict, r2, similarity = train.train(
        feat, best_feats, "fw-lasso-exp", tol=1e-8, l=5, epsilon=eps, max_iter=500
    )
    epsresults.loc[eps] = [test_mse1, test_mse2]

In [None]:
# Test MSE per epsilon (index) for the "fw_lap" and "fw_exp" columns.
epsresults

In [None]:
# Plot the first two columns (the table only has "fw_lap" and "fw_exp")
# against epsilon on a log-scaled x axis; ';' suppresses the Axes repr.
epsresults.iloc[:, 0:2].plot(kind='line', logx=True, marker='.');

Example of using sigmoid utility mapping on exponential implementation:

In [None]:
# Sweep epsilon for one model, collect test MSE / R^2 / similarity per run,
# then map the RMSEs to a utility score.
l = 10 # less regularization hurts the sensitivity (and therefore max iters as well)
tol = 0#1e-8
max_iter = 5500
epss = [0.01, 0.05, 0.1, 0.5, 1, 10, 100, None]
model = "fw-lasso-exp"
epsresults, eps_similarities, eps_r2 = [], [], []

for eps in epss:
    # One convergence figure per epsilon is requested under out/.
    convergence_png = inv_dir / 'out' / f'{model}_{eps}_convergence.png'
    test_mse, feat_dict, r2, similarity = train.train(
        feat, best_feats, model,
        normalize=False, clip_sd=None, tol=tol, l=l, epsilon=eps,
        max_iter=max_iter, plot=convergence_png, triv=10.12,
    )
    epsresults.append(test_mse)
    eps_r2.append(r2)
    eps_similarities.append(similarity)

rmses = np.sqrt(np.array(epsresults))
# Scale: largest RMSE among these runs, the trivial model and the baseline,
# buffered by 10% since models having 0 utility cannot be judged by rmse.
reference_rmses = np.array([trivial_mse**.5, baseline_mse**.5])
max_rmse = 1.1 * np.max(np.append(rmses, reference_rmses))
c = 5 # higher values punish rmse more
# 2 / (1 + exp(c * r)) equals 1 at r = 0 and decreases as the scaled RMSE grows.
utility = 2 / (1 + np.exp(c * rmses / max_rmse))
epsresults

In [None]:
# R^2 of each run, in the same order as `epss`.
eps_r2

In [None]:
# Similarity score of each run, in the same order as `epss`.
eps_similarities

## main plot

In [None]:
# Main figure: run the "compare-fw-plot" model once per epsilon, then adjust
# and save the current matplotlib figure.
# NOTE(review): presumably each call draws onto the current figure - confirm in train.train.
l = 10 # less regularization hurts the sensitivity (and therefore max iters as well)
tol = 1e-7
max_iter = 2500
epss = [0.01, 0.05, 0.1, 0.5, 1, 10, 100]
model = "compare-fw-plot"
epsresults, eps_similarities, eps_r2 = [], [], []

for eps in epss:
    per_eps_png = inv_dir / 'out' / f'main_lasso_{eps}.png'
    test_mse, feat_dict, r2, similarity = train.train(
        feat, best_feats, model,
        normalize=False, clip_sd=None, tol=tol, l=l, epsilon=eps,
        max_iter=max_iter, plot=per_eps_png, triv=None,
    )
    epsresults.append(test_mse)
    eps_r2.append(r2)
    eps_similarities.append(similarity)

# plt.legend(frameon=False, fontsize="small")
plt.ylim(0, 50)
# `eps` is the value left over from the final loop iteration, so this file is
# named after the last epsilon and is written to the working directory
# (not to out/ like the per-epsilon plots above).
plt.savefig(f'main_lasso_{eps}.png', dpi=300, facecolor='#EEEEEE', edgecolor='#EEEEEE', pad_inches=1)

rmses = np.sqrt(np.array(epsresults))
# Scale: largest RMSE among these runs, the trivial model and the baseline,
# buffered by 10% since models having 0 utility cannot be judged by rmse.
reference_rmses = np.array([trivial_mse**.5, baseline_mse**.5])
max_rmse = 1.1 * np.max(np.append(rmses, reference_rmses))
c = 5 # higher values punish rmse more
utility = 2 / (1 + np.exp(c * rmses / max_rmse))
epsresults

In [None]:
# Values left over from the final iteration of the preceding loop,
# i.e. the run for the last entry of `epss`.
test_mse, r2

# non-private

In [None]:
# Sweep the regularization value l for the "fw-lasso" model (no epsilon is
# passed). The result lists reuse the eps* names of the other sweeps but are
# indexed by `ls` here.
ls = [1e-10, 1e-5, 1e-1, 1, 1e1, 1e2, 1e5] # less regularization hurts the sensitivity (and therefore max iters as well)
tol = 1e-8
max_iter = 10_000
model = "fw-lasso"
epsresults, eps_similarities, eps_r2 = [], [], []

for l in ls:
    convergence_png = inv_dir / 'out' / f'{model}_l_{l}_convergence.png'
    test_mse, feat_dict, r2, similarity = train.train(
        feat, best_feats, model,
        normalize=False, clip_sd=None, tol=tol, l=l,
        max_iter=max_iter, plot=convergence_png,
    )
    epsresults.append(test_mse)
    eps_r2.append(r2)
    eps_similarities.append(similarity)

rmses = np.sqrt(np.array(epsresults))
# Scale: largest RMSE among these runs, the trivial model and the baseline,
# buffered by 10% since models having 0 utility cannot be judged by rmse.
reference_rmses = np.array([trivial_mse**.5, baseline_mse**.5])
max_rmse = 1.1 * np.max(np.append(rmses, reference_rmses))
c = 5 # higher values punish rmse more
utility = 2 / (1 + np.exp(c * rmses / max_rmse))
epsresults

normalize=False, no clipping 

In [None]:
# Sweep epsilon for "fw-lasso-exp" and collect test MSE / R^2 / similarity per
# run; the cells below plot these lists and export the utilities.
l = 1 # less regularization hurts the sensitivity (and therefore max iters as well)
tol = 1e-7
max_iter = 5_000
# The epsilon=None run (the non-private reference) must stay LAST: the cells
# below slice it off with `epss[:-1]` / `rmses[:-1]` and draw it as the
# reference line via `epsresults[-1]` / `rmses[-1]`. Listing None first would
# silently drop the last private epsilon from the plots and the CSV instead.
epss = [0.5, 10, None]#[0.01, 0.05, 0.1, 0.5, 1, 10, 100, None]
epsresults = []
eps_similarities = []
eps_r2 = []
model = "fw-lasso-exp"

for eps in epss:
    # One convergence figure per epsilon is requested under out/.
    test_mse, feat_dict, r2, similarity = train.train(feat, best_feats, model, normalize=False, tol=tol, l=l, epsilon=eps, max_iter=max_iter, plot=inv_dir / 'out' / f'{model}_{eps}_convergence.png')
    eps_similarities.append(similarity)
    eps_r2.append(r2)
    epsresults.append(test_mse)

rmses = np.sqrt(np.array(epsresults))
max_rmse = 1.1*np.max(np.append(rmses, np.array([trivial_mse**.5, baseline_mse**.5]))) # buffer by 10% since models having 0 utility cannot be judged by rmse. 
c = 5 # higher values punish rmse more
# 2 / (1 + exp(c * r)) equals 1 at r = 0 and decreases as the scaled RMSE grows.
utility = 2 / (1 + np.exp(c * rmses / max_rmse))
epsresults

In [None]:
# non-private results for FW
# Look up the epsilon=None run by position rather than reading the loop's
# leftover `test_mse, r2, similarity`, which only describe whichever epsilon
# happened to be processed last. Raises ValueError if no None run was made.
nonprivate_idx = epss.index(None)
epsresults[nonprivate_idx], eps_r2[nonprivate_idx], eps_similarities[nonprivate_idx]

In [None]:
# R^2 per epsilon, with the trivial (gray) and baseline (red) R^2 drawn as
# horizontal reference lines; x axis is log-scaled.
ax = plt.gca()
ax.plot(epss, eps_r2, 'darkred')
ax.set_xlabel('epsilon (log scaled)')
ax.set_ylabel('r2')
ax.axhline(trivial_r2, color='gray')
ax.axhline(baseline_r2, color='r')
ax.set_xscale('log')

In [None]:
# Test MSE per epsilon, with the trivial (gray) and baseline (red) MSE drawn
# as horizontal reference lines; x axis is log-scaled.
plt.clf()
ax = plt.gca()
ax.plot(epss, epsresults, 'g')
ax.set_xlabel('epsilon (log scaled)')
ax.set_ylabel('mse')
ax.axhline(trivial_mse, color='gray')
ax.axhline(baseline_mse, color='r')
ax.set_xscale('log')

In [None]:
# Similarity per epsilon on a fixed 0-1 y axis, with the trivial (gray) and
# baseline (red) similarity as horizontal reference lines; x axis is log-scaled.
plt.clf()
ax = plt.gca()
ax.plot(epss, eps_similarities, 'b')
ax.set_xlabel('epsilon (log scaled)')
ax.set_ylabel('similarity')
ax.set_ylim(0, 1)
ax.axhline(trivial_sim, color='gray')
ax.axhline(baseline_sim, color='r')
ax.set_xscale('log')

In [None]:
# Recompute the sigmoid utility for every run except the last one.
# NOTE(review): this assumes the last entry of `epss` is the epsilon=None
# reference run - confirm the order of `epss`.
c = 5 # higher values punish rmse more
rmses_without_last = rmses[:-1]
utility = 2 / (1 + np.exp(c * rmses_without_last / max_rmse))

In [None]:
# Sigmoid utility of all runs but the last, against epsilon (log-scaled x,
# y fixed to 0-1). The red line is the utility of the last run in `epsresults`.
# NOTE(review): the last run is assumed to be the epsilon=None reference - confirm.
ax = plt.gca()
ax.plot(epss[:-1], utility)
ax.set_xlabel('epsilon (log scaled)')
ax.set_ylabel('utility (sigmoid)')
last_run_utility = 2 / (1 + np.exp(c * epsresults[-1]**.5 / max_rmse))
ax.axhline(last_run_utility, color='red')
# plt.axhline(2 / (1 + np.exp(c * trivial_mse**.5 / max_rmse)), color='gray')
# plt.axhline(2 / (1 + np.exp(c * baseline_mse**.5 / max_rmse)), color="red")
ax.set_ylim(0, 1)
ax.set_xscale('log')

In [None]:
# Linear utility: 1 - rmse / (largest rmse), plotted for all runs but the last
# against epsilon (log-scaled x, y fixed to 0-1). The red line is the last run.
# NOTE(review): the last run is assumed to be the epsilon=None reference - confirm.
#
# `epsresults` holds MSEs, so the scale is taken from `rmses` to keep the
# numerator and denominator in the same units. It is stored under its own name
# so the 10%-buffered `max_rmse` used by the sigmoid utility is not overwritten.
linear_max_rmse = np.max(rmses)
plt.plot(epss[:-1], 1 - rmses[:-1] / linear_max_rmse)
plt.xlabel('epsilon (log scaled)')
plt.ylabel('utility (linear)')
plt.axhline(1 - rmses[-1] / linear_max_rmse, color='red')
# plt.axhline(1 - trivial_mse**.5 / linear_max_rmse, color='gray')
# plt.axhline(1 - baseline_mse**.5 / linear_max_rmse, color="red")
plt.ylim(0, 1)
plt.xscale('log')

In [None]:
# Export one row per epsilon except the last entry of `epss`, tagged with the
# task name, to results_draft.csv in the working directory.
n_exported = len(epss) - 1
results_draft = pd.DataFrame({
    'epsilon': epss[:-1],
    'task': ["LASSO"] * n_exported,
    'utility': utility.tolist(),
})
results_draft.to_csv("results_draft.csv", index_label="Index")