In [2]:
import numpy as np
import scipy as sp
import scipy.linalg
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import r2_score
import pickle
import time
from scipy.optimize import fsolve
from scipy.optimize import least_squares
from scipy.optimize import minimize
from scipy import sparse
import os.path
from scipy.interpolate import splrep, splev
from importlib import reload
import pandas as pd

In [3]:
import sys
sys.path.append('../.')
from data import input as input
from algorithms import algo as algo
from algorithms import utility as utility 
from model import anomaly as anomaly
from algorithms import robust_pca as robust_pca
from tqdm import tqdm

In [4]:
#%pdb
# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel.
for _module in (input, algo, utility, anomaly, robust_pca):
    reload(_module)

def generate_synthetic_experiment_data(n1=100, n2=100, num_exper=200,
                                 r_range = (1, 10), mean_value_range = (1, 10),
                                 p_o_range = (0.5, 1), p_a_range = (0, 0.3), alpha_range = (0, 1),
                                 algorithms_collect = (), metrics = None,
                                 row_specific_anomalies=False,
                                 verbose = 2, seed = None):
    """Run ``num_exper`` randomized synthetic experiments and tabulate the results.

    Every experiment draws its parameters uniformly from the given ranges,
    builds a synthetic ``input.DataInput`` (data, anomalies, costs, posterior),
    records the ideal cost under the key ``'c_opt'`` and then evaluates each
    requested algorithm through ``metrics``.

    Args:
        n1, n2: Size arguments forwarded to ``data.synthetic_data``; ``n1`` is
            also the number of rows of the per-row anomaly-probability draw.
        num_exper: Number of experiments (loop iterations).
        r_range: Inclusive integer bounds ``(low, high)`` for the rank ``r``.
        mean_value_range, p_o_range, p_a_range, alpha_range: ``(low, high)``
            bounds passed to ``np.random.uniform`` for the mean value, the
            observation probability, the anomaly probabilities and ``alpha``.
        algorithms_collect: Container of algorithm names to evaluate. The
            recognized names are ``'EW'``, ``'DRMF'``, ``'stable_PCP'`` and
            ``'RMC'``; any other entry is ignored.
        metrics: Callable invoked as
            ``metrics(*algorithm_output, name, data, ideal_cost)``; its return
            value is merged into the per-run result dict. Required whenever a
            recognized algorithm is requested.
        row_specific_anomalies: If true, ``alpha`` is drawn per row with shape
            ``(n1, 1)`` and ``anomaly.ExponentialAnomalyRow`` is used;
            otherwise a scalar ``alpha`` and ``anomaly.ExponentialAnomaly``.
        verbose: When equal to ``2``, each run's result dict is printed.
        seed: Optional seed. When given, ``np.random.seed(seed)`` is called
            once before the first experiment so the draws made in this
            function repeat across calls. NOTE(review): whether the project
            modules draw from the same global NumPy generator is not visible
            here -- confirm before relying on full reproducibility.

    Returns:
        ``(result_df, parameters)``: a ``pandas.DataFrame`` with one row per
        experiment, and a list of ``(r, mean_value, p_o, p_a, alpha)`` tuples
        in the same order.

    Raises:
        TypeError: If a recognized algorithm is requested but ``metrics`` is
            not callable. Raised before any data is generated.
    """
    # Evaluation order is fixed here so the result columns keep a stable order
    # regardless of the order of `algorithms_collect`.
    known_algorithms = ('EW', 'DRMF', 'stable_PCP', 'RMC')
    requested = [name for name in known_algorithms if name in algorithms_collect]
    if requested and not callable(metrics):
        raise TypeError(
            f"metrics must be callable to evaluate {requested}, got {metrics!r}")

    if seed is not None:
        np.random.seed(seed)

    parameters = []
    results = []
    for _ in tqdm(range(num_exper)):
        # The order of these draws is kept fixed so seeded runs are repeatable.
        r = np.random.randint(r_range[0], high=r_range[1]+1)
        mean_value = np.random.uniform(mean_value_range[0], mean_value_range[1])
        p_o = np.random.uniform(p_o_range[0], p_o_range[1])
        p_a = np.random.uniform(p_a_range[0], p_a_range[1], size=(n1, 1))
        if row_specific_anomalies:
            alpha = np.random.uniform(alpha_range[0], alpha_range[1], size = (n1,1))
            anomaly_class = anomaly.ExponentialAnomalyRow
        else:
            alpha = np.random.uniform(alpha_range[0], alpha_range[1])
            anomaly_class = anomaly.ExponentialAnomaly
        # NOTE: a draw of alpha == 0 (possible when alpha_range starts at 0)
        # makes 1/alpha divide by zero.
        anomaly_model = anomaly_class(p = p_a, exp_rate = 1/alpha, p_range = (0.01, 0.5), one_over_exp_range=(0.01, 0.9))

        data = input.DataInput()
        data.synthetic_data(n1=n1, n2=n2, r=r, mean_value = mean_value, prob_observing = p_o)
        data.add_anomaly(anomaly_model)
        data.cost_generation(c01 = [0, 10], c10 = [0, 10])
        data.compute_posterior()

        ideal_cost = utility.ideal_cost(data)
        result_dict = {'c_opt': ideal_cost}

        # Lazily evaluated so that only the requested algorithms are run.
        runners = {
            'EW': lambda: algo.EW_algorithm(data, r_constraint = r, hard_impute_yes=False, gamma = data.n1/r*0.5),
            'DRMF': lambda: algo.DRMF(data, r, rate_anomalies=np.max(p_a)),
            'stable_PCP': lambda: robust_pca.Robust_PCA(data, r_constraint=r),
            'RMC': lambda: robust_pca.Robust_PCA(data, r_constraint=r, up_M = np.max(data.M0)*10, up_A = np.max(data.M0)*10),
        }
        for name in requested:
            result_dict.update(metrics(*runners[name](), name, data, ideal_cost))

        results.append(result_dict)
        parameters.append((r, mean_value, p_o, p_a, alpha))

        if (verbose == 2):
            print(result_dict)

    result_df = pd.DataFrame.from_records(results)
    return result_df, parameters

def metrics(M_est, anomaly_decision, algo_name, data, ideal_cost):
    """Collect the evaluation numbers of one algorithm on one dataset.

    Args:
        M_est: Estimate compared against ``data.M0``.
        anomaly_decision: Decision passed to ``utility.cost_compute``.
        algo_name: Suffix appended to every key of the returned dict.
        data: Dataset object; ``data.M0`` is used as the reference.
        ideal_cost: Cost subtracted from the algorithm's cost to get regret.

    Returns:
        Dict with the keys ``c_<name>``, ``frobenious_norm_<name>``,
        ``max_norm_<name>`` and ``regret_<name>``, in that order.
    """
    cost = utility.cost_compute(data, anomaly_decision)
    frobenius_error = utility.error_Frobenious(M_est, data.M0)
    max_error = utility.error_max(M_est, data.M0)
    return {
        f'c_{algo_name}': cost,
        f'frobenious_norm_{algo_name}': frobenius_error,
        f'max_norm_{algo_name}': max_error,
        f'regret_{algo_name}': cost - ideal_cost,
    }

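# Stale call kept for reference: `generate_data_for_table_cost` is not defined in the
# visible part of this notebook (presumably the former name of
# `generate_synthetic_experiment_data` -- confirm before re-enabling).
#results_df, parameters = generate_data_for_table_cost(num_exper=100, metrics = metrics)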

In [5]:
# Display the results table as the cell's output.
# NOTE(review): no live cell visible above assigns `results_df` (the only
# assignment is commented out), so this relies on leftover kernel state.
results_df

Unnamed: 0,c_opt,c_EW,frobenious_norm_EW,max_norm_EW,c_DRMF,frobenious_norm_DRMF,max_norm_DRMF,c_stable_PCP,frobenious_norm_stable_PCP,max_norm_stable_PCP,c_RMC,frobenious_norm_RMC,max_norm_RMC
0,0.562491,0.574355,71.492424,17.941794,1.875806,134.529391,25.420813,1.755503,253.086899,44.123696,1.755503,256.200061,44.230186
1,0.440449,0.497262,204.961906,36.261300,1.459573,197.964279,31.894118,1.012074,384.237344,56.642336,1.012074,384.237344,56.642336
2,0.598540,0.609959,47.945193,5.064728,1.808508,101.053949,15.084983,1.443537,62.377274,4.087610,2.205648,50.140404,3.612387
3,0.403426,0.436367,182.750721,12.537658,1.466145,193.931428,15.117837,0.927048,345.500101,30.140850,0.927048,345.500101,30.140850
4,0.534849,0.543098,49.285608,5.611037,1.856119,74.158382,6.358079,1.803965,163.525993,18.019304,1.803965,163.525993,18.019304
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.429696,0.472633,174.896418,59.691015,1.651815,336.301848,100.072843,0.926895,199.754805,16.123454,0.926895,199.754805,16.123454
96,0.443194,0.483618,202.365811,33.391575,1.746964,379.083414,51.752652,1.403931,554.249261,79.009910,1.403931,554.249261,79.009910
97,0.354824,0.411293,228.472704,17.187239,1.402755,259.607154,45.178274,0.813660,355.063508,27.171610,0.813660,355.063508,27.171610
98,0.483001,0.501861,81.341618,5.422731,1.741464,151.473143,12.980513,1.112891,105.513591,8.753834,1.265428,1311.309917,139.841051


In [7]:
import pickle

# Persist the results table. The context manager closes the file handle even
# if pickling fails, and the output directory is created when missing so the
# save cannot fail on a fresh checkout.
os.makedirs('results', exist_ok=True)
with open('results/results_single_p_alpha_100.pt', 'wb') as results_file:
    pickle.dump(results_df, results_file)

In [8]:
# Regret of each algorithm: its cost minus the ideal cost (`c_opt`) of the same run.
for algo_name in ('EW', 'DRMF', 'stable_PCP', 'RMC'):
    results_df[f'regret_{algo_name}'] = results_df[f'c_{algo_name}'] - results_df['c_opt']
results_df.describe()

Unnamed: 0,c_opt,c_EW,frobenious_norm_EW,max_norm_EW,c_DRMF,frobenious_norm_DRMF,max_norm_DRMF,c_stable_PCP,frobenious_norm_stable_PCP,max_norm_stable_PCP,c_RMC,frobenious_norm_RMC,max_norm_RMC,regret_EW,regret_DRMF,regret_stable_PCP,regret_RMC
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.4633,0.505821,166.938347,19.932867,1.704858,289.456091,40.232789,1.150014,309.189468,42.125092,1.241661,465.727484,59.287227,0.042521,1.241558,0.686715,0.778361
std,0.087115,0.072016,75.897388,13.269703,0.146279,157.32731,27.976578,0.245607,168.379067,39.206415,0.374609,488.969871,59.173166,0.023844,0.103108,0.180833,0.305614
min,0.319659,0.361297,41.472351,4.208095,1.378908,60.401028,5.405302,0.793386,62.377274,4.08761,0.793386,50.140404,3.612387,0.005055,1.005795,0.327737,0.327737
25%,0.402707,0.461118,100.358469,10.725765,1.569617,176.641966,20.362213,0.950187,185.264308,17.141623,0.950187,230.020337,20.378766,0.025711,1.177245,0.561484,0.561484
50%,0.452981,0.497418,160.682585,17.394671,1.73444,262.499692,34.389469,1.12621,298.280717,31.989184,1.133574,346.959878,39.175482,0.038656,1.257362,0.666122,0.697388
75%,0.52492,0.553725,227.781177,25.049459,1.820506,368.765851,51.954947,1.247785,392.332534,56.172271,1.427452,450.290764,67.329317,0.05682,1.3119,0.752241,0.93829
max,0.681275,0.686488,424.65875,68.561557,1.943846,903.355351,142.546745,1.86294,895.1156,223.745758,2.306274,3208.954958,277.685809,0.122969,1.468946,1.302666,1.737634


## Fix m (`n1`), increase n (`n2`)

In [5]:
#%pdb
# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel.
for _module in (input, algo, utility, anomaly, robust_pca):
    reload(_module)

def fix_m_increase_n(test_pair=((50, 50), (50, 100), (50, 500), (50, 1000)), num_exper=100):
    """Run the 'EW' experiment for several ``(n1, n2)`` sizes with all other settings fixed.

    Every size is evaluated with ``generate_synthetic_experiment_data`` using
    degenerate ranges, i.e. rank 3, mean value 5, observation probability 0.8,
    anomaly probability 0.04 and alpha 0.2, with ``verbose=0``.

    Args:
        test_pair: Iterable of ``(n1, n2)`` pairs to evaluate, in order.
        num_exper: Number of experiments run for each pair.

    Returns:
        Dict mapping each ``(n1, n2)`` tuple to the result table returned for
        that size.
    """
    results_by_shape = {}
    for pair in test_pair:
        n1, n2 = pair[0], pair[1]
        # tuple(...) keeps the key hashable when a pair is given as a list.
        results_by_shape[tuple(pair)], _ = generate_synthetic_experiment_data(
            n1=n1, n2=n2, num_exper=num_exper,
            r_range = [3, 3], mean_value_range = [5, 5],
            p_o_range = [0.8, 0.8], p_a_range = [0.04, 0.04], alpha_range = [0.2, 0.2],
            metrics = metrics, algorithms_collect=['EW'], verbose=0)
    return results_by_shape
# Run the size sweep. `results_df` is now a dict keyed by (n1, n2), not a single
# DataFrame. The recorded run below took roughly 10 s, 18 s, 1.5 min and 3.5 min
# for the four sizes.
results_df = fix_m_increase_n()

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:10<00:00,  9.50it/s]
100%|██████████| 100/100 [00:18<00:00,  5.45it/s]
100%|██████████| 100/100 [01:37<00:00,  1.02it/s]
100%|██████████| 100/100 [03:23<00:00,  2.04s/it]


In [6]:
# Print the summary statistics of every (n1, n2) configuration in the sweep.
for shape, frame in results_df.items():
    print(shape)
    print(frame.describe())

(50, 50)
            c_opt        c_EW  frobenious_norm_EW  max_norm_EW   regret_EW
count  100.000000  100.000000          100.000000   100.000000  100.000000
mean     0.143215    0.159304           81.943612    16.386820    0.016089
std      0.008003    0.011034            9.066445     7.325082    0.006445
min      0.127358    0.134727           68.402689     7.756895    0.002908
25%      0.136476    0.152563           75.439944    11.509536    0.011875
50%      0.143259    0.158792           79.357574    14.198454    0.015794
75%      0.148572    0.165381           86.450125    18.708612    0.019627
max      0.165876    0.189915          109.142140    51.803946    0.041839
(50, 100)
            c_opt        c_EW  frobenious_norm_EW  max_norm_EW   regret_EW
count  100.000000  100.000000          100.000000   100.000000  100.000000
mean     0.144178    0.156604          104.509969    16.195943    0.012427
std      0.005496    0.007139            9.355545     4.951849    0.003501
min   

In [7]:
def fix_m_increase_n(test_pair=((100, 100), (100, 500), (100, 1000)), num_exper=100):
    """Run the 'EW' experiment for several ``(n1, n2)`` sizes with all other settings fixed.

    Every size is evaluated with ``generate_synthetic_experiment_data`` using
    degenerate ranges, i.e. rank 3, mean value 5, observation probability 0.8,
    anomaly probability 0.04 and alpha 0.2, with ``verbose=0``.

    Args:
        test_pair: Iterable of ``(n1, n2)`` pairs to evaluate, in order.
        num_exper: Number of experiments run for each pair.

    Returns:
        Dict mapping each ``(n1, n2)`` tuple to the result table returned for
        that size.
    """
    results_by_shape = {}
    for pair in test_pair:
        n1, n2 = pair[0], pair[1]
        # tuple(...) keeps the key hashable when a pair is given as a list.
        results_by_shape[tuple(pair)], _ = generate_synthetic_experiment_data(
            n1=n1, n2=n2, num_exper=num_exper,
            r_range = [3, 3], mean_value_range = [5, 5],
            p_o_range = [0.8, 0.8], p_a_range = [0.04, 0.04], alpha_range = [0.2, 0.2],
            metrics = metrics, algorithms_collect=['EW'], verbose=0)
    return results_by_shape
# Run the size sweep; `results_df` is replaced by a dict keyed by (n1, n2).
# The recorded run below took roughly 4, 6 and 17 minutes for the three sizes.
results_df = fix_m_increase_n()

100%|██████████| 100/100 [03:54<00:00,  2.34s/it]
100%|██████████| 100/100 [05:57<00:00,  3.57s/it] 
100%|██████████| 100/100 [16:47<00:00, 10.08s/it] 


In [9]:
# Print the summary statistics of every (n1, n2) configuration in the sweep.
for shape, frame in results_df.items():
    print(shape)
    print(frame.describe())

(100, 100)
            c_opt        c_EW  frobenious_norm_EW  max_norm_EW   regret_EW
count  100.000000  100.000000          100.000000   100.000000  100.000000
mean     0.143633    0.152877          131.350072    17.734776    0.009244
std      0.003260    0.004110           14.397981     7.444000    0.002288
min      0.133050    0.139765          109.555341     7.799142    0.004008
25%      0.141494    0.150900          121.504591    12.883980    0.007742
50%      0.143984    0.153091          128.175703    15.559798    0.009019
75%      0.145804    0.155773          138.113654    20.766264    0.010830
max      0.150293    0.161309          181.714661    52.873729    0.015198
(100, 500)
            c_opt        c_EW  frobenious_norm_EW  max_norm_EW   regret_EW
count  100.000000  100.000000          100.000000   100.000000  100.000000
mean     0.143601    0.149419          237.364699    19.145663    0.005818
std      0.001813    0.002204           16.092126     6.291650    0.000928
min