In [1]:
import json
import os

import numpy as np
import pandas as pd

from dswe import FunGP
from dswe import CovMatch
from dswe import generate_test_set, compute_weighted_diff, compute_weighted_stat_diff

In [2]:
## Provide dataset path, xcol, ycol, circ_pos, testcol, grid_size, limit_memory etc.
# Every value in this cell is a tunable input that later cells read by name.
df = pd.read_csv('../../Downloads/DS/Turbine_Upgrade_Dataset/Turbine Upgrade Dataset(VG Pair).csv')
# Column positions of df used as covariates (X) and as the response (y).
xcol = [3,4,5,6]
ycol = 10
# Passed to CovMatch as its third argument (presumably the positions of circular covariates - confirm in the dswe docs).
circ_pos = [1]
# Positions *within the xcol-selected columns* that are kept for FunGP and for test-set generation.
testcol = [0,1]
# Passed to generate_test_set together with testcol.
grid_size = [50,50]
# The next four values are forwarded unchanged to every FunGP call.
limit_memory = True
conf_level = 0.95
opt_method = 'L-BFGS-B'
sample_size = {'optim_size': 500, 'band_size': 5000}
rng_seed = [1, 10, 150, 343, 769, 1001, 1337, 2222, 3456, 5000] # one FunGP run per seed; results are later summarised by mean and std
# Forwarded to compute_weighted_diff and compute_weighted_stat_diff.
baseline = 1

In [3]:
# Placeholders for the optional R-generated reference outputs. Each stays None
# unless the loading cell assigns it, so later cells can test `is not None`
# instead of failing on an undefined name.
optim_idx_R = None
band_idx_R = None
estimated_params_R = None

testset_R = None
result_matching_R = None
weighted_diff_R = None
# Previously missing: the result-table cell reads this name even when the
# R-loading cell has been commented out.
weighted_stat_diff_R = None

In [4]:
## Provide the outputs that were generated by the R code. Comment out any you don't have.
# Each JSON file is opened in a `with` block so its handle is closed as soon as
# the content has been parsed (the bare `json.load(open(...))` form left it open).
with open('../../Downloads/DSWE-Package/testing/optimIdx.json') as f:
    optim_idx_R = json.load(f)
with open('../../Downloads/DSWE-Package/testing/bandIdx.json') as f:
    band_idx_R = json.load(f)
with open('../../Downloads/DSWE-Package/testing/estimatedParams.json') as f:
    estimated_params_R = json.load(f)
testset_R = pd.read_csv('../../Downloads/DSWE-Package/testing/testset.csv')
with open('../../Downloads/DSWE-Package/testing/matchedData.json') as f:
    result_matching_R = json.load(f)
# Reference values entered by hand; they are attached to the final result table.
weighted_diff_R = 5.3
weighted_stat_diff_R = 1.98

In [5]:
## Process everything generated from R.
# NOTE: this cell rewrites the *_R objects in place, so run it exactly once per
# load - a second run would subtract 1 from the indices again.
if optim_idx_R is not None:
    for i in range(len(optim_idx_R)):
        if optim_idx_R[i] is not None:
            # Shift every index down by one (presumably R's 1-based -> Python's 0-based indexing).
            optim_idx_R[i] = list(np.array(optim_idx_R[i]) - 1)

if band_idx_R is not None:
    for i in range(len(band_idx_R)):
        if band_idx_R[i] is not None:
            band_idx_R[i] = list(np.array(band_idx_R[i]) - 1)

if testset_R is not None:
    # Keep only the underlying array so it can be compared element-wise with the Python test set.
    testset_R =  testset_R.values

if result_matching_R is not None:
    result_matching_R[0] = pd.DataFrame(result_matching_R[0])
    result_matching_R[1] = pd.DataFrame(result_matching_R[1])
    matched_data_X_R = [result_matching_R[0].iloc[:,xcol].values, result_matching_R[1].iloc[:,xcol].values]
    # Use the configured response column (ycol) rather than a hard-coded 10,
    # so this stays in sync with the config cell just like xcol above.
    matched_data_y_R = [result_matching_R[0].iloc[:,ycol].values, result_matching_R[1].iloc[:,ycol].values]

In [6]:
# Split the rows by 'upgrade status' (0, then 1) and convert each subset to a
# NumPy array once; X and y are then sliced from the same array.
status_arrays = [df[df['upgrade status'] == status].to_numpy() for status in (0, 1)]
Xlist = [arr[:, xcol].astype(float) for arr in status_arrays]
ylist = [arr[:, ycol].astype(float) for arr in status_arrays]

# Match the two groups and keep the matched covariates / responses.
result_matching = CovMatch(Xlist, ylist, circ_pos)
matched_data_X, matched_data_y = result_matching.matched_data_X, result_matching.matched_data_y

# Test set built from the matched covariates, restricted to testcol.
testset = generate_test_set(matched_data_X, testcol, grid_size)

In [7]:
# Percentage of test-set entries that agree with the R test set after rounding
# both to 2 decimals. Skipped when no R test set was loaded (testset_R is None).
if testset_R is not None:
    print ("Accuracy of testset matching: {}%".format(np.mean(np.round(testset, 2) == np.round(testset_R, 2))*100))
else:
    print ("Skipping testset comparison: testset_R was not provided.")

Accuracy of testset matching: 100.0%


In [8]:
# Average, over the two groups, of the percentage of matched covariate entries
# that agree with the R result after rounding both to 2 decimals.
# matched_data_X_R only exists when result_matching_R was loaded, so guard on that.
if result_matching_R is not None:
    group_accuracy = [
        np.mean(np.round(matched_data_X[g], 2) == np.round(matched_data_X_R[g], 2))*100
        for g in (0, 1)
    ]
    print ("Accuracy of Covmatching: {}%".format((group_accuracy[0] + group_accuracy[1])/2.))
else:
    print ("Skipping Covmatching comparison: result_matching_R was not provided.")

Accuracy of Covmatching: 100.0%


In [9]:
# Per-experiment results, keyed by 'experiment N'; each value is a list with one entry per run.
weighted_diff, weighted_stat_diff = {}, {}
# One [optim_idx, estimated_params, band_idx] flag triple per experiment, in run order.
order = []

### Experiment 1: Base case (Everything generated from Python)

In [10]:
# Experiment 1 (base case): nothing from R is passed to FunGP
# (optim_idx, band_idx and params are all None).
mlist_X = [matched_data_X[0][:, testcol], matched_data_X[1][:, testcol]]
mlist_y = [matched_data_y[0], matched_data_y[1]]

# Create the output folder up front so the saves below cannot fail on a
# missing directory after the fits have already been computed.
os.makedirs('ablation', exist_ok=True)

wd = []
wsd = []
result_GP = None
for rsd in rng_seed:
    result_GP = FunGP(mlist_X, mlist_y, testset, conf_level=conf_level, limit_memory=limit_memory, opt_method=opt_method, 
                    sample_size=sample_size, rng_seed=rsd, optim_idx=None, band_idx=None, params=None)
    wd.append(compute_weighted_diff(Xlist, result_GP.mu1, result_GP.mu2, testset, testcol, baseline=baseline))
    wsd.append(compute_weighted_stat_diff(Xlist, result_GP.mu1, result_GP.mu2, result_GP.band, testset, testcol, baseline=baseline))

# Save the indices and parameters of the last seed's fit. The files used to be
# overwritten on every iteration, so writing once after the loop leaves the same
# final content. Set rng_seed to a single value to save one specific run.
if result_GP is not None:
    with open('ablation/optim_idx.json', 'w') as f:
        json.dump(result_GP.optim_idx, f)
    with open('ablation/band_idx.json', 'w') as f:
        json.dump(result_GP.band_idx, f)
    with open('ablation/estimated_params.json', 'w') as f:
        json.dump(result_GP.params, f)

# Flags are [optim_idx, estimated_params, band_idx].
order.append([False, False, False])
weighted_diff['experiment 1'] = wd
weighted_stat_diff['experiment 1'] = wsd

### Experiment 2: optim_idx=True, estimated_params=False, band_idx=False

In [11]:
# Experiment 2: only the R optimisation indices are supplied to FunGP.
mlist_X = [matched_data_X[k][:, testcol] for k in (0, 1)]
mlist_y = [matched_data_y[k] for k in (0, 1)]

# Arguments shared by every seed's fit.
gp_kwargs = dict(
    conf_level=conf_level,
    limit_memory=limit_memory,
    opt_method=opt_method,
    sample_size=sample_size,
    optim_idx=optim_idx_R,
    band_idx=None,
    params=None,
)

wd, wsd = [], []
for seed in rng_seed:
    result_GP = FunGP(mlist_X, mlist_y, testset, rng_seed=seed, **gp_kwargs)
    wd.append(
        compute_weighted_diff(Xlist, result_GP.mu1, result_GP.mu2, testset, testcol, baseline=baseline)
    )
    wsd.append(
        compute_weighted_stat_diff(Xlist, result_GP.mu1, result_GP.mu2, result_GP.band, testset, testcol, baseline=baseline)
    )

# Flags are [optim_idx, estimated_params, band_idx].
order.append([True, False, False])
weighted_diff['experiment 2'] = wd
weighted_stat_diff['experiment 2'] = wsd

### Experiment 3: optim_idx=True, estimated_params=False, band_idx=True

In [12]:
# Experiment 3: both R index sets (optim_idx and band_idx) are supplied; a single
# fit is run and no rng_seed is passed.
mlist_X = [matched_data_X[k][:, testcol] for k in (0, 1)]
mlist_y = [matched_data_y[k] for k in (0, 1)]
result_GP = FunGP(
    mlist_X,
    mlist_y,
    testset,
    conf_level=conf_level,
    limit_memory=limit_memory,
    opt_method=opt_method,
    sample_size=sample_size,
    optim_idx=optim_idx_R,
    band_idx=band_idx_R,
    params=None,
)

# Flags are [optim_idx, estimated_params, band_idx].
order.append([True, False, True])
exp3_diff = compute_weighted_diff(Xlist, result_GP.mu1, result_GP.mu2, testset, testcol, baseline=baseline)
weighted_diff['experiment 3'] = [exp3_diff]
exp3_stat_diff = compute_weighted_stat_diff(Xlist, result_GP.mu1, result_GP.mu2, result_GP.band, testset, testcol, baseline=baseline)
weighted_stat_diff['experiment 3'] = [exp3_stat_diff]

### Experiment 4: optim_idx=False, estimated_params=False, band_idx=True

In [13]:
# Experiment 4: only the R band indices are supplied to FunGP.
mlist_X = [matched_data_X[k][:, testcol] for k in (0, 1)]
mlist_y = [matched_data_y[k] for k in (0, 1)]

# Settings that do not change from seed to seed.
fixed_args = {
    'conf_level': conf_level,
    'limit_memory': limit_memory,
    'opt_method': opt_method,
    'sample_size': sample_size,
    'optim_idx': None,
    'band_idx': band_idx_R,
    'params': None,
}

wd = []
wsd = []
for seed in rng_seed:
    result_GP = FunGP(mlist_X, mlist_y, testset, rng_seed=seed, **fixed_args)
    diff_value = compute_weighted_diff(Xlist, result_GP.mu1, result_GP.mu2, testset, testcol, baseline=baseline)
    wd.append(diff_value)
    stat_diff_value = compute_weighted_stat_diff(Xlist, result_GP.mu1, result_GP.mu2, result_GP.band, testset, testcol, baseline=baseline)
    wsd.append(stat_diff_value)

# Flags are [optim_idx, estimated_params, band_idx].
order.append([False, False, True])
weighted_diff['experiment 4'] = wd
weighted_stat_diff['experiment 4'] = wsd

### Experiment 5: optim_idx=None, estimated_params=True, band_idx=False

In [14]:
# Experiment 5: the R-estimated parameters are supplied; no index sets are passed.
mlist_X = [matched_data_X[k][:, testcol] for k in (0, 1)]
mlist_y = [matched_data_y[k] for k in (0, 1)]

wd, wsd = [], []
for seed in rng_seed:
    result_GP = FunGP(
        mlist_X,
        mlist_y,
        testset,
        conf_level=conf_level,
        limit_memory=limit_memory,
        opt_method=opt_method,
        sample_size=sample_size,
        rng_seed=seed,
        optim_idx=None,
        band_idx=None,
        params=estimated_params_R,
    )
    wd.append(
        compute_weighted_diff(Xlist, result_GP.mu1, result_GP.mu2, testset, testcol, baseline=baseline)
    )
    wsd.append(
        compute_weighted_stat_diff(Xlist, result_GP.mu1, result_GP.mu2, result_GP.band, testset, testcol, baseline=baseline)
    )

# Flags are [optim_idx, estimated_params, band_idx]; optim_idx is recorded as None here.
order.append([None, True, False])
weighted_diff['experiment 5'] = wd
weighted_stat_diff['experiment 5'] = wsd

### Experiment 6: optim_idx=None, estimated_params=True, band_idx=True

In [15]:
# Experiment 6: the R-estimated parameters and the R band indices are supplied;
# a single fit is run and no rng_seed is passed.
mlist_X = [matched_data_X[k][:, testcol] for k in (0, 1)]
mlist_y = [matched_data_y[k] for k in (0, 1)]
result_GP = FunGP(
    mlist_X,
    mlist_y,
    testset,
    conf_level=conf_level,
    limit_memory=limit_memory,
    opt_method=opt_method,
    sample_size=sample_size,
    optim_idx=None,
    band_idx=band_idx_R,
    params=estimated_params_R,
)

# Flags are [optim_idx, estimated_params, band_idx]; optim_idx is recorded as None here.
order.append([None, True, True])
exp6_diff = compute_weighted_diff(Xlist, result_GP.mu1, result_GP.mu2, testset, testcol, baseline=baseline)
weighted_diff['experiment 6'] = [exp6_diff]
exp6_stat_diff = compute_weighted_stat_diff(Xlist, result_GP.mu1, result_GP.mu2, result_GP.band, testset, testcol, baseline=baseline)
weighted_stat_diff['experiment 6'] = [exp6_stat_diff]


## Final Outputs

In [16]:
def mean_std(dictlist):
    """Summarise each entry of a results dict as a mean and a standard deviation.

    Parameters
    ----------
    dictlist : dict
        Maps a label to a list of numeric results (one value per run).

    Returns
    -------
    tuple of (list, list)
        ``(means, stds)`` in the dict's iteration order, with exactly one entry
        per key so the lists line up with the keys:

        * more than one value: mean and (population) std, rounded to 2 decimals;
        * exactly one value: that value unchanged, with a std of 0;
        * no values: NaN for both, instead of silently dropping the entry
          (which used to shift every later experiment up by one position).
    """
    _mean = []
    _std = []

    for value in dictlist.values():
        if len(value) > 1:
            samples = np.array(value)
            _mean.append(round(samples.mean(axis=0), 2))
            _std.append(round(samples.std(axis=0), 2))
        elif len(value) == 1:
            _mean.append(value[0])
            _std.append(0)
        else:
            # Placeholder keeps the output aligned with the dict keys.
            _mean.append(np.nan)
            _std.append(np.nan)
    return _mean, _std

In [17]:
# Assemble the summary table: one row per experiment.
order = np.array(order)
result = pd.DataFrame()
result['Experiment'] = [1,2,3,4,5,6]
# Columns of `order` are the [optim_idx, estimated_params, band_idx] flags recorded by each experiment.
result['optim_idx'] = order[:,0]
result['estimated_params'] = order[:,1]
result['band_idx'] = order[:,2]
result['weighted_diff_mean'], result['weighted_diff_std']= mean_std(weighted_diff)
result['weighted_stat_diff_mean'], result['weighted_stat_diff_std']= mean_std(weighted_stat_diff)
# Raw per-run values are kept next to their summaries.
result['weighted_diff'] = list(weighted_diff.values())
result['weighted_stat_diff'] = list(weighted_stat_diff.values())
# Compare against None (not truthiness) so a legitimate reference value of 0
# is still written to the table.
if weighted_diff_R is not None:
    result['weighted_diff_R'] = weighted_diff_R
if weighted_stat_diff_R is not None:
    result['weighted_stat_diff_R'] = weighted_stat_diff_R

In [18]:
# Per-run weighted differences, one inner list per experiment.
[*weighted_diff.values()]

[[5.56, 5.82, 5.74, 5.07, 5.92, 5.73, 5.43, 4.92, 4.74, 6.11],
 [5.49, 5.36, 5.56, 5.05, 5.48, 5.23, 5.26, 4.77, 4.79, 5.44],
 [5.3],
 [5.38, 5.8, 5.37, 5.33, 5.56, 5.62, 5.37, 5.43, 5.22, 5.7],
 [5.49, 5.36, 5.56, 5.05, 5.48, 5.23, 5.26, 4.77, 4.79, 5.44],
 [5.3]]

In [19]:
# Per-run weighted statistical differences, one inner list per experiment.
[*weighted_stat_diff.values()]

[[2.28, 2.5, 2.4, 1.83, 2.36, 2.18, 1.98, 1.95, 1.45, 2.41],
 [2.17, 1.92, 2.18, 1.72, 2.17, 1.69, 1.89, 1.61, 1.8, 1.98],
 [1.69],
 [1.89, 2.6, 2.1, 2.06, 2.22, 2.12, 1.83, 2.02, 1.65, 2.25],
 [2.14, 1.88, 2.13, 1.65, 2.15, 1.7, 1.87, 1.62, 1.76, 2.16],
 [1.61]]

In [20]:
# Make sure the output folder exists so the export cannot fail on a missing directory.
os.makedirs('ablation', exist_ok=True)
result.to_csv('ablation/output.csv', index=False)

In [21]:
# Display the assembled summary table as this cell's output.
result

Unnamed: 0,Experiment,optim_idx,estimated_params,band_idx,weighted_diff_mean,weighted_diff_std,weighted_stat_diff_mean,weighted_stat_diff_std,weighted_diff,weighted_stat_diff,R
0,1,False,False,False,5.5,0.43,2.13,0.31,"[5.56, 5.82, 5.74, 5.07, 5.92, 5.73, 5.43, 4.9...","[2.28, 2.5, 2.4, 1.83, 2.36, 2.18, 1.98, 1.95,...",5.3
1,2,True,False,False,5.24,0.27,1.91,0.2,"[5.49, 5.36, 5.56, 5.05, 5.48, 5.23, 5.26, 4.7...","[2.17, 1.92, 2.18, 1.72, 2.17, 1.69, 1.89, 1.6...",5.3
2,3,True,False,True,5.3,0.0,1.69,0.0,[5.3],[1.69],5.3
3,4,False,False,True,5.48,0.17,2.07,0.25,"[5.38, 5.8, 5.37, 5.33, 5.56, 5.62, 5.37, 5.43...","[1.89, 2.6, 2.1, 2.06, 2.22, 2.12, 1.83, 2.02,...",5.3
4,5,,True,False,5.24,0.27,1.91,0.21,"[5.49, 5.36, 5.56, 5.05, 5.48, 5.23, 5.26, 4.7...","[2.14, 1.88, 2.13, 1.65, 2.15, 1.7, 1.87, 1.62...",5.3
5,6,,True,True,5.3,0.0,1.61,0.0,[5.3],[1.61],5.3
