In [1]:
# Only needed when you use a custom virtual environment: adds its site-packages directory to the module search path.
import sys
import os
## Go back to the project directory (the one that contains the virtual environment).
# switch to the name of your virtual environment
kernel_name = ".venv_mp"
# Only move up when the virtual environment folder is not visible from the
# current directory; this keeps the cell idempotent, so re-running it does not
# climb one directory higher each time.
if not os.path.isdir(kernel_name):
    os.chdir("..")
# Build the path with os.path.join rather than hard-coded "\\" separators.
# NOTE: "Lib/site-packages" is the Windows virtual environment layout.
site_packages = os.path.join(os.getcwd(), kernel_name, "Lib", "site-packages")
# Do not stack duplicate entries on sys.path when the cell is executed again.
if site_packages not in sys.path:
    sys.path.append(site_packages)

In [2]:
from data_reader import read_data
from node import get_node_data
import pandas as pd
import numpy as np

In [3]:
# Load the raw data through the project's reader.
raw_data = read_data()
# Node identifiers "pi2" through "pi5".
nodes = [f"pi{number}" for number in range(2, 6)]

In [4]:
from modelling import grid_search_models, fit_clf, score_clf, select_model_data  
from similar import get_similar_pairs_nodes

def test_in_pairs(similar_pairs, model_data, models, mmd_scores, ocsvm_scores):
    l = []
    for i in range(len(similar_pairs)):
        node_x, node_y = similar_pairs[i]
        x = model_data[node_x]
        y = model_data[node_y]
        
        model_x = models[node_x]
        ex = fit_clf(model_x, x)
        exy = score_clf(model_x, y)

        model_y = models[node_y]
        ey = fit_clf(model_y, y)
        eyx = score_clf(model_y, x)       
        
        l.append(pd.DataFrame([{"model_node" : node_x, "test_node" : node_y, "discrepancy" : round(abs(ex-exy),2), 
                               "mmd_score" : mmd_scores[i], "ocsvm_score" : ocsvm_scores[i][0]}]))
        l.append(pd.DataFrame([{"model_node" : node_y, "test_node" : node_x, "discrepancy" : round(abs(ey-eyx),2),
                                "mmd_score" : mmd_scores[i], "ocsvm_score" : ocsvm_scores[i][1]}]))
    return pd.concat(l, ignore_index = True)

def test_hypothesis(clf_name, model_data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores): 
    models, models_df = grid_search_models(clf_name, model_data, similar_nodes)
    test_df = test_in_pairs(similar_pairs, model_data, models, mmd_scores, ocsvm_scores)
    return models_df.merge(test_df, how='outer', on='model_node')
    
def test_hypothesis_all_clfs(data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores):
    model_data = select_model_data(data, similar_nodes)
    svr_df = test_hypothesis("svr", model_data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores)
    lsvr_df = test_hypothesis("lsvr", model_data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores)
    df = pd.concat([svr_df, lsvr_df], ignore_index = True)
    
    return df

In [5]:
def get_results(data, standardised):
    """Collect the hypothesis-test results of experiments 1, 2 and 3 in one frame.

    For every experiment the similar pairs are obtained from
    ``get_similar_pairs_nodes`` and printed, the classifiers are tested on
    ``data[experiment]["sampled_data"]``, and the resulting rows are tagged
    with the ``experiment`` number, the ``std`` flag and the ``asmmd`` value.
    """
    frames = []
    for experiment in (1, 2, 3):
        print(f"Experiment {experiment}: ", end="")
        pairs_info = get_similar_pairs_nodes(experiment, data, standardised)
        similar_pairs, similar_nodes, asmmd, mmd_scores, ocsvm_scores = pairs_info
        print(similar_pairs)

        frame = test_hypothesis_all_clfs(
            data[experiment]["sampled_data"], similar_pairs, similar_nodes, mmd_scores, ocsvm_scores
        )

        # Repeat each tag once per row; a list is used (not a bare value) so a
        # non-scalar asmmd is stored as-is in every row.
        n_rows = frame.shape[0]
        tags = (("experiment", experiment), ("std", standardised), ("asmmd", asmmd))
        for column, value in tags:
            frame[column] = [value] * n_rows
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

In [6]:
from node import create_samples

def save_samples(samples, ocsvm_data, standardised):
    """Write the raw node data and the four samples to CSV files.

    Files go to ``data/samples/standardised`` when ``standardised`` is truthy
    and to ``data/samples/original`` otherwise. The directory is created when
    it does not exist yet.

    Parameters
    ----------
    samples : mapping; ``samples[experiment][sample_id]`` must be something
        ``pd.concat`` accepts, for experiments 1-3 and sample ids 0-3
    ocsvm_data : mapping; ``ocsvm_data[experiment]["raw_node_data"]`` must be
        something ``pd.concat`` accepts, for experiments 1-3
    standardised : bool
        Selects the output sub-directory.
    """
    import os  # local import: keeps the function independent of earlier cells

    f = "standardised" if standardised else "original"
    out_dir = f"data/samples/{f}"
    # to_csv does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    experiments = range(1, 4)

    # Raw node data of all experiments, stacked with the original index kept
    # (the index is written to the CSV as well).
    raw_node_data = pd.concat(
        [pd.concat(ocsvm_data[experiment]["raw_node_data"]) for experiment in experiments]
    )
    raw_node_data.to_csv(f"{out_dir}/raw_node_data.csv")

    # One file per sample, each combining the three experiments.
    for sample_id in range(4):
        sample = [pd.concat(samples[experiment][sample_id]) for experiment in experiments]
        results = pd.concat(sample, ignore_index=True)
        results.to_csv(f"{out_dir}/sample_{sample_id+1}.csv", index=False)

In [7]:
# Create the samples with standardisation switched on and save them as CSV.
standardised = True
std_ocsvm_data, std_samples = create_samples(raw_data, standardised)
save_samples(samples=std_samples, ocsvm_data=std_ocsvm_data, standardised=standardised)

In [8]:
# Create the samples with standardisation switched off and save them as CSV.
standardised = False
ocsvm_data, samples = create_samples(raw_data, standardised)
save_samples(samples=samples, ocsvm_data=ocsvm_data, standardised=standardised)

In [9]:
def run(ocsvm_data, samples, standardised):
    """Evaluate each of the four samples and write one results CSV per sample.

    For every sample id the per-experiment data of ``ocsvm_data`` is combined
    with that sample (stored under the ``"sampled_data"`` key), passed to
    ``get_results`` and the returned frame is written to
    ``results/<standardised|original>/sample_<n>.csv``. The output directory
    is created when missing.

    Parameters
    ----------
    ocsvm_data : mapping from experiment number (1-3) to a mapping of
        per-experiment data; it is not modified by this function
    samples : mapping; ``samples[experiment][sample_id]`` is the sample used
        for that experiment, for sample ids 0-3
    standardised : bool
        Forwarded to ``get_results`` and used to pick the output directory.
    """
    import os  # local import: keeps the function independent of earlier cells

    data_type = "standardised" if standardised else "original"
    out_dir = f"results/{data_type}"
    # to_csv does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    for sample_id in range(4):
        print(f"Sample {sample_id+1} \n")
        data = ocsvm_data.copy()
        for experiment in range(1, 4):
            # The copy above is shallow, so each inner mapping is copied too
            # before "sampled_data" is set; otherwise the caller's ocsvm_data
            # would be modified in place.
            experiment_data = dict(ocsvm_data[experiment])
            experiment_data["sampled_data"] = samples[experiment][sample_id]
            data[experiment] = experiment_data

        results = get_results(data, standardised)
        results.to_csv(f"{out_dir}/sample_{sample_id+1}.csv", index=False)
        print()

In [10]:
# Evaluate all samples that were created with standardised = True.
run(ocsvm_data=std_ocsvm_data, samples=std_samples, standardised=True)

Sample 1 

Experiment 1: [('pi2', 'pi3'), ('pi3', 'pi4')]
Experiment 2: [('pi2', 'pi3'), ('pi3', 'pi5')]
Experiment 3: [('pi2', 'pi4'), ('pi4', 'pi5')]

Sample 2 

Experiment 1: [('pi2', 'pi3'), ('pi3', 'pi4')]
Experiment 2: [('pi2', 'pi3'), ('pi3', 'pi5')]
Experiment 3: [('pi2', 'pi4')]

Sample 3 

Experiment 1: [('pi2', 'pi3'), ('pi3', 'pi4')]
Experiment 2: [('pi2', 'pi3'), ('pi3', 'pi5')]
Experiment 3: [('pi2', 'pi4'), ('pi4', 'pi5')]

Sample 4 

Experiment 1: [('pi2', 'pi3'), ('pi3', 'pi4')]
Experiment 2: [('pi2', 'pi3'), ('pi3', 'pi5')]
Experiment 3: [('pi2', 'pi4')]



In [11]:
# Evaluate all samples that were created with standardised = False.
run(ocsvm_data=ocsvm_data, samples=samples, standardised=False)

Sample 1 

Experiment 1: [('pi2', 'pi4')]
Experiment 2: [('pi2', 'pi4')]
Experiment 3: [('pi2', 'pi4'), ('pi2', 'pi5'), ('pi4', 'pi5')]

Sample 2 

Experiment 1: [('pi2', 'pi4')]
Experiment 2: [('pi2', 'pi4'), ('pi3', 'pi5')]
Experiment 3: [('pi2', 'pi4'), ('pi2', 'pi5'), ('pi4', 'pi5')]

Sample 3 

Experiment 1: [('pi2', 'pi4')]
Experiment 2: [('pi2', 'pi4')]
Experiment 3: [('pi2', 'pi4'), ('pi2', 'pi5'), ('pi4', 'pi5')]

Sample 4 

Experiment 1: [('pi2', 'pi4')]
Experiment 2: [('pi2', 'pi4'), ('pi3', 'pi5')]
Experiment 3: [('pi2', 'pi4'), ('pi2', 'pi5'), ('pi4', 'pi5')]

