In [1]:
# Only needed when the notebook uses a custom virtual environment: add that environment's installed-packages directory to the import search path.
import sys
import os
import pandas as pd
## Go back to the project directory (the parent of the current working directory).
# Guarded so that re-running this cell does not climb one directory higher each time.
if "_PROJECT_ROOT" not in globals():
    os.chdir("..")
    _PROJECT_ROOT = os.getcwd()
# switch to the name of your virtual environment
kernel_name = ".venv_mp"
# Build the path with os.path.join instead of a hard-coded "\\" separator.
# NOTE(review): "Lib/site-packages" looks like the Windows virtual-environment layout;
# other platforms may need a different sub-path - confirm before relying on it there.
site_packages = os.path.join(_PROJECT_ROOT, kernel_name, "Lib", "site-packages")
# Avoid piling up duplicate entries when the cell is executed more than once.
if site_packages not in sys.path:
    sys.path.append(site_packages)

In [None]:
from node import create_samples, save_samples
# Location of the raw node data CSV, relative to the current working directory.
# os.path.join is used instead of joining with "//" so the path has single,
# platform-appropriate separators.
directory = os.path.join(os.getcwd(), "data", "bank-marketing", "reduced", "raw_node_data.csv")
# First row is the header and the first column is used as the index.
raw_data = pd.read_csv(directory, header=0, index_col=0)
n_samples = 100
# NOTE(review): create_samples/save_samples come from the project's `node` module;
# presumably this draws n_samples samples from raw_data and persists them under the
# "bank" name - confirm against that module.
samples = create_samples(n_samples, raw_data)
save_samples(samples, "bank")

In [2]:
from node import get_node_data
# Folder holding the per-sample CSV files, relative to the current working directory.
directory = os.path.join(os.getcwd(), "data", "bank-marketing", "samples")
# Number of files (sample_1.csv ... sample_<n>.csv) expected in that folder.
n_sample_files = 100
samples = [
    get_node_data(pd.read_csv(os.path.join(directory, f"sample_{i + 1}.csv")))
    for i in range(n_sample_files)
]

In [14]:
from modelling import select_model_data  
from similar import get_similar_pairs_nodes
from hypothesis_testing import test_hypothesis

def _undersample_to_balance(df, random_state=None):
    """Return a shuffled copy of ``df`` holding equally many "yes" and "no" rows.

    The smaller label group is kept whole; the other group is randomly
    down-sampled to the same size. Rows with any other label are dropped.
    """
    yes = df.loc[df.label == "yes"]
    no = df.loc[df.label == "no"]
    if yes.shape[0] < no.shape[0]:
        parts = [yes, no.sample(yes.shape[0], random_state=random_state)]
    else:
        parts = [no, yes.sample(no.shape[0], random_state=random_state)]
    # Shuffle so the two label groups are interleaved, then renumber the rows.
    return (
        pd.concat(parts)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


def get_results(raw_node_data, random_state=None):
    """Run the hypothesis test on one sample, on the raw and the label-balanced data.

    Two passes are made over ``raw_node_data``: one on the frames as given and
    one on copies under-sampled to equal "yes"/"no" counts. For each pass the
    similar node pairs are computed; when there are any, the model data is
    selected and ``test_hypothesis`` is run on it.

    Parameters
    ----------
    raw_node_data : iterable of pandas.DataFrame
        One frame per node; each needs a ``label`` column with "yes"/"no" values.
    random_state : optional
        Forwarded to every ``DataFrame.sample`` call used for balancing. The
        default ``None`` keeps the previous unseeded behaviour; pass an int
        (or a NumPy random generator) to make the balanced pass reproducible.

    Returns
    -------
    pandas.DataFrame
        The concatenated ``test_hypothesis`` results with two extra columns,
        ``asmmd`` and ``balanced``; an empty frame when neither pass produced
        similar pairs.
    """
    results = []
    for balanced in [False, True]:
        if balanced:
            clf_name = "lr"
            rnd = [_undersample_to_balance(df, random_state) for df in raw_node_data]
        else:
            # NOTE(review): the raw (imbalanced) pass uses "lr_balanced" and the
            # under-sampled pass uses "lr" - presumably the classifier compensates
            # for imbalance itself in the raw case; confirm in hypothesis_testing.
            clf_name = "lr_balanced"
            rnd = raw_node_data

        (node_data, similar_pairs, similar_nodes,
         asmmd, mmd_scores, ocsvm_scores) = get_similar_pairs_nodes(rnd)

        # Nothing to test when no pair of nodes was found to be similar.
        if not similar_pairs:
            continue

        print(f"{similar_pairs} (balanced={balanced})", end=" ")
        model_data = select_model_data(node_data, similar_nodes)
        result_df = test_hypothesis(
            clf_name, model_data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores
        )
        # Tag every row with the pass-level values.
        result_df["asmmd"] = [asmmd] * result_df.shape[0]
        result_df["balanced"] = [balanced] * result_df.shape[0]
        results.append(result_df)

    if results:
        return pd.concat(results, ignore_index=True)
    return pd.DataFrame()

In [13]:
# Try the pipeline on the first sample only; the DataFrame returned by
# get_results is shown as this cell's output.
get_results(samples[0])

[('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) 1348 210 1138
1348 200 1148
1348 215 1133
1348 518 830
1348 519 829
[('pi1', 'pi2'), ('pi1', 'pi3'), ('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 

Unnamed: 0,model_node,model,model_r2,train_time,optimisation_time,test_node,discrepancy,model_r2-d,test_r2,mmd_score,ocsvm_score,asmmd,balanced
0,pi1,LogisticRegression(max_iter=100000),0.91,0.15,10.95,pi3,0.01,0.901896,0.904369,0.07,0.01,0.116081,False
1,pi3,"LogisticRegression(C=0.1, max_iter=100000, sol...",0.9,0.03,3.65,pi1,0.01,0.91358,0.914403,0.07,0.05,0.116081,False
2,pi3,"LogisticRegression(C=0.1, max_iter=100000, sol...",0.9,0.03,3.65,pi2,0.03,0.931575,0.929101,0.07,0.99,0.116081,False
3,pi2,LogisticRegression(max_iter=100000),0.93,0.05,4.31,pi3,0.03,0.901896,0.904369,0.07,0.99,0.116081,False
4,pi1,LogisticRegression(max_iter=100000),0.87,0.03,2.33,pi2,0.0,0.867036,0.869806,0.27,0.02,0.336352,True
5,pi1,LogisticRegression(max_iter=100000),0.87,0.03,2.33,pi3,0.0,0.870801,0.875969,0.12,0.01,0.336352,True
6,pi2,LogisticRegression(max_iter=100000),0.87,0.07,2.14,pi1,0.0,0.870712,0.870712,0.27,0.13,0.336352,True
7,pi2,LogisticRegression(max_iter=100000),0.87,0.07,2.14,pi3,0.0,0.873385,0.875969,0.19,0.96,0.336352,True
8,pi3,"LogisticRegression(C=0.1, max_iter=100000, sol...",0.88,0.01,2.21,pi1,0.01,0.862797,0.870712,0.12,0.08,0.336352,True
9,pi3,"LogisticRegression(C=0.1, max_iter=100000, sol...",0.88,0.01,2.21,pi2,0.01,0.867036,0.869806,0.19,0.99,0.336352,True


In [15]:
def run(samples, output_dir="results/bank-marketing"):
    """Compute and save the results of every sample.

    For each entry of ``samples`` (numbered from 1) this prints a
    ``Sample <n>: `` progress prefix, calls ``get_results`` on the entry and
    writes the returned frame to ``<output_dir>/sample_<n>.csv`` without the
    index column.

    Parameters
    ----------
    samples : sequence
        One entry per sample; each entry is passed to ``get_results`` as is.
    output_dir : str, optional
        Folder receiving the CSV files. It is created when missing.
    """
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    # Iterate over what was actually passed in rather than a fixed count of 100,
    # so shorter or longer sample lists are handled correctly.
    for sample_id, node_data in enumerate(samples, start=1):
        print(f"Sample {sample_id}", end=": ")
        results = get_results(node_data)
        results.to_csv(os.path.join(output_dir, f"sample_{sample_id}.csv"), index=False)
        # Terminate the progress line that get_results may have appended to.
        print()

In [16]:
# Process every loaded sample: run() prints one progress line per sample and
# writes each sample's results to its own CSV file.
run(samples)

Sample 1: [('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) [('pi1', 'pi2'), ('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 2: [('pi2', 'pi3')] (balanced=False) [('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 3: [('pi1', 'pi2'), ('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) [('pi1', 'pi2'), ('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=True) 
Sample 4: [('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) [('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 5: [('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) [('pi1', 'pi2'), ('pi1', 'pi3'), ('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 6: [('pi1', 'pi2'), ('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) [('pi1', 'pi2'), ('pi1', 'pi3'), ('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 7: [('pi1', 'pi3'), ('pi2', 'pi3')] (balanced=False) [('pi1', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 8: [('pi2', 'pi3')] (balanced=False) [('pi1', 'pi3'), ('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) 
Sample 9