In [1]:
# Only needed when the kernel runs from a custom virtual environment: this cell tells Python where to look for that environment's installed packages.
import sys
import os
import pandas as pd
# Move from the notebook folder up to the project directory, but only once per
# kernel session: os.chdir("..") is relative, so re-running this cell without the
# guard would climb one more directory level every time.
if "PROJECT_DIR" not in globals():
    os.chdir("..")
    PROJECT_DIR = os.getcwd()
# switch to the name of your virtual environment
kernel_name = ".venv_mp"
# "Lib/site-packages" is the layout of a Windows virtual environment.
# NOTE(review): a POSIX venv uses lib/pythonX.Y/site-packages instead — adjust if needed.
site_packages = os.path.join(PROJECT_DIR, kernel_name, "Lib", "site-packages")
# Avoid piling up duplicate entries when the cell is executed again.
if site_packages not in sys.path:
    sys.path.append(site_packages)

In [6]:
from node import create_samples, save_samples
# Path of the reduced raw node data, resolved against the current working directory.
# os.path.join picks the right separator instead of gluing the parts with "//".
directory = os.path.join(os.getcwd(), "data", "bank-marketing", "reduced", "raw_node_data.csv")
# The first CSV row is the header and the first column is used as the index.
raw_data = pd.read_csv(directory, header=0, index_col=0)
# Number of samples requested from create_samples.
n_samples = 100
samples = create_samples(n_samples, raw_data)
# NOTE(review): presumably persists the samples under the "bank" dataset folder so the
# loading cell can read them back — confirm against node.save_samples.
save_samples(samples, "bank")

In [2]:
from node import get_node_data
# Folder with the per-sample CSV files, resolved against the current working directory.
directory = os.path.join(os.getcwd(), "data", "bank-marketing", "samples")
# Files are numbered from 1 (sample_1.csv ... sample_100.csv); every file is read
# with pandas and handed to get_node_data.
samples = [
    get_node_data(pd.read_csv(os.path.join(directory, f"sample_{number}.csv")))
    for number in range(1, 101)
]

In [9]:
from modelling import select_model_data  
from similar import get_similar_pairs_nodes
from hypothesis_testing import test_hypothesis

def _balance_labels(df, random_state=None):
    """Return a shuffled copy of ``df`` with equally many "yes" and "no" rows.

    The larger of the two label groups is randomly down-sampled to the size of
    the smaller one; rows with any other label value are dropped.
    """
    yes = df.loc[df.label == "yes"]
    no = df.loc[df.label == "no"]
    if len(yes) < len(no):
        parts = [yes, no.sample(len(yes), random_state=random_state)]
    else:
        parts = [no, yes.sample(len(no), random_state=random_state)]
    # sample(frac=1) shuffles the rows so the two classes are interleaved.
    return pd.concat(parts).sample(frac=1, random_state=random_state).reset_index(drop=True)


def get_results(raw_node_data, random_state=None):
    """Run the hypothesis test on one sample, first on the raw and then on class-balanced node data.

    Parameters
    ----------
    raw_node_data : iterable of pandas.DataFrame
        One frame per node; each needs a ``label`` column with "yes"/"no" values.
    random_state : int, numpy random state/generator, or None, optional
        Forwarded to every ``DataFrame.sample`` call used for balancing. ``None``
        (the default) keeps the previous unseeded behaviour; an int makes the
        balanced pass reproducible (the same seed is reused for every draw).

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``test_hypothesis`` for each pass that produced
        similar pairs, concatenated, with two extra columns: ``asmmd`` and
        ``balanced``. An empty DataFrame when neither pass found similar pairs.
    """
    results = []
    for balanced in [False, True]:
        # Work on a separate list so the caller's argument is never rebound.
        if balanced:
            candidate_nodes = [_balance_labels(df, random_state) for df in raw_node_data]
        else:
            candidate_nodes = raw_node_data

        node_data, similar_pairs, similar_nodes, asmmd, mmd_scores, ocsvm_scores = get_similar_pairs_nodes(candidate_nodes)

        # Nothing to test for this pass when no similar pairs were found.
        if similar_pairs == []:
            continue

        print(f"{similar_pairs} (balanced={balanced})", end=" ")
        model_data = select_model_data(node_data, similar_nodes)
        outcome = test_hypothesis("lr", model_data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores)
        # Tag every row with the pass-level values so both passes can share one frame.
        outcome["asmmd"] = [asmmd] * len(outcome)
        outcome["balanced"] = [balanced] * len(outcome)
        results.append(outcome)

    if results:
        return pd.concat(results, ignore_index=True)
    return pd.DataFrame()

In [6]:
# Try the pipeline on the first sample only; the returned DataFrame is displayed as the cell output.
get_results(samples[0])

[('pi1', 'pi6'), ('pi1', 'pi7'), ('pi3', 'pi4'), ('pi3', 'pi6'), ('pi3', 'pi7'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi4', 'pi6'), ('pi4', 'pi7'), ('pi4', 'pi8'), ('pi5', 'pi6'), ('pi5', 'pi7'), ('pi6', 'pi7'), ('pi6', 'pi8'), ('pi7', 'pi8')] (balanced=False) (35, 5) (1897, 5)
(348, 5) (1588, 5)
(41, 5) (1892, 5)
(30, 5) (1903, 5)
(28, 5) (1906, 5)
(37, 5) (1897, 5)
(20, 5) (1913, 5)
(35, 5) (1898, 5)
(1027, 5) (906, 5)
(337, 5) (1599, 5)
[('pi1', 'pi3'), ('pi1', 'pi4'), ('pi1', 'pi5'), ('pi1', 'pi6'), ('pi1', 'pi8'), ('pi2', 'pi10'), ('pi3', 'pi4'), ('pi3', 'pi5'), ('pi3', 'pi6'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi4', 'pi6'), ('pi4', 'pi8'), ('pi5', 'pi6'), ('pi5', 'pi7'), ('pi5', 'pi8'), ('pi6', 'pi8'), ('pi7', 'pi8')] (balanced=True) 

Unnamed: 0,model_node,model,model_r2,train_time,optimisation_time,test_node,discrepancy,model_r2-d,test_r2,mmd_score,ocsvm_score,asmmd,balanced
0,pi1,LogisticRegression(max_iter=100000),0.98,0.07,3.75,pi6,0.01,0.989653,0.989653,36.72,0.00,39.302441,False
1,pi1,LogisticRegression(max_iter=100000),0.98,0.07,3.75,pi7,0.00,0.981893,0.981893,40.99,0.00,39.302441,False
2,pi6,LogisticRegression(max_iter=100000),0.99,0.08,6.35,pi1,0.01,0.981884,0.981884,36.72,0.00,39.302441,False
3,pi6,LogisticRegression(max_iter=100000),0.99,0.08,6.35,pi3,0.01,0.984480,0.984480,36.96,0.00,39.302441,False
4,pi6,LogisticRegression(max_iter=100000),0.99,0.08,6.35,pi4,0.00,0.985522,0.985522,33.78,0.98,39.302441,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,pi8,"LogisticRegression(C=1, max_iter=100000, solve...",0.69,0.02,19.57,pi7,0.21,0.476190,0.809524,946.19,0.91,2136.912574,True
62,pi2,LogisticRegression(max_iter=100000),0.74,0.01,0.94,pi10,0.19,0.554487,0.690705,854.44,0.00,2136.912574,True
63,pi10,LogisticRegression(max_iter=100000),0.69,0.03,1.78,pi2,0.19,0.500000,0.743243,854.44,0.46,2136.912574,True
64,pi7,LogisticRegression(max_iter=100000),0.81,0.02,0.91,pi5,0.04,0.850746,0.835821,1603.21,0.87,2136.912574,True


In [17]:
def run(samples, start=75, stop=100, output_dir="results/bank-marketing"):
    """Compute and save the results for a contiguous range of samples.

    Parameters
    ----------
    samples : sequence
        Indexable collection; ``samples[i]`` is passed to ``get_results``.
    start, stop : int, optional
        Zero-based half-open index range to process. The defaults (75, 100)
        reproduce the previously hard-coded range, i.e. samples 76 to 100.
    output_dir : str, optional
        Folder that receives one ``sample_<n>.csv`` per processed sample, where
        ``n`` is the one-based sample number. Created if it does not exist.
    """
    # to_csv does not create missing folders, so make sure the target exists first.
    os.makedirs(output_dir, exist_ok=True)
    for sample_id in range(start, stop):
        sample_number = sample_id + 1
        # Progress line: get_results prints its similar pairs on the same line.
        print(f"Sample {sample_number}", end=": ")
        results = get_results(samples[sample_id])
        results.to_csv(os.path.join(output_dir, f"sample_{sample_number}.csv"), index=False)
        print()

In [18]:
# Process run()'s default range (samples 76-100); each sample's results are written to their own CSV file.
run(samples)

Sample 76: [('pi1', 'pi3'), ('pi1', 'pi6'), ('pi1', 'pi7'), ('pi1', 'pi8'), ('pi3', 'pi7'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi5', 'pi7'), ('pi5', 'pi8'), ('pi7', 'pi8')] (balanced=False) [('pi1', 'pi3'), ('pi1', 'pi4'), ('pi1', 'pi6'), ('pi1', 'pi7'), ('pi1', 'pi8'), ('pi2', 'pi10'), ('pi3', 'pi4'), ('pi3', 'pi5'), ('pi3', 'pi6'), ('pi3', 'pi7'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi4', 'pi6'), ('pi5', 'pi7'), ('pi6', 'pi7'), ('pi6', 'pi8'), ('pi7', 'pi8')] (balanced=True) 
Sample 77: [('pi1', 'pi3'), ('pi3', 'pi5'), ('pi3', 'pi6'), ('pi3', 'pi7'), ('pi3', 'pi8'), ('pi4', 'pi6'), ('pi4', 'pi7'), ('pi4', 'pi8'), ('pi5', 'pi6'), ('pi5', 'pi8'), ('pi6', 'pi7'), ('pi6', 'pi8'), ('pi7', 'pi8')] (balanced=False) [('pi1', 'pi3'), ('pi1', 'pi5'), ('pi1', 'pi6'), ('pi1', 'pi8'), ('pi2', 'pi10'), ('pi3', 'pi4'), ('pi3', 'pi5'), ('pi3', 'pi6'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi4', 'pi6'), ('pi4', 'pi8'), ('pi5', 'pi6'), ('pi5', 'pi8'), ('pi6', 'pi8')] (balanced=True) 
Sample 78: [('pi1', 'pi3

Sample 91: [('pi1', 'pi3'), ('pi1', 'pi4'), ('pi1', 'pi5'), ('pi1', 'pi6'), ('pi1', 'pi7'), ('pi1', 'pi8'), ('pi3', 'pi5'), ('pi3', 'pi7'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi5', 'pi7'), ('pi6', 'pi8'), ('pi7', 'pi8')] (balanced=False) [('pi1', 'pi3'), ('pi1', 'pi4'), ('pi1', 'pi5'), ('pi1', 'pi6'), ('pi1', 'pi7'), ('pi1', 'pi8'), ('pi2', 'pi10'), ('pi3', 'pi4'), ('pi3', 'pi5'), ('pi3', 'pi6'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi4', 'pi6'), ('pi4', 'pi7'), ('pi4', 'pi8'), ('pi5', 'pi6'), ('pi5', 'pi7'), ('pi5', 'pi8'), ('pi6', 'pi8')] (balanced=True) 
Sample 92: [('pi1', 'pi3'), ('pi1', 'pi7'), ('pi3', 'pi5'), ('pi3', 'pi6'), ('pi3', 'pi7'), ('pi3', 'pi8'), ('pi4', 'pi5'), ('pi4', 'pi6'), ('pi4', 'pi8'), ('pi5', 'pi6'), ('pi5', 'pi7'), ('pi5', 'pi8'), ('pi6', 'pi7'), ('pi6', 'pi8'), ('pi7', 'pi8')] (balanced=False) [('pi1', 'pi3'), ('pi1', 'pi4'), ('pi1', 'pi5'), ('pi1', 'pi6'), ('pi1', 'pi7'), ('pi1', 'pi8'), ('pi2', 'pi10'), ('pi3', 'pi4'), ('pi3', 'pi6'), ('pi3', 'pi7'), ('pi3', '