In [1]:
# do this if you use a custom virtual environment to specify the directory to look for installed packages
import sys
import os
import pandas as pd
## goes back to the project directory
os.chdir("..")
# switch to the name of your virtual environment
kernel_name = ".venv_mp"
sys.path.append("\\".join([os.getcwd(), kernel_name, "Lib\\site-packages"]))

In [None]:
from node import create_samples, save_samples
directory = "//".join([os.getcwd(), "data", "bank-marketing", "reduced", "raw_node_data.csv"])
raw_data = pd.read_csv(directory, header =0, index_col = 0)
n_samples = 100
samples = create_samples(n_samples, raw_data)
save_samples(samples, "bank")

In [2]:
from node import get_node_data
directory = "//".join([os.getcwd(), "data", "bank-marketing", "samples"])
samples = [get_node_data(pd.read_csv(directory + f"//sample_{i+1}.csv")) for i in range(100)]

In [11]:
from modelling import select_model_data  
from similar import get_similar_pairs_nodes
from hypothesis_testing import test_hypothesis

def get_results(raw_node_data):
    results = []
    for balanced in [True, False]:
        if balanced:
            balanced_node_data = []
            for df in raw_node_data:
                yes = df.loc[df.label == "yes"]
                no = df.loc[df.label == "no"]
                if yes.shape[0] < no.shape[0]:
                    balanced_df = [yes, no.sample(yes.shape[0])]
                else:
                    balanced_df = [no, yes.sample(no.shape[0])]
                balanced_node_data.append(pd.concat(balanced_df).sample(frac=1).reset_index(drop=True))
            raw_node = balanced_node_data
                
        node_data, similar_pairs, similar_nodes, asmmd, mmd_scores, ocsvm_scores = get_similar_pairs_nodes(raw_node_data)
        
        if similar_pairs != []:
            print(f"{similar_pairs} (balanced={balanced})", end=" ")
            model_data = select_model_data(node_data, similar_nodes)
            df = test_hypothesis("lr", model_data, similar_pairs, similar_nodes, mmd_scores, ocsvm_scores)
            df["asmmd"] = [asmmd] * df.shape[0]
            df["balanced"] = [balanced] * df.shape[0]
            results.append(df)
    if results != []:
        return pd.concat(results, ignore_index = True)
    else:
        return pd.DataFrame()

In [12]:
def run(samples):
    for sample_id in range(100):
        print(f"Sample {sample_id+1}", end=": ")
        results = get_results(samples[sample_id])
        results.to_csv(f"results/bank-marketing/sample_{sample_id+1}.csv", index=False)
        print()

In [13]:
run(samples)

Sample 1: [('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=True) [('pi2', 'pi3'), ('pi4', 'pi5')] (balanced=False) 
Sample 2: 
Sample 3: [('pi2', 'pi3'), ('pi2', 'pi5'), ('pi3', 'pi5'), ('pi4', 'pi5')] (balanced=True) [('pi2', 'pi3'), ('pi2', 'pi5'), ('pi3', 'pi5'), ('pi4', 'pi5')] (balanced=False) 
Sample 4: [('pi2', 'pi3'), ('pi3', 'pi4'), ('pi3', 'pi5'), ('pi4', 'pi5')] (balanced=True) [('pi2', 'pi3'), ('pi3', 'pi4'), ('pi3', 'pi5')] (balanced=False) 
Sample 5: 
Sample 6: [('pi3', 'pi4')] (balanced=True) [('pi3', 'pi4')] (balanced=False) 
Sample 7: [('pi2', 'pi3'), ('pi2', 'pi4'), ('pi3', 'pi4')] (balanced=True) [('pi2', 'pi3'), ('pi3', 'pi4')] (balanced=False) 
Sample 8: [('pi2', 'pi3'), ('pi3', 'pi5')] (balanced=False) 
Sample 9: [('pi2', 'pi4')] (balanced=True) [('pi2', 'pi4')] (balanced=False) 
Sample 10: [('pi2', 'pi5'), ('pi3', 'pi5')] (balanced=True) [('pi2', 'pi5'), ('pi3', 'pi5')] (balanced=False) 
Sample 11: [('pi2', 'pi4'), ('pi3', 'pi4')] (balanced=True) 
Sample 12: [('pi2', '

Sample 100: [('pi2', 'pi3')] (balanced=True) [('pi2', 'pi3'), ('pi2', 'pi5'), ('pi3', 'pi5')] (balanced=False) 
