In [9]:
import os,sys
cwd = os.path.abspath(os.path.curdir)
sys.path.append(cwd)  # workplace
import networkx as nx
import numpy as np
import pandas as pd
from dowhy import gcm
from card_gt.inter_gt import *
from utils.utils import get_args


def one_hot_encode_to_boolean(number, num_classes=51):
    """
    One-hot encode a number as boolean array.
    
    Args:
    - number: The number to encode.
    - num_classes: The total number of classes.
    
    Returns:
    - one_hot_bool: The one-hot encoded boolean array.
    """
    one_hot_bool = np.zeros(num_classes, dtype=bool)
    one_hot_bool[number] = True
    return ~one_hot_bool


def cf_gen(gcm, causal_model, inv_dim, inv_data, obs_data):
    samples = gcm.counterfactual_samples(causal_model,
                                         {inv_dim: lambda y: inv_data},
                                         observed_data=obs_data)
    return samples.to_numpy()

def interv_gen(gcm, causal_model,inv_dim,inv_data,sz=1000):
    samples = gcm.interventional_samples(causal_model,
                                        {inv_dim: lambda y: inv_data},
                                        num_samples_to_draw=sz)
    return samples.to_numpy()


def train_scm_model(adj_df,data_df):
    causal_graph = nx.from_numpy_array(adj_df.to_numpy(), create_using=nx.DiGraph)
    causal_model = gcm.InvertibleStructuralCausalModel(causal_graph)
    nrow,_ = adj_df.shape

    data = data_df.iloc[:,:nrow]
    

    data.dropna(inplace=True)

    data.columns = list(causal_graph.nodes)
    dag,nodes = adj2dag(adj_df.to_numpy())
    cg_nodes = list(causal_graph.nodes)

    gcm.auto.assign_causal_mechanisms(causal_model, data)

    for ind,node in enumerate(list(cg_nodes)):
        if len(dag.get_parents(nodes[ind])) == 0 :
            causal_model.set_causal_mechanism(node, gcm.EmpiricalDistribution())
        else:     
            causal_model.set_causal_mechanism(node, gcm.AdditiveNoiseModel(gcm.ml.create_linear_regressor()))

    gcm.fit(causal_model, data)
    return gcm,causal_model,causal_graph

def mae_mean_cf(gcm_gt, causal_model_gt,gcm_syn, causal_model_syn,data_df,sz=1000):
    intervention_ls = np.random.randn(51)*5
    mae_dims = []
    for inv_dim in range(51):
        for itvn in intervention_ls:
            index = np.arange(0,len(data_df.iloc[:,0]))
            np.random.shuffle(index)
            AE = np.abs(np.mean(cf_gen(gcm_gt, causal_model_gt,inv_dim,itvn,data_df.iloc[index[:sz]]),axis=0)-\
                        np.mean(cf_gen(gcm_syn, causal_model_syn,inv_dim,itvn,data_df.iloc[index[:sz]]),axis=0))
            MAE_i = np.mean(AE[one_hot_encode_to_boolean(inv_dim)])
            mae_dims.append(MAE_i)
    return np.mean(mae_dims)

  from .autonotebook import tqdm as notebook_tqdm


# Causal inference questions and answers: 51 nodes

In [10]:
import numpy as np
import pandas as pd

for sindex in range(1,11):
    np.random.seed(77)
    intervention_ls = np.random.randn(51)*5
    dataname = 'sim_lu'

    data_path = f'/Users/ruibo/Documents/Codes/CauTabBench/data/{dataname}/{sindex}/generated_graph_data.csv'
    data_df = pd.read_csv(data_path)
    adj_path = f'./data/{dataname}/{sindex}/generated_graph_target.csv'
    adj_df = pd.read_csv(adj_path)


    gcm_gt ,causal_model_gt, causal_graph_gt = train_scm_model(adj_df,data_df)



    answers = []
    with open(f'./card_gt/data/table/counterfactual_questions_{sindex}.txt', 'w') as file:
        for v_i in range(51):
            v_o = np.random.randint(0, high=51, size=1, dtype=int)[0]
            int_i = intervention_ls[v_i]        
            
            input_sample = 'Given the values of V0 to V9 are' 
            for i in range(51):
                input_sample += f' {data_df.loc[0][i]:.3f},'    
            
            question = input_sample + f' what is the expectation of the distribution of V{v_o} if the value of V{v_i} is {int_i}?\n'
            
            file.write(question)
        
            data = data_df.iloc[:1,:51]
            data.columns = list(causal_graph_gt.nodes)
            samples = gcm_gt.counterfactual_samples(causal_model_gt, {v_i: lambda y: int_i},observed_data=data)[v_o]
            answers.append(samples)

    with open(f'./card_gt/data/table/counterfactual_answers_{sindex}.txt', 'w') as file:
        for i in answers:
            file.write(f"{i[0]}\n")



Fitting causal mechanism of node 50: 100%|██████████| 51/51 [00:00<00:00, 677.98it/s]
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' 

# Causal inference questions and answers: 10 nodes

In [39]:
import numpy as np
import pandas as pd

for sindex in range(100,110):
    np.random.seed(77)
    intervention_ls = np.random.randn(10)*5
    dataname = 'sim_lu'

    data_path = f'/Users/ruibo/Documents/Codes/CauTabBench/data/{dataname}/{sindex}/generated_graph_data.csv'
    data_df = pd.read_csv(data_path)
    adj_path = f'./data/{dataname}/{sindex}/generated_graph_target.csv'
    adj_df = pd.read_csv(adj_path)
    gcm_gt ,causal_model_gt, causal_graph_gt = train_scm_model(adj_df,data_df)

    answers = []
    with open(f'./card_gt/data/table/counterfactual_questions_{sindex}.txt', 'w') as file:
        for v_i in range(10):
            v_o = np.random.randint(0, high=10, size=1, dtype=int)[0]
            int_i = intervention_ls[v_i]        
            
            input_sample = 'Given the values of V0 to V9 are' 
            for i in range(10):
                input_sample += f' {data_df.loc[0][i]:.3f},'    
            
            question = input_sample + f' what is the expectation of the distribution of V{v_o} if the value of V{v_i} is {int_i}?\n'
            
            file.write(question)
        
            data = data_df.iloc[:1,:10]
            data.columns = list(causal_graph_gt.nodes)
            samples = gcm_gt.counterfactual_samples(causal_model_gt, {v_i: lambda y: int_i},observed_data=data)[v_o]
            answers.append(samples)

    with open(f'./card_gt/data/table/counterfactual_answers_{sindex}.txt', 'w') as file:
        for i in answers:
            file.write(f"{i[0]}\n")



Fitting causal mechanism of node 9: 100%|██████████| 10/10 [00:00<00:00, 217.24it/s]
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
Fitting causal mechanism of node 9: 100%|██████████| 10/10 [00:00<00:00, 910.34it/s]
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.loc[0][i]:.3f},'
  input_sample += f' {data_df.l

# Causal inference questions: intervention distribution with 10 nodes

In [None]:
import numpy as np
np.random.seed(77)
intervention_ls = np.random.randn(10)*5

dataname = 'lu'
with open(f'./card_gt/data/table/intervention_questions.txt', 'w') as file:

    for v_i in range(10):
        v_o = np.random.randint(0, high=10, size=1, dtype=int)[0]
        int_i = intervention_ls[v_i]        
        question = f'Given that the intervention value on V{v_i} is {int_i}, what is the expectation of the interventional distribution of V{v_o}? \n'
        file.write(question)





# Causal inference questions: intervention distribution with 51 nodes

In [5]:

dataname= 'sim_lu' # lg lu sg nn
mae_m = []
sindex = 1

data_path = f'./data/{dataname}/{sindex}/generated_graph_data.csv'
data_df = pd.read_csv(data_path)
adj_path = f'./data/{dataname}/{sindex}/generated_graph_target.csv'
adj_df = pd.read_csv(adj_path)
n_nodes,_ = adj_df.shape

In [6]:
import numpy as np
np.random.seed(77)
intervention_ls = np.random.randn(n_nodes)*5


dataname = 'lu'
with open(f'./card_gt/data/table/intervention_questions.txt', 'w') as file:

    for v_i in range(n_nodes):
        v_o = np.random.randint(0, high=n_nodes, size=1, dtype=int)[0]
        int_i = intervention_ls[v_i]        
        question = f'Given that the intervention value on V{v_i} is {int_i}, what is the expectation of the interventional distribution of V{v_o}? \n'
        file.write(question)





# Interventional eva

In [11]:
import re
def remove_quotes(input_str):
    # Find the start of the "answer" field
    start_index = input_str.find('"answer": "') + len('"answer": "')
    
    # Find the end of the answer field (next double quote after the value)
    # end_index = input_str.find('"', start_index)
    
    # Extract the text after "answer": 
    answer_text = input_str[start_index:].replace('"', '')
    
    return input_str[:start_index] + answer_text[:-2] +'"}'

def remove_quotes_in_answer(input_str):
    # Regex pattern to match the "answer" field content and remove quotes inside it
    modified_str = re.sub(r'("answer":\s*")([^"]*)"', lambda m: m.group(1) + m.group(2).replace('"', '') + '"', input_str)
    return modified_str

In [12]:
import json
prefix = 'graph'
graph_id = 1
llm = 'llama'

result_path = 'result'
input_file = f'./card_gt/{result_path}/{llm}/{prefix}_dsep_response_i2_lu{graph_id}.txt'
output_file = f'./card_gt/{result_path}/{llm}/{prefix}_dsep_response_i2_lu{graph_id}.json'

with open(input_file, 'r') as file:
    text = file.read()
# Split text into individual question-answer blocks

def extract_yes_no(text):
    # Use regular expressions to find occurrences of "yes" or "no" in the text
    yes_no_responses = re.findall(r'\b(yes|no)\b', text, flags=re.IGNORECASE)
    
    # Normalize the responses to lower case (or capitalize as needed)
    normalized_responses = [response.capitalize() for response in yes_no_responses]
    
    return normalized_responses



In [8]:
import numpy as np

import json
import re
pattern = r'V\d+'
pattern2 = r'\d+%'

def mae(gt,pred):
    gt = np.array(gt) 
    pred = np.array(pred)
    gt = gt[~np.isnan(pred)]
    pred = pred[~np.isnan(pred)]
    gt = gt[np.abs(pred)<100]
    pred  = pred[np.abs(pred)<100]

    return np.mean(np.abs(gt - pred))
eva_ls = []

llm = 'llama'
for sim_id in range(1,11):
    print(f'{sim_id}: ')
    with open(f'/Users/ruibo/Documents/Codes/CauTabBench/card_gt/data/table/intervention_{sim_id}_gt.txt', 'r') as file:    
        anw_gt = file.readlines()
    anw_gt = [float(a.replace('\n', '')) for a in anw_gt]


    answers = []
    with open(f'card_gt/result/{llm}/table_intv_response_lu{sim_id}.txt', 'r') as file:    
        json_data_ls = file.readlines()

    # Parse the JSON data from the string
    for js_data in json_data_ls:
        js_data = remove_quotes(js_data)
        js_data = js_data.replace("\\", "")
        data = json.loads(js_data)
        
        
        # Remove all matches from the text

        modified_text = re.sub(pattern, '', data['answer'])
        
        modified_text = re.sub(pattern2, '', modified_text)
        
        numbers  = re.findall(r"[-+]?(?:\d*\.*\d+)",modified_text)
        print(f'{sim_id}: {numbers}')

        if len(numbers) == 0:
            numbers = np.nan
        elif len(numbers) == 1:
            numbers = float(numbers[0])
        else: 
            numbers_ls = []
            for i in numbers:
                numbers_ls.append(float(i))
            numbers = np.mean(np.array(numbers_ls))
        
        
        answers.append(numbers)
        
    # print(answers)
    print(mae(anw_gt,answers))
    eva_ls.append(mae(anw_gt,answers))
# eva_ls
print(f'{np.mean(eva_ls):.3f} \pm {np.std(eva_ls):.3f}')

1: 
1: ['-0.6828']
1: ['0.1']
1: ['1.0']
1: ['1.0']
1: ['1.5338']
1: ['4.0']
1: ['0.42']
1: ['0.28494']
1: ['1.008165']
1: ['0.0']
1: ['0.0']
1: ['4.0']
1: ['7.0041']
1: ['0.092']
1: ['0.5424']
1: ['-0.392']
1: ['0.0']
1: ['1.094']
1: ['0.59']
1: ['0.58']
1: ['0.54']
1: ['7.159']
1: ['-0.59']
1: ['0.9']
1: ['-1.98']
1: ['1.0']
1: ['1.0']
1: ['1.0']
1: ['-1.5428']
1: ['0.7']
1: ['9.0']
1: ['7.0']
1: ['0.17']
1: ['-1.2']
1: ['0.42']
1: ['7.6828']
1: ['0.0', '0.0']
1: ['1.139246']
1: ['5.0']
1: ['-1.58898']
1: ['0.084']
1: ['-0.59']
1: ['-1.24']
1: ['-1.52']
1: ['0.1']
1: ['0.17918']
1: ['8.1426']
1: ['-0.653']
1: ['-0.65']
1: ['58.0']
1: ['-1.38']
2.8733104609587676
2: 
2: ['62.0']
2: ['1.78656']
2: ['0.62']
2: ['62.0']
2: ['0.62']
2: ['0.7856']
2: ['0.7858']
2: ['98']
2: ['1.56']
2: ['1.86']
2: ['0.6237']
2: ['1.5337']
2: ['62.62']
2: ['0.87']
2: ['7.0']
2: ['62.0']
2: ['-2.82', '-2.82']
2: ['85.0']
2: ['60.0']
2: ['62.0']
2: ['0.78667']
2: ['62.0', '.2', '.4', '.0', '.0', '.8', '0.1', 

# CF eva

In [15]:

import numpy as np

import json
import re
pattern = r'V\d+'
pattern2 = r'\d+%'

# Remove all matches from the text

def mae(gt,pred):
    gt = np.array(gt) 
    pred = np.array(pred)
    gt = gt[~np.isnan(pred)]
    pred = pred[~np.isnan(pred)]
    gt = gt[np.abs(pred)<100]
    pred  = pred[np.abs(pred)<100]

    return np.mean(np.abs(gt - pred))
eva_ls = []

llm = 'mistral'
for sim_id in range(1,11):

    with open(f'/Users/ruibo/Documents/Codes/CauTabBench/card_gt/data/table/counterfactual_answers_{sim_id}.txt', 'r') as file:    
        anw_gt = file.readlines()
    anw_gt = [float(a.replace('\n', '')) for a in anw_gt]


    answers = []
    with open(f'card_gt/result/{llm}/table_cf_response_lu{sim_id}.txt', 'r') as file:    
        json_data_ls = file.readlines()

    # Parse the JSON data from the string
    for js_data in json_data_ls:
        
        js_data = remove_quotes(js_data)
        # js_data = js_data.replace("\n", "")

        js_data = js_data.replace("\\", "")
        print(js_data)
        data = json.loads(js_data)
        # Remove all matches from the text

        modified_text = re.sub(pattern, '', data['answer'])
        modified_text = re.sub(pattern2, '', modified_text)
        numbers  = re.findall(r"[-+]?(?:\d*\.*\d+)",modified_text)
        print(f'{sim_id}: {numbers}')
        if len(numbers) == 0:
            numbers = np.nan
        elif len(numbers) == 1:
            numbers = float(numbers[0])
        else: 
            numbers_ls = []
            try:
                for i in numbers:
                    numbers_ls.append(float(i))
                numbers = np.mean(np.array(numbers_ls))
            except ValueError:
                numbers = np.nan
        answers.append(numbers)
    # print(answers)
    print(mae(anw_gt,answers))
    eva_ls.append(mae(anw_gt,answers))
# eva_ls
print(f'{np.mean(eva_ls):.3f} \pm {np.std(eva_ls):.3f}')

{"question":"0.2434937746037162", "answer": " The final answer is 0.1975."}
1: ['0.1975']
{"question":"-0.7991788375513162", "answer": " The final answer is 49."}
1: ['49']
{"question":"-0.5660213006565025", "answer": " The final answer is 4.9."}
1: ['4.9']
{"question":"-0.2992603342754523", "answer": " The final answer is 4987.63."}
1: ['4987.63']
{"question":"-0.2992603342754523", "answer": " The final answer is 0.897654321."}
1: ['0.897654321']
{"question":"0.8274120601682686", "answer": " The final answer is 0.9729."}
1: ['0.9729']
{"question":"0.8744077999384204", "answer": " The final answer is 0.0, as there was no valid input provided for the model to parse."}
1: ['0.0']
{"question":"-0.2992603342754523", "answer": " The final answer is 0.5454545454545455."}
1: ['0.5454545454545455']
{"question":"-0.2992603342754523", "answer": " The final answer is 154.5."}
1: ['154.5']
{"question":"0.3124048954591789", "answer": " The final answer is 0.0."}
1: ['0.0']
{"question":"1.6814124953

# Cdir questions

In [6]:
import os,sys
cwd = os.path.abspath(os.path.curdir)
sys.path.append(cwd)  # workplace
import numpy as np
import pandas as pd

import pandas as pd
from causallearn.graph.GraphNode import GraphNode
import copy
from causallearn.graph.Dag import Dag
import networkx as nx


def ini_nodes(adj_df):
    nodes = []
    for i in range(len(adj_df[0,:])):
        nodes.append(GraphNode(str(i)))
    return nodes


def adj2dag(adj_df):
    G = nx.from_numpy_array(adj_df, create_using=nx.DiGraph)
    nodes = ini_nodes(adj_df)
    dag = Dag(nodes)
    for i,j in list(G.edges()):
        dag.add_directed_edge(nodes[i], nodes[j])
    return dag,nodes

def remove_edge(index_x, index_y,nodes, dag):
    dag_rm = copy.deepcopy(dag)
    dag_rm.remove_connecting_edge(nodes[index_x], nodes[index_y])
    return dag_rm

def get_all_xy_edges(dag,nodes):
    x_ls = []
    y_ls = []
    for e in list(dag.get_graph_edges()):        
        index_x = int(e.get_node1().get_name())
        index_y = int(e.get_node2().get_name())
        x_ls.append(index_x)
        y_ls.append(index_y)
    dir = np.array([x_ls,y_ls])
    return dir.T

def get_eva_xy_dirs(dag,nodes):
    x_ls = []
    y_ls = []
    for e in list(dag.get_graph_edges()):        
        index_x = int(e.get_node1().get_name())
        index_y = int(e.get_node2().get_name())
        dag_rm = remove_edge(index_x, index_y,nodes, dag)
        # print(e.get_node1(),e.get_node2(),dag_rm.is_dseparated_from(nodes[index_x],nodes[index_y],set()))
        if dag_rm.is_dseparated_from(nodes[index_x],nodes[index_y],set()):
            x_ls.append(index_x)
            y_ls.append(index_y)
    dir = np.array([x_ls,y_ls])
    return dir.T


In [7]:
dataname = 'lu'
for sim_seed in range(1,11):
    with open(f'./card_gt/data/table/cdir_{sim_seed}_questions.txt', 'w') as file:
        adj_path = f'./data/sim_{dataname}/{sim_seed}/generated_graph_target.csv'
        adj_df = pd.read_csv(adj_path)
        dag,nodes = adj2dag(adj_df.to_numpy())
        xy_edges = get_all_xy_edges(dag,nodes)
        eva_xy_dirs =  get_eva_xy_dirs(dag,nodes)
        for pair in eva_xy_dirs:    
            file.write(f'Between V{pair[0]} and V{pair[1]}, is V{pair[0]} the cause?\n')
            




# Graph reasoning: D-separation questions

In [3]:
def text_cz(cz):
    cz_text = ''
    for element in cz:
        cz_text += 'V' + str(element) 
    return cz_text


def text_ci(ciset):
    question_ls = []
    for cis in ciset:
        if len(cis[2]) == 0:
            ci_text = f'Are V{cis[0]} and V{cis[1]} independent from each other?'
        else:
            cz_text =text_cz(cis[2])
            ci_text = f'Are V{cis[0]} and V{cis[1]} conditionally independent given {cz_text}?'    
        question_ls.append(ci_text)
    return question_ls


def text_dsep(ciset):
    question_ls = []
    for cis in ciset:
        if len(cis[2]) == 0:
            ci_text = f'Are V{cis[0]} and V{cis[1]} d-seperated?'
        else:
            cz_text =text_cz(cis[2])
            ci_text = f'Are V{cis[0]} and V{cis[1]} d-seperated given {cz_text}?'    
        question_ls.append(ci_text)
    return question_ls

from card_gt.src.causal_eval.helper import get_sets


In [4]:
dataname = 'lu'
for sim_seed in range(1,11):
    adj_path = f'./data/sim_{dataname}/{sim_seed}/generated_graph_target.csv'
    adj_df = pd.read_csv(adj_path)
    _,_,conditional_independent_set,_,conditional_dependent_set = get_sets(adj_df.to_numpy())

    ci_sets = text_dsep(conditional_independent_set)
    nci_set = text_dsep(conditional_dependent_set)
    with open(f'./card_gt/data/graph/{sim_seed}_questions.txt', 'w') as file:
        for q in ci_sets:
            file.write(q)
            file.write('\n')
        for q in nci_set:
            file.write(q)
            file.write('\n')

    with open(f'./card_gt/data/graph/{sim_seed}_answers.txt', 'w') as file:
        for q in ci_sets:
            file.write('yes')
            file.write('\n')
        for q in nci_set:
            file.write('no')
            file.write('\n')

# Knowledge discovery: Conditional independence

In [5]:
dataname = 'lu'
for sim_seed in range(1,11):
    adj_path = f'./data/sim_{dataname}/{sim_seed}/generated_graph_target.csv'
    adj_df = pd.read_csv(adj_path)
    _,_,conditional_independent_set,_,conditional_dependent_set = get_sets(adj_df.to_numpy())

    ci_sets = text_ci(conditional_independent_set)
    nci_set = text_ci(conditional_dependent_set)
    with open(f'./card_gt/data/table/{sim_seed}_questions.txt', 'w') as file:
        for q in ci_sets:
            file.write(q)
            file.write('\n')
        for q in nci_set:
            file.write(q)
            file.write('\n')

    with open(f'./card_gt/data/table/{sim_seed}_answers.txt', 'w') as file:
        for q in ci_sets:
            file.write('yes')
            file.write('\n')
        for q in nci_set:
            file.write('no')
            file.write('\n')