In [27]:
import numpy as np # linear algebra
import pandas as pd 
from sklearn.preprocessing import StandardScaler

path = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label"]

data = pd.read_csv(path, names=column_names, index_col=False)
    

#data.reset_index(drop=True, inplace=True) 
#data = data.dropna(how = 'all')
#data = data[data.age != '?']

data.head()


from sklearn import preprocessing
# one-hot encoding for categorical vars
for feat in ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country',"label"]:
    data[feat] = preprocessing.LabelEncoder().fit_transform(data[feat])

#standardized continous data
continuous = ['fnlwgt', 'education-num','capital-gain','capital-loss','hours-per-week']
data[continuous] = StandardScaler().fit_transform(data[continuous])
#####################################################
#### For this experiment, we uniquely drop the default variable (prior default)
###################################################
#data = data.drop(['educationlevel'], axis=1)   
data.to_csv("adult_data.csv",index=False)
data.head() 

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,7,-1.063611,9,1.134739,4,1,1,4,1,0.148453,-0.21666,-0.035429,39,0
1,50,6,-1.008707,9,1.134739,2,4,0,4,1,-0.14592,-0.21666,-2.222153,39,0
2,38,4,0.245079,11,-0.42006,0,6,1,4,1,-0.14592,-0.21666,-0.035429,39,0
3,53,4,0.425801,1,-1.197459,2,6,0,2,1,-0.14592,-0.21666,-0.035429,39,0
4,28,4,1.408176,9,1.134739,2,10,5,2,0,-0.14592,-0.21666,-0.035429,5,0


In [37]:
new_data = pd.read_csv("adult_data.csv")
new_data = new_data.values
X = new_data[:, :14].astype(np.uint32)
y = new_data[:, 14].astype(np.uint8)
print(X[0])

[                  39                    7 18446744073709551615
                    9                    1                    4
                    1                    1                    4
                    1                    0                    0
                    0                   39]


In [12]:
from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm()

from pycausal import prior as p
prior = p.knowledge(addtemporal = [['sex', 'age', 'race', 'native-country'], ['education', 'education-num', 'marital-status', 'occupation',
                                    'hours-per-week', 'workclass', 'relationship', 'capital-gain', 'capital-loss'], ['income']])

from pycausal import search as s
tetrad = s.tetradrunner()
tetrad.run(algoId = 'fges', scoreId = 'cg-bic-score', dfs = data, priorKnowledge = prior,
           maxDegree = -1, faithfulnessAssumed = True, verbose = False)
tetrad.getEdges()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [9]:
edges = []
for edge in tetrad.getEdges():
    edges.append(list([column_names.index(edge.split(' ')[0]), column_names.index(edge.split(' ')[-1])]))

print(edges )

[[9, 8], [3, 6], [0, 6], [9, 12], [4, 11], [0, 10], [12, 5], [8, 7], [8, 5], [8, 14], [4, 6], [0, 5], [0, 4], [12, 1], [4, 10], [9, 3], [2, 4], [4, 3], [0, 11], [2, 0], [1, 6], [7, 6], [7, 14], [3, 5], [9, 4], [8, 1], [12, 10], [7, 12], [14, 11], [2, 9], [12, 14], [11, 10], [7, 4], [14, 4], [4, 1], [13, 8], [14, 5], [0, 7], [0, 14], [14, 3], [14, 6], [9, 1], [1, 5], [14, 10], [9, 0], [13, 3], [13, 2], [9, 7], [13, 4], [9, 6], [9, 14], [12, 4], [7, 5]]


In [8]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
