In [166]:
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import pandas as pd

## Causally Suggestive Data- Fruits

In [521]:
colors = "red purple blue green".split()
textures = "smooth spiky rough".split()
shapes = "sphere cube star".split()

class Fruit: # attributes: color, texture, shape
    def __init__(self, color, texture, shape, poison_func):
        assert color in colors, "Invalid color"
        assert texture in textures, "Invalid texture"
        assert shape in shapes, "Invalid shape"
        self.color = color
        self.texture = texture
        self.shape = shape
        self.poison_func = poison_func
    
    def __str__(self):
        return(f"{self.color} {self.texture} {self.shape}")
    def __repr__(self):
        return(f"{self.color} {self.texture} {self.shape}")
    
    def is_poisonous(self): # establish ground truth about fruit based on attributes. 
        # Here select that spiky fruits poisonous
        return(self.poison_func(self))
    
    def to_row(self):
        # convert to one hot encoding of fruit that can be added to dataframe as row
        return([colors.index(self.color), textures.index(self.texture), shapes.index(self.shape), int(self.is_poisonous())])
    
    @staticmethod
    def generate_random(poison_func): 
        color = colors[random.randint(0, len(colors)-1)]
        texture = textures[random.randint(0, len(textures)-1)]
        shape = shapes[random.randint(0, len(shapes)-1)]
        return(Fruit(color, texture, shape, poison_func))
    


In [523]:
def is_poisonous(fruit): # establish ground truth about fruit based on attributes. 
    # Here select that spiky fruits poisonous
    return(fruit.texture == "spiky")


In [524]:
apple = Fruit1("red", "spiky", "sphere", is_poisonous)

In [525]:
apple.is_poisonous()

True

In [537]:
f = Fruit.generate_random(is_poisonous)
print(f)
f.is_poisonous()

purple spiky star


True

In [483]:
df = pd.DataFrame(columns="colors textures shapes poisonous".split())

In [484]:
df = pd.DataFrame(columns="colors textures shapes poisonous".split())
def add_data(poison_func, n=10, df=None):
    if not df:
        df = pd.DataFrame(columns="colors textures shapes poisonous".split())
    for i in range(n):
        fruit = Fruit.generate_random(poison_func)
        df.loc[len(df.index)] = [colors.index(fruit.color), textures.index(fruit.texture), shapes.index(fruit.shape), int(fruit.is_poisonous())]
    return(df)

In [485]:
data = add_data(is_poisonous)
data

Unnamed: 0,colors,textures,shapes,poisonous
0,2,1,1,0
1,2,1,1,0
2,2,0,2,0
3,0,1,1,1
4,3,0,2,0
5,3,1,2,0
6,1,2,0,0
7,1,1,0,0
8,1,2,2,0
9,0,1,1,1


# First Order Generalization: features

In [540]:
def is_poisonous(fruit): 
    return(fruit.texture == "spiky")
data = add_data(is_poisonous)
data

Unnamed: 0,colors,textures,shapes,poisonous
0,1,1,2,1
1,3,1,1,1
2,3,0,1,0
3,2,2,0,0
4,3,1,0,1
5,1,1,1,1
6,1,0,1,0
7,0,2,2,0
8,0,0,0,0
9,3,2,1,0


In [490]:
def filterPoison(df,value):
    return(data[data['poisonous']==value].drop(['poisonous'], axis=1))

In [491]:
def NaiveProbabilities(data, y, y_label='poisonous'):
    # posterior = prior * likelihood
    df = data[data[y_label]==y].drop([y_label], axis=1)
    count_y, num_features = df.shape
    prior = count_y / data.shape[0] # P(Y), prior
    max_length = max([len(eval(feature)) for feature in df.columns]) # 4
    probs = np.zeros((max_length, num_features))
    for index in range(num_features):
        feature = df.columns[index]
        col = df[feature]
        for xj in range(max_length): # which feature value are we testing? P[Xi=xij | Y=y]; find range
            probs[xj, index] = col[col==xj].size / count_y
    return(pd.DataFrame(probs, columns=df.columns))

In [542]:
probs = NaiveProbabilities(data, 1)
probs

Unnamed: 0,colors,textures,shapes
0,0.0,0.0,0.25
1,0.5,1.0,0.5
2,0.0,0.0,0.25
3,0.5,0.0,0.0


In [545]:
rows, cols = np.where(probs==0.25)

In [515]:
def Entropy(data, col_name):
    col = data[col_name]
    def H(p):
        if(p==0):
            return(0)
        return(-1 * p * np.log2(p))
    return(col.apply(H).sum())
Entropy(probs, 'colors')

1.5

In [502]:
def identify(fruit_data):
    prob_0 = NaiveProbabilities(data, y=0)
    prob_1 = NaiveProbabilities(data, y=1)
    
    entropies = {}
    for col in prob_0.columns:
        entropies[col] = [Entropy(prob_0, col)]
        entropies[col].append(Entropy(prob_1, col))
    var = ''
    lowest_entropy = 10000
    for col in prob_0.columns:
        #print(f"Testing {col}, champ {var}, with entropy {lowest_entropy}")
        if(sum(entropies[col]) < lowest_entropy):
            lowest_entropy = sum(entropies[col])
            var = col
    return(var, entropies)

In [516]:
colors = "red purple blue green".split()
textures = "smooth spiky rough".split()
shapes = "sphere cube star".split()

def is_poisonous(fruit): # test various different criterion for poison and see whether model identifies
    return(fruit.texture == "rough")

data = add_data(is_poisonous)
identify(data)

('textures',
 {'colors': [1.3787834934861753, 0.9182958340544896],
  'textures': [0.9852281360342515, 0.0],
  'shapes': [1.4488156357251847, 0.9182958340544896]})

In [390]:
filterPoison(data,0)

Unnamed: 0,shapes,colors,textures
0,1,0,2
3,0,0,2
5,1,1,0
6,0,3,0
8,2,0,2
9,1,3,0


In [510]:
data

Unnamed: 0,colors,textures,shapes,poisonous
0,1,0,2,1
1,1,0,1,0
2,1,2,1,0
3,0,2,2,1
4,0,1,1,0
5,2,2,2,1
6,0,0,0,0
7,2,0,1,0
8,2,2,0,0
9,2,0,2,1


In [561]:
colors

['red', 'purple', 'blue', 'green']

In [560]:
data[data['colors']==0]

Unnamed: 0,colors,textures,shapes,poisonous
7,0,2,2,0
8,0,0,0,0


In [556]:
probs = NaiveProbabilities(data, y=1)
probs

Unnamed: 0,colors,textures,shapes
0,0.0,0.0,0.25
1,0.5,1.0,0.5
2,0.0,0.0,0.25
3,0.5,0.0,0.0


In [557]:
probs==0.5

Unnamed: 0,colors,textures,shapes
0,False,False,False
1,True,False,True
2,False,False,False
3,True,False,False


In [558]:
np.where(probs==0.5)

(array([1, 1, 3]), array([0, 2, 0]))

In [342]:
def NaiveClassProbability(x, y, data, Y):
    # computes term P(X=x | y) * P(y)
    df = data[data[Y]==y].drop([Y], axis=1)
    count_y, num_features = df.shape
    prior = count_y / train.shape[0]
    log_prob = 0
    # presumably, x is some kind of iterable / vector. Iterate over each component
    for index in range(num_features):
        col = df[df.columns[index]]
        xi = np.array(x)[index] # so works regardless of whether x is pd.series or numpy
        #estimate P(Xi=xi | Y=y) as percent of rows within df where Xi=xi
        log_prob += np.log(col[col==xi].size / count_y)
    
    return(np.log(prior) + log_prob)

def NaiveBayes(x, data, Y='poisonous'):
    p0 = NaiveClassProbability(x, 0, data, Y)
    p1 = NaiveClassProbability(x, 1, data, Y)
    if(p0>p1):
        return(0)
    return(1)

In [None]:
NaiveBayes([])

# Pearl, DAGs 

In [118]:
class CausalLink:
    def __init__(self, source, sink):
        # directed edge from v1 to v2, indicating one-way causality
        assert source != sink
        self.cause = source
        self.effect = sink
    def __repr__(self):
        return(f"{self.__class__}::{self.cause}-->{self.effect}")
    def __str__(self):
        return(f"{self.cause}-->{self.effect}")
    

class Feature:
    def __init__(self, name, data=0):
        self.parents = [] # edges where Feature is the effect
        self.children = []# edges where Feature is the cause
        self.data = data
        self.name = name

    def add_parent(self, node):
        if(node not in self.parents):
            self.parents.append(node)
    def add_child(self, node):
        if(node not in self.children):
            self.children.append(node)
    def in_degree(self):
        return(len(self.parents))
    def out_degree(self):
        return(len(self.children))
    
    def __repr__(self):
        return(f"{self.__class__}::{self.name}")
    def __str__(self):
        return(str(self.name))
    
    def __eq__(self, other):
        return (self.data, self.name) == (other.data, other.name)


In [114]:
class CausalGraph:
    def __init__(self):
        self.vertices = []
        self.edges = []
    
    def addNode(self, name):
        vertex = Feature(name)
        self.vertices.append(vertex)
    
    def addNodes(self, names):
        for name in names.split():
            self.addNode(name)
    
    def getNode(self, name):
        for vertex in self.vertices:
            if(vertex.name == name):
                return vertex
        return(None)
    
    def __str__(self):
        result = "["
        for v in self.vertices:
            result += str(v) + ", "
        result += "]\n"
        for edge in self.edges:
            result = result + str(edge) + "\n"
        return(result)
    
    def addEdge(self, name_1, name_2):
        source, sink = self.getNode(name_1), self.getNode(name_2)
        if(source and sink): # both exist as valid vertices
            edge = CausalLink(source, sink)
            self.edges.append(edge)

            source.add_child(sink)
            sink.add_parent(source)
        else:
            print("Invalid")
    
    

In [549]:
G = CausalGraph()
G.addNodes("shape texture color poisonous")

In [551]:
G.vertices

[<class '__main__.Feature'>::shape,
 <class '__main__.Feature'>::texture,
 <class '__main__.Feature'>::color,
 <class '__main__.Feature'>::poisonous]

In [116]:
G.addEdge("shape", "texture")
G.addEdge("kind", "shape")
G.addEdge("kind", "color")
G.addEdge("texture", "poison")
G.addEdge('shape', "poison")

In [190]:
print(G)

[shape, texture, kind, color, poison, ]
shape-->texture
kind-->shape
kind-->color
texture-->poison
shape-->poison



In [117]:
G.getNode("shape").children

[<class '__main__.Feature'>:: texture, <class '__main__.Feature'>:: poison]

In [69]:
f2 = Feature("shape")

In [70]:
f3 = Feature("poison")
f4 = Feature("kind")

In [71]:
e12 = CausalLink(source=f1, sink=f2)
e13 = CausalLink(source=f1, sink=f3)
e41 = CausalLink(f4, f1)

In [72]:
f1.add_edge(e12)
f1.add_edge(e13)
f1.add_edge(e41)

In [74]:
f1.children

{<class '__main__.CausalLink'>, color-->poison,
 <class '__main__.CausalLink'>, color-->shape}

shape


In [28]:
G = CausalGraph()
G.addNode("color")
G.addNode("texture")
G.addNode("shape")
G.addNode("poisonous")
print(G)

[color, texture, shape, poisonous, ]
