In [2]:
import random
import pandas as pd

# This class represents the recursive tree structure.
# Attributes:
# attributes: The full list of available attributes from 
#     which the decision trees can choose from
# max_depth: The depth of each decision tree
# e: The epsilon parameter for the Laplacian distribution
# conn: Connection object to the database
# table: Underlying table for the queries
# a_priori: Pointer to the root node to update the number 
#     of records for each class
# condition: Relevant for leaf nodes. Contains the path 
#     (attributes and values) in a sql query string
# value: The value of the parent node's attribute that leads to this node.
#     This allows faster searching of the tree structure
class Node():
    """One node of a random decision tree; builds its whole subtree on creation.

    An inner node picks one of the remaining attributes at random and
    creates one child per known value of that attribute. A node becomes a
    leaf when ``max_depth`` reaches 0 or no attributes are left; a leaf is
    trained immediately by querying the database (see ``train``).

    Args:
        attributes: Dict mapping attribute name -> iterable of its values.
            It is copied, so the caller's dict is never modified.
        max_depth: Remaining number of levels that may be added below
            this node.
        e: Epsilon value forwarded to the ``laplacianCount`` procedure.
        conn: Database connection handed to ``pandas.read_sql``.
        table: Name of the table the counts are computed on.
        a_priori: Two-element mutable sequence; every leaf adds its
            class-0 / class-1 counts to positions 0 / 1.
        condition: SQL condition describing the path from the root to
            this node ('' for the root).
        value: Value of the parent's attribute that leads to this node.
    """

    def __init__(self, attributes, max_depth, e, conn,
                 table, a_priori, condition='', value=None):
        # Work on a copy: the chosen attribute is removed below and that
        # must not affect the parent's or the siblings' attribute sets.
        self.attributes = attributes.copy()
        self.children = dict()
        self.attribute = None
        self.value = value
        # None means "no label"; only leaves receive 0 or 1 in train().
        # RandomDecisionTree.predict tests the label against None.
        self.label = None
        self.condition = condition  # SQL condition for the path to this node
        self.conn = conn
        self.e = e
        self.table = table

        # Add new levels recursively while max_depth is > 0 and
        # attributes are still available.
        if max_depth > 0 and len(self.attributes) > 0:
            # Select a random attribute and remove it from the set that is
            # passed on, so it cannot be chosen again further down.
            self.attribute = random.choice(list(self.attributes))
            values = self.attributes.pop(self.attribute)

            # Create a child for each distinct value of the chosen attribute.
            for child_value in values:
                child_condition = self.update_condition(
                    self.condition, self.attribute, child_value)
                self.children[child_value] = Node(
                    self.attributes, max_depth - 1, e, conn, table,
                    a_priori, child_condition, child_value)
        else:  # leaf
            self.train(a_priori)

    def update_condition(self, parent_condition, additional_condition, value):
        """Return ``parent_condition`` extended by ``<attribute> LIKE '<value>' ``.

        Args:
            parent_condition: Condition of the parent node ('' at the root).
            additional_condition: Name of the attribute to constrain.
            value: Attribute value for the new child.

        Returns:
            The combined condition string (clauses joined with " AND ").
        """
        if isinstance(value, int):
            literal = str(int(value))
        else:
            # Double embedded single quotes so the value remains a single
            # SQL string literal (e.g. O'Brien -> O''Brien).
            literal = str(value).replace("'", "''")
        clause = str(additional_condition) + " LIKE '" + literal + "' "
        if parent_condition != "":  # earlier conditions already exist
            return parent_condition + " AND " + clause
        return clause

    def _build_count_query(self, income_class):
        """Build the ``EXEC laplacianCount`` statement for one income class."""
        class_filter = "income = " + str(income_class)
        if self.condition != "":
            where = self.condition + " AND " + class_filter
        else:
            # A leaf directly at the root has no path condition; emitting a
            # leading " AND " here would hand the procedure a malformed
            # condition.
            where = class_filter
        # The condition is passed as a T-SQL string literal, therefore every
        # single quote inside it has to be doubled.
        return ("EXEC laplacianCount @table = '" + self.table
                + "', @attribute = 'income', @e = " + str(self.e)
                + ", @condition = '" + where.replace("'", "''") + "'")

    def train(self, a_priori):
        """Label this leaf with the majority class and update ``a_priori``.

        Runs the stored procedure ``laplacianCount`` once per income class
        and reads the first cell of each result. The procedure itself is
        not part of this file; presumably it returns a Laplace-noised
        record count - verify against the database.

        Args:
            a_priori: Two-element mutable sequence; the class-0 and class-1
                counts are added in place to positions 0 and 1.
        """
        # Number of records for the specific classes.
        quantity0 = pd.read_sql(
            self._build_count_query(0), self.conn).values[0][0]
        quantity1 = pd.read_sql(
            self._build_count_query(1), self.conn).values[0][0]

        # Determine the label for the leaf; ties go to class 0.
        self.label = 0 if quantity0 >= quantity1 else 1

        # Update the a priori counts kept by the tree.
        a_priori[0] += quantity0
        a_priori[1] += quantity1


In [3]:
import pandas as pd
import numpy as np

# This class represents the random decision tree object, which contains the
# root node and functions for visualization and prediction.
# Because the random forest class uses threads to call this class,
# the parameters for the constructor are passed within a list structure.
#
# To make predictions for completely unknown instances, the tree collects 
# the quantity of instances for each class and returns, if no prediction 
# can be made, with the most likely class based on the a priori probability.
class RandomDecisionTree(object):
    """A single random decision tree plus helpers to print and query it.

    The constructor takes one list so that it can be used directly as a
    thread target. Only the first five entries are read:
    ``[max_depth, attributes, e, conn, table]``.

    ``a_priori`` holds two running totals (index 0 / index 1) that the
    nodes update while the tree is built; they serve as the fallback
    answer whenever the tree itself cannot decide.
    """

    def __init__(self, inputs):
        max_depth, attributes, e, conn, table = (
            inputs[position] for position in range(5))

        # Per-class totals, filled in place by the nodes during construction.
        self.a_priori = [0, 0]

        # Building the root builds (and trains) the complete tree.
        self.root = Node(attributes, max_depth, e, conn, table, self.a_priori)

    def visualise(self):
        """Print the tree in pre-order, starting with the root at level 0."""
        self.__traverse_for_visualisation__(self.root, 0)

    def __traverse_for_visualisation__(self, node, level):
        """Print ``node`` and, recursively, everything below it."""
        if not node:
            return
        print(str(level) + ' ' + str(node.attribute))
        if len(node.children) == 0:
            # Leaf: additionally show its path condition and class label.
            print(node.condition + ", label: "
                  + str(node.label))
            return
        for child in node.children.values():
            self.__traverse_for_visualisation__(child, level + 1)

    def predict(self, df):
        """Return the predicted class for one instance.

        Args:
            df: A single row (e.g. a pandas Series) that supports
                ``.loc[attribute_name]``.

        Returns:
            The label of the leaf reached by following the row's values,
            or the a priori class if the path cannot be followed or the
            leaf carries no label.
        """
        current = self.root
        while len(current.children) > 0:
            branch_key = df.loc[current.attribute]
            if branch_key not in current.children:
                # Value was not known when the tree was built.
                return self.get_class_a_priori()
            current = current.children[branch_key]
        if current.label is None:
            # Leaf without a label.
            return self.get_class_a_priori()
        return current.label

    def get_class_a_priori(self):
        """Return 0 if the class-0 total is strictly larger, otherwise 1."""
        return 0 if self.a_priori[0] > self.a_priori[1] else 1

In [4]:
import pandas as pd
import threading
import queue
# This class represents an ensemble structure of private random decision trees.
# It allows the parallel creation and training of the contained trees.
#
# Parameters for constructor:
# number_trees: Number of random decision trees, which should be used
# max_depth: The depth of each decision tree
# attributes: The full list of available attributes from which the decision
#     trees can choose from
# e: the epsilon parameter for the Laplacian distribution
# conn: connection object to the database
# table: underlying table for the queries
class RandomForest(object):
    """Ensemble of random decision trees that are built in parallel.

    Args:
        number_trees: Number of trees in the ensemble (must be >= 1).
        max_depth: Depth limit handed to every tree.
        attributes: Dict of attribute name -> values the trees choose from.
        e: Total epsilon; it is divided evenly among the trees.
        conn: Sequence with at least ``number_trees`` database connections;
            tree ``i`` uses ``conn[i]`` so no connection is shared between
            threads.
        table: Name of the table the trees query.

    Raises:
        ValueError: If ``number_trees`` is smaller than 1.
        Exception: Whatever a tree raises while being built is re-raised
            here instead of being lost inside a worker thread.
    """

    def __init__(self, number_trees, max_depth, attributes, e, conn, table):
        # Imported locally so the class does not depend on the surrounding
        # cell's import lines.
        from concurrent.futures import ThreadPoolExecutor

        if number_trees < 1:
            raise ValueError("number_trees must be at least 1")

        # The more trees are used, the smaller each tree's share of epsilon
        # (and therefore the higher the noise per tree).
        self.e_per_tree = e / number_trees

        # One argument list per tree. RandomDecisionTree reads only the
        # first five entries; the trailing '' and None are kept unchanged.
        tree_inputs = [
            [max_depth, attributes, self.e_per_tree, conn[i], table, '', None]
            for i in range(number_trees)
        ]

        # Start all workers before waiting for any of them, so the trees
        # really are built concurrently. Joining each thread right after
        # starting it would serialise the work. list() keeps the trees in
        # submission order and re-raises the first worker exception rather
        # than blocking forever on a result that never arrives.
        with ThreadPoolExecutor(max_workers=number_trees) as executor:
            self.trees = list(executor.map(RandomDecisionTree, tree_inputs))

    def predict(self, df):
        """Predict a class for every row of ``df`` by simple majority vote.

        Args:
            df: pandas DataFrame with one instance per row.

        Returns:
            pandas Series (dtype int64) with one prediction per row, in row
            order. A row is labelled 1 only if strictly more than half of
            the trees vote 1; otherwise (including ties) it is labelled 0.
        """
        # A weighted majority vote or a posterior probability over all known
        # instances would be possible alternatives to this plain vote.
        majority = len(self.trees) / 2
        predictions = []
        for _, row in df.iterrows():
            # Every tree votes 0 or 1, so the sum is the number of 1-votes.
            votes = sum(tree.predict(row) for tree in self.trees)
            predictions.append(1 if votes > majority else 0)
        return pd.Series(predictions, dtype='int64')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import random
import logging
import datetime
import pyodbc

# Experiment driver: for every combination of ensemble size (1-12 trees) and
# tree depth (1-7) build a forest on the 'census' table, evaluate it on the
# 'test' table and print/log the accuracy.
logging.basicConfig(filename='C:/Users/StefanHanisch/ownCloud/Documents/Seminararbeit/e0.01.log',level=logging.INFO)

#Connect to DB  - IMPORTANT: Use own credentials
server = ''
database = '' 
username = '' 
password = '' 
# Built once and reused for the main connection and for every per-tree
# connection below.
connection_string = 'DRIVER={ODBC Driver 13 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password+';Trusted_Connection=yes;'
cnxn = pyodbc.connect(connection_string)
cursor = cnxn.cursor()

census = pd.read_sql("SELECT * FROM census", cnxn)

#Extract the possible attributes: column name -> list of its distinct values
attributes = {column: list(census[column].unique()) for column in census}

#Remove the ID and outcome variable (income class)
attributes.pop('income')
attributes.pop('ID')

test = pd.read_sql("SELECT * FROM test", cnxn)

x_test = test[list(attributes.keys())]
y_test = test['income']


epsilon = 0.01
for trees in range(1,13):
    for depth in range(1,8):

        # Every tree gets its own connection, so the forest needs exactly
        # `trees` of them.
        conn_list = list()
        try:
            for _ in range(trees):
                conn_list.append(pyodbc.connect(connection_string))
            clf = RandomForest(number_trees = trees, attributes=attributes,max_depth=depth , e = epsilon, conn=conn_list, table='census')
            pred = clf.predict(x_test)
            # accuracy_score expects (y_true, y_pred).
            acc = accuracy_score(y_test, pred)
        finally:
            # Release the per-tree connections of this run; otherwise up to
            # twelve connections would leak in each of the 84 iterations.
            for tree_conn in conn_list:
                tree_conn.close()
        result = str(datetime.datetime.now()) +  ' - trees: ' + str(trees) + ', depth: ' + str(depth) + ' , Acc: ' + str(acc)
        print(result)
        logging.info(result)

