# Imports

In [None]:
# System-level Graphviz packages and development headers.
# NOTE(review): presumably required so `pygraphviz` (installed in the next cell) can be built — confirm.
!apt-get install -y python3-dev graphviz libgraphviz-dev pkg-config

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pkg-config is already the newest version (0.29.2-1ubuntu3).
graphviz is already the newest version (2.42.2-6).
python3-dev is already the newest version (3.10.6-1~22.04).
python3-dev set to manually installed.
The following additional packages will be installed:
  libgail-common libgail18 libgtk2.0-0 libgtk2.0-bin libgtk2.0-common libgvc6-plugins-gtk
  librsvg2-common libxdot4
Suggested packages:
  gvfs
The following NEW packages will be installed:
  libgail-common libgail18 libgraphviz-dev libgtk2.0-0 libgtk2.0-bin libgtk2.0-common
  libgvc6-plugins-gtk librsvg2-common libxdot4
0 upgraded, 9 newly installed, 0 to remove and 10 not upgraded.
Need to get 2,433 kB of archives.
After this operation, 7,694 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libgtk2.0-common all 2.24.33-2ubuntu2 [125 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/m

In [None]:
!pip install pygraphviz

Collecting pygraphviz
  Downloading pygraphviz-1.11.zip (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.8/120.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (setup.py) ... [?25l[?25hdone
  Created wheel for pygraphviz: filename=pygraphviz-1.11-cp310-cp310-linux_x86_64.whl size=175928 sha256=e929b35ce7910e563653de502eab8afac6a4a210d2f1c6555dea5ae59dbfe810
  Stored in directory: /root/.cache/pip/wheels/5b/ee/36/f47a0d35664fbe1a2b5a433ae33c6ad636b00bb231f68a9aaa
Successfully built pygraphviz
Installing collected packages: pygraphviz
Successfully installed pygraphviz-1.11


In [None]:
from sklearn.datasets import load_wine
from networkx.drawing.nx_agraph import graphviz_layout
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import train_test_split
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
import pydotplus
import networkx as nx
import pandas as pd
import numpy as np
import copy
import uuid

# Code

In [None]:
class Node:
  """A single vertex of the decision tree: either a split or a leaf.

  Attributes:
    name: feature name for a split node, class label for a leaf.
    value: threshold (or reference value) for a split node, class index for a leaf.
    cond_type: 0 -> threshold test ``x <= value``; any other value -> equality
      test ``x == value``.
    symbol: textual form of the test, used when the tree is drawn.
    depth: depth recorded by whoever created the node.
    leaf: True when the node carries a prediction instead of a test.
    left: child followed when the test is true (None until attached).
    right: child followed when the test is false (None until attached).
    identifier: random UUID, used as a unique graph-node key.
  """

  def __init__(self,
               name: str,
               value: float,
               cond_type: int = 0,
               depth: int = 0,
               leaf: bool = False):

    self.name = name
    self.value = value
    self.cond_type = cond_type

    # Keep the displayed operator in step with decision(): type 0 is the
    # threshold test, every other type falls back to equality. Previously
    # an unknown cond_type left `symbol` undefined.
    self.symbol = '<=' if cond_type == 0 else '='

    self.depth = depth
    self.leaf = leaf

    self.left = None    # branch taken when decision() is true
    self.right = None   # branch taken when decision() is false

    self.identifier = uuid.uuid4()


  def decision(self,
               x: float):
    """Evaluate this node's test on the feature value ``x``.

    Returns the result of ``x <= value`` when ``cond_type`` is 0, otherwise
    the result of ``value == x``.
    """
    if self.cond_type == 0:
      return x <= self.value
    return self.value == x


In [None]:
class Tree:
  """Randomly structured binary decision tree over a set of named features.

  ``create()`` draws ``max_depth`` features (with replacement), inserts one
  split node per drawn feature along a random left/right path, and finally
  hangs a randomly chosen class leaf on every open branch. The split
  thresholds start as standard-normal draws and are meant to be tuned from
  outside through ``self.nodes``.

  Attributes:
    all_features: the feature names as passed in.
    max_depth: number of split nodes drawn (not a hard depth limit).
    chosen_features: the drawn feature names, one per split node.
    features: maps each drawn feature name to its position in ``all_features``.
    targets: class labels; each must look like ``<text>_<int>`` because the
      integer after the first underscore becomes the leaf's predicted value.
    root: root node, or None before ``create()``.
    nodes: list of the split (non-leaf) nodes, or None before ``create()``.
    score: free slot initialised to 0.
  """

  def __init__(self,
               all_features: 'list | np.ndarray',
               targets: 'list | np.ndarray',
               max_depth: int = 5):

    self.all_features = all_features
    self.max_depth = max_depth

    self.chosen_features = np.random.choice(self.all_features, max_depth,
                                            replace=True)

    # list() so the position lookup works for NumPy arrays as well as
    # Python lists (ndarray has no .index method).
    feature_names = list(self.all_features)
    self.features = {feature: feature_names.index(feature)
                     for feature in self.chosen_features}

    self.targets = targets
    self.root = None
    self.nodes = None

    self.score = 0


  def __deepcopy__(self, memo):
    """Return an independent copy of the tree.

    The copy is built without calling ``__init__`` so that copying neither
    re-draws features nor consumes global random state. ``nodes`` is copied
    with the same memo as ``root``, so the copied list refers to the copied
    tree's own node objects.
    """
    cls = type(self)
    new = cls.__new__(cls)
    memo[id(self)] = new

    new.all_features = copy.deepcopy(self.all_features, memo)
    new.max_depth = self.max_depth
    new.chosen_features = copy.deepcopy(self.chosen_features, memo)
    new.features = copy.deepcopy(self.features, memo)
    new.targets = copy.deepcopy(self.targets, memo)
    new.root = copy.deepcopy(self.root, memo)
    new.nodes = copy.deepcopy(self.nodes, memo)
    new.score = copy.deepcopy(self.score, memo)

    return new


  def insert_node(self,
                  node: 'Node',
                  key_name: str,
                  key_value: float = 0,
                  current_depth: int = 0):
    """Insert a new split node below ``node`` along a random path.

    At every existing node a uniform draw decides whether to descend left
    (draw <= 0.5) or right; the new node is created at the first empty slot.
    Returns the (possibly new) root of the subtree.
    """
    if node is None:
      return Node(key_name, key_value, depth=current_depth, cond_type=0)

    current_depth += 1
    if np.random.rand() <= 0.5:
      node.left = self.insert_node(node.left, key_name, key_value, current_depth)
    else:
      node.right = self.insert_node(node.right, key_name, key_value, current_depth)

    return node


  @staticmethod
  def _without_class(classes: np.ndarray, taken) -> np.ndarray:
    """Return ``classes`` minus ``taken``; fall back to ``classes`` if nothing would remain."""
    remaining = classes[classes != taken]
    return remaining if remaining.size else classes


  def set_outputs(self,
                  node: 'Node',
                  outputs_classes: np.ndarray,
                  before_label: str = '',
                  current_depth : int = 0,
                  direction: str = 'right'):
    """Attach a random class leaf to every empty branch below ``node``.

    When a node receives two leaves, the label drawn for the left leaf is
    excluded from the draw for the right one, so sibling leaves differ
    whenever more than one class is available.

    ``before_label`` and ``direction`` are accepted for compatibility but
    are not used. Returns the root of the completed subtree.
    """
    current_depth += 1

    # Element-wise comparison below needs an array; a plain list would make
    # the exclusion a silent no-op.
    outputs_classes = np.asarray(outputs_classes)

    if node is None:
      label = np.random.choice(outputs_classes)
      value = float(label.split('_')[1])
      return Node(label, int(value), depth=current_depth, leaf=True)

    if node.right and node.left is None:
      left_output_classes = self._without_class(outputs_classes, node.right.name)
    else:
      left_output_classes = outputs_classes
    node.left = self.set_outputs(node.left, left_output_classes, node.name, current_depth, direction='left')

    # node.left is always set at this point; if it is a freshly drawn leaf,
    # its label is removed from the candidates for the right leaf.
    if node.left and node.right is None:
      right_output_classes = self._without_class(outputs_classes, node.left.name)
    else:
      right_output_classes = outputs_classes
    node.right = self.set_outputs(node.right, right_output_classes, node.name, current_depth)

    return node


  def to_nodes(self,
               node: 'Node',
               nodes: list):
    """Append every split (non-leaf) node below ``node`` to ``nodes``, pre-order."""
    if node and not node.leaf:
      nodes.append(node)
      self.to_nodes(node.left, nodes)
      self.to_nodes(node.right, nodes)


  def get_nodes(self):
    """Return the split nodes of the tree as a list in pre-order."""
    nodes = []
    self.to_nodes(self.root, nodes)
    return nodes


  def create(self):
    """Build the random tree: split nodes first, then class leaves."""
    # Start from scratch so a repeated call does not insert splits below
    # the leaves of a previous build.
    self.root = None

    for feature in self.chosen_features:
      self.root = self.insert_node(self.root, feature, np.random.normal())

    self.root = self.set_outputs(self.root, self.targets)
    self.nodes = self.get_nodes()


  def make_predict(self,
                   cell: np.ndarray):
    """Route one sample through the tree and return the reached leaf's value.

    ``cell`` is indexed with the positions stored in ``self.features``.
    Returns None if the walk runs off the tree (e.g. no root).
    """
    node = self.root
    while node:
      if node.leaf:
        return node.value

      if node.decision(cell[self.features[node.name]]):
        node = node.left
      else:
        node = node.right


  def predict(self,
              data: np.ndarray):
    """Return a list with one ``make_predict`` result per row of ``data``."""
    return [self.make_predict(cell) for cell in data]


  def to_graph(self,
               graph,
               node: 'Node'):
    """Add the parent-child edges below ``node`` to ``graph``, keyed by node identifier."""
    if node is None:
      return

    for child in (node.left, node.right):
      if child is not None:
        graph.add_edge(node.identifier, child.identifier)
        self.to_graph(graph, child)


  def get_labels(self,
                 node: 'Node',
                 labels: dict):
    """Fill ``labels`` with the display text of every node below ``node``.

    Split nodes read "<feature> <symbol> <value>", leaves show their label.
    """
    if node:
      labels[node.identifier] = f'{node.name} {node.symbol} {node.value:.4}' if not node.leaf else f'{node.name}'
      self.get_labels(node.left, labels)
      self.get_labels(node.right, labels)


  def show(self):
    """Draw the tree with networkx, laid out by Graphviz ``dot``."""
    graph = nx.Graph()
    self.to_graph(graph, self.root)

    labels = {}
    self.get_labels(self.root, labels)
    pos = graphviz_layout(graph, prog="dot")

    plt.figure(figsize=(13, 5))
    nx.draw(graph, pos,
            labels=labels,
            with_labels=True,
            node_size=500,
            font_size=10,
            font_color="white",
            font_weight="bold",
            linewidths=0.5,
            edge_color="gray",
            style="dashed",
            bbox=dict(facecolor="black", edgecolor='black', boxstyle='round, pad=1.0'))

    plt.show()

In [None]:
class Genetic:
  """Population-based search for the split thresholds of one tree.

  Each agent is a vector with one value per split node of ``model``. An
  agent is scored by writing its values into ``model.nodes`` and measuring
  the model's accuracy. After every evaluation pass each agent is averaged
  with the best vector found so far and one of its genes is perturbed.

  Attributes:
    epochs: number of evaluate / recombine rounds run by ``train``.
    pop_size: number of agents.
    population: array of shape (pop_size, n_nodes) once ``start_pop`` ran.
    model: the tree being tuned; its node values are modified in place.
    n_nodes: number of split nodes in ``model``.
    best / best_score: best vector seen so far and its accuracy.
    mutation_rate: current scale of the mutation noise.
    save_rate: the initial mutation rate, restored on every improvement.
    no_improvement: epochs since the mutation rate was last reset or widened.
  """

  def __init__(self,
               pop_size: int,
               epochs: int,
               model: 'Tree',
               mutation_rate: float = 4e-2):

    if model.nodes is None:
      raise ValueError('model has no nodes; call model.create() first')

    self.epochs = epochs
    self.pop_size = pop_size
    self.population = []

    self.model = model
    self.n_nodes = len(model.nodes)
    self.best = None
    self.best_score = None
    self.fitness = []

    self.mutation_rate = mutation_rate
    self.save_rate = mutation_rate
    self.no_improvement = 0


  def start_pop(self):
    """Draw a fresh standard-normal population and reset the best-so-far record."""
    self.population = np.random.normal(size=(self.pop_size, self.n_nodes))

    # Copy: `best` must not alias a row that recombination will overwrite.
    self.best = self.population[0].copy()
    self.best_score = 0


  def evaluation_function(self,
                          data: np.ndarray,
                          target: np.ndarray,
                          sample_size: int):
    """Return the number of correct predictions of ``model`` divided by ``sample_size``."""
    predictions = np.asarray(self.model.predict(data))
    return np.sum(predictions == target) / sample_size


  def evaluation(self,
                 data: np.ndarray,
                 target: np.ndarray):
    """Score every agent and remember the best one.

    An improvement also restores the initial mutation rate and resets the
    stagnation counter. The model's node values are left at those of the
    last agent evaluated.
    """
    sample_size = len(target)
    for agent in self.population:

      for node, value in zip(self.model.nodes, agent):
        node.value = value

      score = self.evaluation_function(data, target, sample_size)

      if score > self.best_score:
        self.best = agent.copy()
        self.best_score = score

        self.mutation_rate = self.save_rate
        self.no_improvement = 0


  def cross_and_mutation(self):
    """Recombine every agent with the best vector, then mutate one gene.

    The result is written back into ``self.population``; previously the
    loop only rebound a local name and the population never changed.
    """
    if self.n_nodes == 0:
      return  # nothing to recombine or mutate

    for i in range(len(self.population)):

      # crossover: midpoint between the agent and the best vector
      child = (self.population[i] + self.best) / 2

      # mutation: shift one random gene by scaled normal noise, random sign
      idx = np.random.randint(0, self.n_nodes)
      sign = 1.0 if np.random.rand() <= .5 else -1.0
      child[idx] += sign * np.random.normal() * self.mutation_rate

      self.population[i] = child


  def train(self,
            data: np.ndarray,
            target: np.ndarray,
            bar_train: bool = False,
            desc : str = 'Train'):
    """Run the search and leave ``model`` holding the best thresholds found.

    Args:
      data: samples passed to ``model.predict``.
      target: expected prediction for each sample.
      bar_train: show a tqdm progress bar over the epochs.
      desc: label of that progress bar.

    After every 5 epochs without a reset, the mutation rate is widened by a
    fifth of its initial value. The same schedule is used with and without
    the progress bar (the two paths used to differ).
    """
    self.start_pop()
    self.mutation_rate = self.save_rate
    self.no_improvement = 0

    epochs = trange(self.epochs, desc=desc) if bar_train else range(self.epochs)
    for _ in epochs:

      self.evaluation(data, target)

      self.no_improvement += 1
      if self.no_improvement == 5:
        self.mutation_rate += self.save_rate / 5
        self.no_improvement = 0

      self.cross_and_mutation()

    # evaluation() leaves the last agent's values in the model; restore the best.
    for node, value in zip(self.model.nodes, self.best):
      node.value = value

In [None]:
def model_pred(n_class: int,
               agents: list,
               data: np.ndarray):
  """Majority vote of an ensemble.

  Every element of ``agents`` must offer ``make_predict(x)`` returning a
  class index in ``range(n_class)``. For each row of ``data`` the votes are
  tallied and the index with the most votes is kept (the lowest index wins
  a tie). Returns a list with one winning index per row.
  """

  def _winner(x):
    # one counter per class, filled by the ensemble members' votes
    tally = np.zeros(n_class)
    for member in agents:
      tally[member.make_predict(x)] += 1
    return np.argmax(tally)

  return [_winner(x) for x in data]


def score(y_pred, y):
  """Print the fraction of entries of ``y_pred`` that equal ``y``.

  Both arguments may be lists or arrays; they are converted to arrays so
  the comparison is element-wise (two plain lists would otherwise compare
  as a single bool and break the sum). Nothing is returned.
  """
  matches = np.asarray(y_pred) == np.asarray(y)
  s = np.sum(matches) / len(y)

  print(f'Score :: {s}')


def bootstrap(data: np.ndarray,
              target: np.ndarray,
              n_samples: int):
  """Draw ``n_samples`` bootstrap resamples of a dataset.

  Each resample has as many rows as ``data`` and is drawn with replacement;
  the same row indices are applied to ``target`` so samples and labels stay
  aligned. Returns a list of ``[sample, target_sample]`` pairs.
  """
  size_sample = data.shape[0]

  samples = []
  for _ in range(n_samples):
    idx = np.random.choice(size_sample, size_sample, replace=True)
    samples.append([data[idx], target[idx]])

  return samples

# Wine

In [None]:
# Load the Wine dataset and put the features in a DataFrame.
data = load_wine()

df = pd.DataFrame(data['data'], columns=data['feature_names'])
Y = data['target']

# Standardise every column: subtract its mean, divide by its sample std (ddof=1).
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split done in a later cell.
for label in df:
  df[label] = (df[label] - df[label].mean()) / df[label].std(ddof=1)

X = df.to_numpy()

# Names handed to Tree: feature names, and class labels whose integer after
# the underscore is parsed by Tree.set_outputs.
features = data['feature_names']
targets = data['target_names']

In [None]:
# Fixed seed; hold out 30% of the rows for testing.
seed = 12
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=seed)

In [None]:
# One bootstrap resample of the training set per ensemble member.
np.random.seed(seed)
n_models = 50
samples = bootstrap(x_train, y_train, n_models)


# Re-seed so tree construction starts from the seed's RNG state,
# independently of the draws consumed by bootstrap() above.
np.random.seed(seed)
models = []

# Build the random, untrained trees.
for _ in range(n_models):
  model = Tree(features, targets, max_depth=16)
  model.create()
  models.append(model)

# Baseline: majority vote of the untrained trees.
print('Random Models Score')
y_pred = model_pred(len(targets), models, x_test)
score(y_pred, y_test)
print()


# Tune each tree's thresholds on its own resample
# (Genetic arguments: population size 150, 70 epochs).
agents = []
for model, sample in tqdm(zip(models, samples), desc='Traning Models :: ', total=n_models):

  genetic = Genetic(150, 70, model)

  # "Simple Bootstrap"
  x_sample, y_sample = sample
  genetic.train(x_sample, y_sample)

  # genetic.model is the same object as `model`: training changes the tree
  # in place, so `agents` and `models` end up holding the same trees.
  agents.append( genetic.model )

# Majority vote of the tuned trees.
y_pred = model_pred(len(targets), agents, x_test)
score(y_pred, y_test)

Random Models Score
Score :: 0.3333333333333333



Traning Models ::   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Reference model: scikit-learn random forest fitted on the same split.
rfc = RandomForestClassifier(n_estimators=100,
                             random_state=seed,
                             max_depth=8,
                             bootstrap=True)
rfc.fit(x_train, y_train)

y_pred = rfc.predict(x_test)

score(y_pred, y_test)

Score :: 0.9814814814814815


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Reference model: a single scikit-learn decision tree fitted on the same split.
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

score(y_pred, y_test)

Score :: 0.9444444444444444


# Breast Cancer

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the Breast Cancer dataset; this rebinds data, df, X, Y, features and
# targets from the Wine section.
data = load_breast_cancer()

df = pd.DataFrame(data['data'], columns=data['feature_names'])
Y = data['target']

# Standardise every column: subtract its mean, divide by its sample std (ddof=1).
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split done in a later cell.
for label in df:
  df[label] = (df[label] - df[label].mean()) / df[label].std(ddof=1)

X = df.to_numpy()

# Tree.__init__ calls .index() on the feature names, hence the list().
features = list(data['feature_names'])
# Labels are written as 'class_<int>' because Tree.set_outputs parses the
# integer after the underscore as the predicted value.
targets = ['class_0', 'class_1']

In [None]:
# Fixed seed; hold out 30% of the rows for testing.
seed = 12
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=seed)

In [None]:
# One bootstrap resample of the training set per ensemble member.
np.random.seed(seed)
n_models = 20
samples = bootstrap(x_train, y_train, n_models)


# Re-seed so tree construction starts from the seed's RNG state,
# independently of the draws consumed by bootstrap() above.
np.random.seed(seed)
models = []

# Build the random, untrained trees.
for _ in range(n_models):
  model = Tree(features, targets, max_depth=12)
  model.create()
  models.append(model)

# Baseline: majority vote of the untrained trees.
print('Random Models Score')
y_pred = model_pred(len(targets), models, x_test)
score(y_pred, y_test)
print()


# Tune each tree's thresholds on its own resample
# (Genetic arguments: population size 150, 70 epochs).
agents = []
for model, sample in tqdm(zip(models, samples), desc='Traning Models :: ', total=n_models):

  genetic = Genetic(150, 70, model)

  # "Simple Bootstrap"
  x_sample, y_sample = sample
  genetic.train(x_sample, y_sample)

  # genetic.model is the same object as `model`: training changes the tree
  # in place, so `agents` and `models` end up holding the same trees.
  agents.append( genetic.model )

# Majority vote of the tuned trees.
y_pred = model_pred(len(targets), agents, x_test)
score(y_pred, y_test)

Random Models Score
Score :: 0.5555555555555556



Traning Models ::   0%|          | 0/20 [00:00<?, ?it/s]

Score :: 0.9064327485380117


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Reference model: scikit-learn random forest fitted on the same split.
rfc = RandomForestClassifier(n_estimators=100,
                             random_state=seed,
                             bootstrap=True,
                             max_depth=8)
rfc.fit(x_train, y_train)

y_pred = rfc.predict(x_test)

score(y_pred, y_test)

Score :: 0.9239766081871345
