# Data loading

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import random
from pprint import pprint

In [3]:
%matplotlib inline
sns.set_style("darkgrid")

In [4]:
df = pd.read_csv('/content/sample_data/Iris.csv')
df = df.drop("Id", axis=1)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df['Species'])

df['Species'] = encoder.transform(df['Species'])

X = df[df.columns.to_list()[:-1]]
y = df['Species']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 121)#, test_size = 0.32)

# <b> Descison Tree algorithm </b>

In [11]:
class DescisionTree:
  def __init__(self, min_samples_leaf, min_samples_split, max_depth, criterion, max_features, ml_task):
    self.data = None
    self.X = None
    self.y = None
    self.max_features = max_features
    self.ml_task = ml_task
    self.min_samples_leaf = min_samples_leaf
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth
    self.metric = criterion
    self.feature_importances_ = None
    self.complete_tree = None
    self.n_entries = {}
    self.n_weighted_entries = {}
    self.parent_node = 1  # root node
    self.yes_node = 2     # left node
    self.no_node = 3      # right node
    self.leaf_count = 0
    if ml_task == 'classification': self.classes_and_counts = {};self.leaf_node_class_proba = {}
    else: self.leaf_node_loss = {}

  ''' This method is used to get the collective counts of all classes in target '''
  def get_classes_and_counts(self, data):
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
    for i in range(len(unique_classes)):
      self.classes_and_counts[unique_classes[i]] = counts_unique_classes[i]

  ''' This method is used to get the collective probabilities of all classes in target '''
  def get_probability_for_all_classes(self, data):
    label_column = data[:, -1]
    unique_classes_new, counts_unique_classes_new = np.unique(label_column, return_counts=True)
    
    classes_and_counts_new = {}
    for i in list(self.classes_and_counts.keys()):
      if i in list(unique_classes_new):
        classes_and_counts_new[i] = counts_unique_classes_new[list(unique_classes_new).index(i)]
      else:
        classes_and_counts_new[i] = 0
    array = np.array(list(classes_and_counts_new.values())) / sum(classes_and_counts_new.values())

    return [round(i, 5) for i in array]


  ''' This method checks the purity of a target vector '''
  def check_purity(self, data):
      label_column = data[:, -1]
      unique_classes = np.unique(label_column)
      if len(unique_classes) == 1:
          return True
      else:
          return False


  ''' This method performs classification '''
  def create_leaf(self, data, ml_task, current_node):
      self.leaf_count += 1
      label_column = data[:, -1]
      if ml_task == "regression":
          leaf = np.mean(label_column)
          self.leaf_node_loss[current_node] = self.mse(data)
          return str(leaf) + ' Node: '+str(current_node)
      else:
          probabilities = []
          unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
          index = counts_unique_classes.argmax()
          leaf = unique_classes[index]
          # probability = counts_unique_classes[index] / sum(counts_unique_classes)
          self.leaf_node_class_proba[current_node] = self.get_probability_for_all_classes(data)
          
          return str(leaf) + ' Node: '+str(current_node)
    
    
  ''' this function generates all possible potential splits for a given training data '''
  def get_potential_splits(self, data, random_subspace):  # randomly selecting certain features
    potential_splits = {}
    _, n_columns = data.shape
    column_indices = list(range(n_columns - 1))    # excluding the last column which is the label
    if random_subspace and random_subspace <= len(column_indices):
        column_indices = random.sample(population=column_indices, k=random_subspace)
    for column_index in column_indices:          
        values = data[:, column_index]
        unique_values = np.unique(values)
        potential_splits[column_index] = unique_values

    return potential_splits
    
    
  ''' This function splits the data into two partitions: Yes and no cases'''
  def split_data(self, data, split_column, split_value):
      split_column_values = data[:, split_column]
      type_of_feature = FEATURE_TYPES[split_column]

      if type_of_feature == "continuous":
          data_below = data[split_column_values <= split_value]
          data_above = data[split_column_values >  split_value]
      else:
          data_below = data[split_column_values == split_value]
          data_above = data[split_column_values != split_value]
      
      return data_below, data_above
    
    
  ''' This method calculates mse loss'''
  def mse(self, data):
      actual_values = data[:, -1]
      if len(actual_values) == 0:   # empty data
          mse = 0
      else:
          prediction = np.mean(actual_values)
          mse = np.mean((actual_values - prediction) **2)
      
      return mse

  ''' This method calculates entropy loss '''
  def entropy(self, data):
      label_column = data[:, -1]
      _, counts = np.unique(label_column, return_counts=True)
      probabilities = counts / counts.sum()
      entropy = sum(probabilities * -np.log2(probabilities))
      
      return entropy


  ''' This method calculates gini impurity'''
  def gini(self, data):
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)
    probabilities = counts / counts.sum()
    gini_index = 0

    for i in probabilities:
      gini_index += i ** 2
    
    return 1 - gini_index


  ''' calculating total/weighed value of the used metric '''
  def calculate_overall_metric(self, data_below, data_above, metric_function):
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n
    # weighted MSE, RMSE, Gini, and entropy
    overall_metric =  (p_data_below * metric_function(self, data_below) 
                     + p_data_above * metric_function(self, data_above))
    
    return overall_metric


  
  ''' Determining which split is the best by using the metric '''
  def determine_best_split(self, data, potential_splits, ml_task, criterion):
    first_iteration = True
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = self.split_data(data, split_column=column_index, split_value=value)
            
        
            current_overall_metric = self.calculate_overall_metric(data_below, data_above, metric_function=criterion)
            if first_iteration or current_overall_metric <= best_overall_metric:
                first_iteration = False
                best_overall_metric = current_overall_metric
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value


  ''' determining the type of a feature among all features '''
  def determine_type_of_feature(self, df):
    feature_types = []
    n_unique_values_treshold = 10

    for feature in df.columns:
        if feature != "label":
            unique_values = df[feature].unique()
            example_value = unique_values[0]
            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
    
    return feature_types
 
    
  ''' THIS IS THE MAIN RECURSIVE ALGORITHM FOR DESCISION TREE'''

  def tree(self, df, ml_task, counter,current_node, min_samples_leaf, min_samples_split,max_depth, criterion, answer, max_features):

    # When the tree starts, the dataframe is converteed to numpy array, the depth of the tree is checked using counter variable and all features types are detected
    if counter == 0:
        global COLUMN_HEADERS, FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = self.determine_type_of_feature(df)
        data = df.values
        criterion = getattr(DescisionTree, criterion)
    else:
        data = df      

    # storing the length of data passed into the node
    self.n_entries['Node: '+str(current_node)] = [len(df)]
    # criterion = getattr(DescisionTree, criterion)

    # storing the loss/mse/rmse/gini/entropy in a specific node
    self.n_entries['Node: '+str(current_node)].append(criterion(self, data))
    
    # incrementing yes/left nodes and no/right nodes such that yes will be a even node and no will be a odd node repectively
    if (answer == 'yes answer'):
      self.yes_node += 2
    elif (answer == 'no answer'):
      self.no_node += 2 
      
    # checking if that target of the data passed is either pure, has minimum samples to create a leaf, or the depth of tree has reached its maximum depth
    if (self.check_purity(data)) or (len(data) == min_samples_leaf) or (counter == max_depth):
        leaf = self.create_leaf(data, ml_task, current_node) # creating the leaf
        return leaf 
    # if above requirements to create a leaf are not met, two new nodes will be created recursively respectively.
    else:    
        counter += 1 # when two new nodes are created, the depth of three is also incremented
        
        # if the data is not yet pure, but has not minimum samples to perform the split, a leaf is created
        if (len(data) < min_samples_split):
          leaf = self.create_leaf(data, ml_task, current_node)
          return leaf
        else:
          # getting the all possible splits, determining which split has least loss, and splitting the data into left and right nodes respectively
          potential_splits = self.get_potential_splits(data, max_features)
          split_column, split_value = self.determine_best_split(data, potential_splits, ml_task,criterion)
          data_below, data_above = self.split_data(data, split_column, split_value)
          
          # if the data seperated into left and right nodes, but there is no data, instead of creating a node, a leaft is created
          if len(data_below) == 0 or len(data_above) == 0:
              leaf = self.create_leaf(data, ml_task, current_node)
              return leaf
          
          # finding the type of a selected feature column and its name
          feature_name = COLUMN_HEADERS[split_column]
          type_of_feature = FEATURE_TYPES[split_column]

#           # creating the tree questions
          if type_of_feature == "continuous":
              question = "{} <= {} (Node: {})".format(feature_name, split_value, current_node)
          # feature is categorical
          else:
              question = "{} = {} (Node: {})".format(feature_name, split_value, current_node)

          # instantiate sub-tree
          sub_tree = {question: []}

          # creating left and right nodes recursively
          yes_answer = self.tree(data_below, ml_task, counter, self.yes_node,min_samples_leaf, min_samples_split,max_depth, criterion, 'yes answer', max_features)
          no_answer = self.tree(data_above, ml_task, counter, self.no_node,min_samples_leaf,min_samples_split, max_depth, criterion, 'no answer', max_features)
          
          # if both left and right nodes are same, only taking one value for a leaf node
          if yes_answer == no_answer:
              sub_tree = yes_answer
          else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
          
          return sub_tree

    

  ''' making probability of the predictions using this tree '''
  def predict_example_probability(self, example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")[:3]

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # base case
    if not isinstance(answer, dict):
        return int(answer.split()[2])
            
    # recursive part
    else:
        residual_tree = answer
        return self.predict_example_probability(example, residual_tree)

    
  ''' fitting our tree with training data '''
  def fit(self, X,y):
    self.X = X.copy();self.y = y.copy()
    self.X['label'] = self.y;self.data = self.X

    if self.ml_task == 'classification':
      self.get_classes_and_counts(self.data.values)
    self.complete_tree = self.tree(self.data, self.ml_task, 0,self.parent_node, self.min_samples_leaf, self.min_samples_split, self.max_depth, self.metric,'parent_node', self.max_features)

    # calculating weighted entries
    for key, value in self.n_entries.items():
      self.n_weighted_entries[key] = [value[0] / len(self.X), value[1]]

    return self.complete_tree

# Random Forest

In [12]:
import warnings
warnings.filterwarnings("ignore")

In [13]:
class RandomForest:
  def __init__(self,criterion, n_estimators, n_bootstrap, ml_task, max_depth, max_features, min_samples_leaf = 1, min_samples_split = 2):
    self.n_estimators = n_estimators
    self.n_bootstrap = n_bootstrap
    self.max_features = max_features
    self.ml_task = ml_task
    self.min_samples_leaf = min_samples_leaf
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth
    self.metric = criterion
    self.forest = []
    self.X = None
    self.y = None
    self.each_tree_with_class_probabilities = []
    self.n_classes = None



  ''' Performing boostraping '''
  def bootstrapping(self, X,y, n_bootstrap):
    bootstrap_indices = np.random.randint(low=0, high=len(X), size=int(n_bootstrap * len(X)))
    bootstraped_X = X.iloc[bootstrap_indices]
    bootstraped_y = y.iloc[bootstrap_indices]
    
    return bootstraped_X, bootstraped_y


  ''' Training our model '''
  def fit(self, X, y):
    self.X = X.copy(); self.y = y.copy()
    self.n_classes = len(np.unique(self.y.values))
    for i in range(self.n_estimators):
      bootstraped_X, bootstraped_y= self.bootstrapping(self.X,self.y, self.n_bootstrap)
      tree = DescisionTree(min_samples_leaf = self.min_samples_leaf, min_samples_split = self.min_samples_split, max_depth =  self.max_depth, \
                           criterion = self.metric, max_features = self.max_features, ml_task = self.ml_task)
      complete_tree = tree.fit(bootstraped_X, bootstraped_y)
      # getting probabilities for each class for individual classes
      if self.ml_task == 'classification':
        probabilities_of_each_nodes = tree.leaf_node_class_proba
        self.forest.append(complete_tree)
        self.each_tree_with_class_probabilities.append(probabilities_of_each_nodes)
      else:
        self.forest.append(complete_tree)
    
    return 'Training completed'

  ''' Making individual predictions '''
  def predict_example(self, example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")[:3]

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # base case
    if not isinstance(answer, dict):
        return float(answer.split()[0])
    
    # recursive part
    else:
        residual_tree = answer
        return self.predict_example(example, residual_tree)



  ''' Making predictions on testing dataframe '''
  def decision_tree_predictions(self, test_df, tree):
      predictions = test_df.apply(self.predict_example, args=(tree,), axis=1)
      return predictions


  ''' Method for making predictions ''' 
  def predict(self, test_df):
    df_predictions = {}
    df_predictions_probability = {}
    for i in range(len(self.forest)):
        column_name = "tree_{}".format(i)
        predictions = self.decision_tree_predictions(test_df, tree=self.forest[i])

        df_predictions[column_name] = predictions

        if (self.ml_task == 'classification'):
          probabilities = self.predict_proba(test_df, self.forest[i], self.each_tree_with_class_probabilities[i])
          df_predictions_probability[column_name] = probabilities


    df = pd.DataFrame(df_predictions)#.reset_index(drop = True)

    if self.ml_task == 'classification':
      df['Prediction'] = [df.iloc[j].mode()[0] for j in range(len(df))]
      return df, self.final_probabilities(pd.DataFrame(df_predictions_probability))
    else:
      df['Prediction'] = [df.iloc[j].mean() for j in range(len(df))]    
      return df


  ''' making probability of the predictions using this tree '''
  def predict_example_probability(self, example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")[:3]

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    # base case
    if not isinstance(answer, dict):
        return int(answer.split()[2])
            
    # recursive part
    else:
        residual_tree = answer
        return self.predict_example_probability(example, residual_tree)


  ''' This method returns the predicted probabilities for all classes '''
  def predict_proba(self, X, complete_tree, leaf_node_class_proba):
    leaf_nodes_for_predictions = np.array(X.apply(self.predict_example_probability, axis = 1, args = (complete_tree, )))
    probabilities = [] 
    
    
    for i in leaf_nodes_for_predictions:
      for key, value in leaf_node_class_proba.items():
        if i == key:
          probabilities.append(list(value))

    return probabilities

  def final_probabilities(self, df):
    columns = df.columns.to_list()
    values = []
    for i in range(len(df)):
      value = [0 for i in range(self.n_classes)]
      row = df.iloc[i]
      for j in columns:
        value = [sum(x) for x in zip(value, row[j])]
      value = [round(i/100, 5) for i in value]
      values.append(value)
    df['final_prob'] = values
    return df

In [14]:
forest = RandomForest(ml_task = 'classification', n_estimators = 100, n_bootstrap = 0.6, max_features=3, max_depth = 15, criterion = 'gini')

In [15]:
forest.fit(X_train, y_train)

'Training completed'

In [16]:
preds, df_predictions_probability = forest.predict(X_test)

In [17]:
probabilities = [i for i in df_predictions_probability.values[:, -1]]

In [20]:
probabilities[:5]

[[0.0, 0.88, 0.12],
 [0.0, 0.0, 1.0],
 [0.0, 0.82, 0.18],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0]]

In [18]:
from sklearn.metrics import roc_auc_score
print('The AUC score measured by one class versus rest classes is: ',roc_auc_score(preds.values[:, -1],probabilities, multi_class = 'ovo'))

The AUC score measured by one class versus rest classes is:  1.0


In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test.values, preds.values[:, -1]))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91        12
           1       0.88      0.94      0.91        16
           2       0.91      1.00      0.95        10

    accuracy                           0.92        38
   macro avg       0.93      0.92      0.92        38
weighted avg       0.93      0.92      0.92        38



# regression 

In [21]:
df = pd.read_csv('/content/sample_data/insurance.csv')
# df = df.drop("Id", axis=1)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [23]:
df['sex'] = encoder.fit_transform(df['sex'])
df['smoker'] = encoder.fit_transform(df['smoker'])
df['region'] = encoder.fit_transform(df['region'])

In [24]:
X = df[df.columns.to_list()[:-1]]
y = df['charges']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)#, test_size = 0.32)

In [25]:
forest = RandomForest(ml_task = 'regression', n_estimators = 100, n_bootstrap = 0.6, max_features=3, max_depth = 10, criterion = 'mse')

In [26]:
forest.fit(X_train, y_train)

'Training completed'

In [27]:
preds = forest.predict(X_train)

In [28]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_train, preds.values[:, -1]))

2955.8851274534636

In [None]:
# tree = DescisionTree(max_depth = 3, criterion = 'mse', ml_task = 'regression', min_samples_leaf = 1, min_samples_split = 2, max_features = 6)
# complete_tree = tree.fit(X_train, y_train) 

In [29]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, forest.predict(X_test).values[:, -1]))

4609.576539446336

# sklearn random forest

In [30]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_jobs = -1, n_estimators = 100, max_features= 3)

In [31]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_features=3, n_jobs=-1)

In [32]:
np.sqrt(mean_squared_error(y_test, rf.predict(X_test)))

4598.288338378989

# backup tree

In [None]:
# class DescisionTree:
#   def __init__(self, min_samples_leaf, min_samples_split, max_depth, criterion, max_features, ml_task):
#     self.data = None
#     self.X = None
#     self.y = None
#     self.max_features = max_features
#     self.ml_task = ml_task
#     self.min_samples_leaf = min_samples_leaf
#     self.min_samples_split = min_samples_split
#     self.max_depth = max_depth
#     self.metric = criterion
#     self.feature_importances_ = None
#     self.complete_tree = None
#     self.n_entries = {}
#     self.n_weighted_entries = {}
#     self.parent_node = 1  # root node
#     self.yes_node = 2     # left node
#     self.no_node = 3      # right node
#     self.leaf_count = 0
#     if ml_task == 'classification': self.classes_and_counts = {};self.leaf_node_class_proba = {}
#     else: self.leaf_node_loss = {}

#   ''' This method is used to get the collective counts of all classes in target '''
#   def get_classes_and_counts(self, data):
#     label_column = data[:, -1]
#     unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
#     for i in range(len(unique_classes)):
#       self.classes_and_counts[unique_classes[i]] = counts_unique_classes[i]

#   ''' This method is used to get the collective probabilities of all classes in target '''
#   def get_probability_for_all_classes(self, data):
#     label_column = data[:, -1]
#     unique_classes_new, counts_unique_classes_new = np.unique(label_column, return_counts=True)
    
#     classes_and_counts_new = {}
#     for i in list(self.classes_and_counts.keys()):
#       if i in list(unique_classes_new):
#         classes_and_counts_new[i] = counts_unique_classes_new[list(unique_classes_new).index(i)]
#       else:
#         classes_and_counts_new[i] = 0
#     array = np.array(list(classes_and_counts_new.values())) / sum(classes_and_counts_new.values())

#     return [round(i, 5) for i in array]


#   ''' This method checks the purity of a target vector '''
#   def check_purity(self, data):
#       label_column = data[:, -1]
#       unique_classes = np.unique(label_column)
#       if len(unique_classes) == 1:
#           return True
#       else:
#           return False


#   ''' This method performs classification '''
#   def create_leaf(self, data, ml_task, current_node):
#       self.leaf_count += 1
#       label_column = data[:, -1]
#       if ml_task == "regression":
#           leaf = np.mean(label_column)
#           self.leaf_node_loss[current_node] = self.mse(data)
#           return str(leaf) + ' Node: '+str(current_node)
#       else:
#           probabilities = []
#           unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
#           index = counts_unique_classes.argmax()
#           leaf = unique_classes[index]
#           # probability = counts_unique_classes[index] / sum(counts_unique_classes)
#           self.leaf_node_class_proba[current_node] = self.get_probability_for_all_classes(data)
          
#           return str(leaf) + ' Node: '+str(current_node)
    
    
#   ''' this function generates all possible potential splits for a given training data '''
#   def get_potential_splits(self, data, random_subspace):  # randomly selecting certain features
#     potential_splits = {}
#     _, n_columns = data.shape
#     column_indices = list(range(n_columns - 1))    # excluding the last column which is the label
#     if random_subspace and random_subspace <= len(column_indices):
#         column_indices = random.sample(population=column_indices, k=random_subspace)
#     for column_index in column_indices:          
#         values = data[:, column_index]
#         unique_values = np.unique(values)
#         potential_splits[column_index] = unique_values

#     return potential_splits
    
    
#   ''' This function splits the data into two partitions: Yes and no cases'''
#   def split_data(self, data, split_column, split_value):
#       split_column_values = data[:, split_column]
#       type_of_feature = FEATURE_TYPES[split_column]

#       if type_of_feature == "continuous":
#           data_below = data[split_column_values <= split_value]
#           data_above = data[split_column_values >  split_value]
#       else:
#           data_below = data[split_column_values == split_value]
#           data_above = data[split_column_values != split_value]
      
#       return data_below, data_above
    
    
#   ''' This method calculates mse loss'''
#   def mse(self, data):
#       actual_values = data[:, -1]
#       if len(actual_values) == 0:   # empty data
#           mse = 0
#       else:
#           prediction = np.mean(actual_values)
#           mse = np.mean((actual_values - prediction) **2)
      
#       return mse

#   ''' This method calculates entropy loss '''
#   def entropy(self, data):
#       label_column = data[:, -1]
#       _, counts = np.unique(label_column, return_counts=True)
#       probabilities = counts / counts.sum()
#       entropy = sum(probabilities * -np.log2(probabilities))
      
#       return entropy


#   ''' This method calculates gini impurity'''
#   def gini(self, data):
#     label_column = data[:, -1]
#     _, counts = np.unique(label_column, return_counts=True)
#     probabilities = counts / counts.sum()
#     gini_index = 0

#     for i in probabilities:
#       gini_index += i ** 2
    
#     return 1 - gini_index


#   ''' calculating total/weighed value of the used metric '''
#   def calculate_overall_metric(self, data_below, data_above, metric_function):
#     n = len(data_below) + len(data_above)
#     p_data_below = len(data_below) / n
#     p_data_above = len(data_above) / n
#     # weighted MSE, RMSE, Gini, and entropy
#     overall_metric =  (p_data_below * metric_function(self, data_below) 
#                      + p_data_above * metric_function(self, data_above))
    
#     return overall_metric


  
#   ''' Determining which split is the best by using the metric '''
#   def determine_best_split(self, data, potential_splits, ml_task, criterion):
#     first_iteration = True
#     for column_index in potential_splits:
#         for value in potential_splits[column_index]:
#             data_below, data_above = self.split_data(data, split_column=column_index, split_value=value)
            
        
#             current_overall_metric = self.calculate_overall_metric(data_below, data_above, metric_function=criterion)
#             if first_iteration or current_overall_metric <= best_overall_metric:
#                 first_iteration = False
#                 best_overall_metric = current_overall_metric
#                 best_split_column = column_index
#                 best_split_value = value
    
#     return best_split_column, best_split_value


#   ''' determining the type of a feature among all features '''
#   def determine_type_of_feature(self, df):
#     feature_types = []
#     n_unique_values_treshold = 10

#     for feature in df.columns:
#         if feature != "label":
#             unique_values = df[feature].unique()
#             example_value = unique_values[0]
#             if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
#                 feature_types.append("categorical")
#             else:
#                 feature_types.append("continuous")
    
#     return feature_types
 
    
#   ''' THIS IS THE MAIN RECURSIVE ALGORITHM FOR DESCISION TREE'''

#   def tree(self, df, ml_task, counter,current_node, min_samples_leaf, min_samples_split,max_depth, criterion, answer, max_features):

#     # When the tree starts, the dataframe is converteed to numpy array, the depth of the tree is checked using counter variable and all features types are detected
#     if counter == 0:
#         global COLUMN_HEADERS, FEATURE_TYPES
#         COLUMN_HEADERS = df.columns
#         FEATURE_TYPES = self.determine_type_of_feature(df)
#         data = df.values
#         criterion = getattr(DescisionTree, criterion)
#     else:
#         data = df      

#     # storing the length of data passed into the node
#     self.n_entries['Node: '+str(current_node)] = [len(df)]
#     # criterion = getattr(DescisionTree, criterion)

#     # storing the loss/mse/rmse/gini/entropy in a specific node
#     self.n_entries['Node: '+str(current_node)].append(criterion(self, data))
    
#     # incrementing yes/left nodes and no/right nodes such that yes will be a even node and no will be a odd node repectively
#     if (answer == 'yes answer'):
#       self.yes_node += 2
#     elif (answer == 'no answer'):
#       self.no_node += 2 
      
#     # checking if that target of the data passed is either pure, has minimum samples to create a leaf, or the depth of tree has reached its maximum depth
#     if (self.check_purity(data)) or (len(data) == min_samples_leaf) or (counter == max_depth):
#         leaf = self.create_leaf(data, ml_task, current_node) # creating the leaf
#         return leaf 
#     # if above requirements to create a leaf are not met, two new nodes will be created recursively respectively.
#     else:    
#         counter += 1 # when two new nodes are created, the depth of three is also incremented
        
#         # if the data is not yet pure, but has not minimum samples to perform the split, a leaf is created
#         if (len(data) < min_samples_split):
#           leaf = self.create_leaf(data, ml_task, current_node)
#           return leaf
#         else:
#           # getting the all possible splits, determining which split has least loss, and splitting the data into left and right nodes respectively
#           potential_splits = self.get_potential_splits(data, max_features)
#           split_column, split_value = self.determine_best_split(data, potential_splits, ml_task,criterion)
#           data_below, data_above = self.split_data(data, split_column, split_value)
          
#           # if the data seperated into left and right nodes, but there is no data, instead of creating a node, a leaft is created
#           if len(data_below) == 0 or len(data_above) == 0:
#               leaf = self.create_leaf(data, ml_task, current_node)
#               return leaf
          
#           # finding the type of a selected feature column and its name
#           feature_name = COLUMN_HEADERS[split_column]
#           type_of_feature = FEATURE_TYPES[split_column]

# #           # creating the tree questions
#           if type_of_feature == "continuous":
#               question = "{} <= {} (Node: {})".format(feature_name, split_value, current_node)
#           # feature is categorical
#           else:
#               question = "{} = {} (Node: {})".format(feature_name, split_value, current_node)

#           # instantiate sub-tree
#           sub_tree = {question: []}

#           # creating left and right nodes recursively
#           yes_answer = self.tree(data_below, ml_task, counter, self.yes_node,min_samples_leaf, min_samples_split,max_depth, criterion, 'yes answer', max_features)
#           no_answer = self.tree(data_above, ml_task, counter, self.no_node,min_samples_leaf,min_samples_split, max_depth, criterion, 'no answer', max_features)
          
#           # if both left and right nodes are same, only taking one value for a leaf node
#           if yes_answer == no_answer:
#               sub_tree = yes_answer
#           else:
#             sub_tree[question].append(yes_answer)
#             sub_tree[question].append(no_answer)
          
#           return sub_tree

    

#   ''' making probability of the predictions using this tree '''
#   def predict_example_probability(self, example, tree):
#     question = list(tree.keys())[0]
#     feature_name, comparison_operator, value = question.split(" ")[:3]

#     # ask question
#     if comparison_operator == "<=":
#         if example[feature_name] <= float(value):
#             answer = tree[question][0]
#         else:
#             answer = tree[question][1]
#     # feature is categorical
#     else:
#         if str(example[feature_name]) == value:
#             answer = tree[question][0]
#         else:
#             answer = tree[question][1]
#     # base case
#     if not isinstance(answer, dict):
#         return int(answer.split()[2])
            
#     # recursive part
#     else:
#         residual_tree = answer
#         return self.predict_example_probability(example, residual_tree)

    
#   ''' fitting our tree with training data '''
#   def fit(self, X,y):
#     self.X = X.copy();self.y = y.copy()
#     self.X['label'] = self.y;self.data = self.X

#     if self.ml_task == 'classification':
#       self.get_classes_and_counts(self.data.values)
#     self.complete_tree = self.tree(self.data, self.ml_task, 0,self.parent_node, self.min_samples_leaf, self.min_samples_split, self.max_depth, self.metric,'parent_node', self.max_features)

#     # calculating weighted entries
#     for key, value in self.n_entries.items():
#       self.n_weighted_entries[key] = [value[0] / len(self.X), value[1]]

#     return self.complete_tree

# Backup forest

In [None]:
# class RandomForest:
#   def __init__(self,criterion, n_estimators, n_bootstrap, ml_task, max_depth, max_features, min_samples_leaf = 1, min_samples_split = 2):
#     self.n_estimators = n_estimators
#     self.n_bootstrap = n_bootstrap
#     self.max_features = max_features
#     self.ml_task = ml_task
#     self.min_samples_leaf = min_samples_leaf
#     self.min_samples_split = min_samples_split
#     self.max_depth = max_depth
#     self.metric = criterion
#     self.forest = []
#     self.X = None
#     self.y = None
#     self.each_tree_with_class_probabilities = []
#     self.n_classes = None



#   ''' Performing boostraping '''
#   def bootstrapping(self, X,y, n_bootstrap):
#     bootstrap_indices = np.random.randint(low=0, high=len(X), size=int(n_bootstrap * len(X)))
#     bootstraped_X = X.iloc[bootstrap_indices]
#     bootstraped_y = y.iloc[bootstrap_indices]
    
#     return bootstraped_X, bootstraped_y


#   ''' Training our model '''
#   def fit(self, X, y):
#     self.X = X.copy(); self.y = y.copy()
#     self.n_classes = len(np.unique(self.y.values))
#     for i in range(self.n_estimators):
#       bootstraped_X, bootstraped_y= self.bootstrapping(self.X,self.y, self.n_bootstrap)
#       tree = DescisionTree(min_samples_leaf = self.min_samples_leaf, min_samples_split = self.min_samples_split, max_depth =  self.max_depth, \
#                            criterion = self.metric, max_features = self.max_features, ml_task = self.ml_task)
#       complete_tree = tree.fit(bootstraped_X, bootstraped_y)
#       # getting probabilities for each class for individual classes
#       if self.ml_task == 'classification':
#         probabilities_of_each_nodes = tree.leaf_node_class_proba
#         self.forest.append(complete_tree)
#         self.each_tree_with_class_probabilities.append(probabilities_of_each_nodes)
#       else:
#         self.forest.append(complete_tree)
    
#     return 'Training completed'

#   ''' Making individual predictions '''
#   def predict_example(self, example, tree):
#     question = list(tree.keys())[0]
#     feature_name, comparison_operator, value = question.split(" ")[:3]

#     # ask question
#     if comparison_operator == "<=":
#         if example[feature_name] <= float(value):
#             answer = tree[question][0]
#         else:
#             answer = tree[question][1]
#     # feature is categorical
#     else:
#         if str(example[feature_name]) == value:
#             answer = tree[question][0]
#         else:
#             answer = tree[question][1]
#     # base case
#     if not isinstance(answer, dict):
#         return float(answer.split()[0])
    
#     # recursive part
#     else:
#         residual_tree = answer
#         return self.predict_example(example, residual_tree)



#   ''' Making predictions on testing dataframe '''
#   def decision_tree_predictions(self, test_df, tree):
#       predictions = test_df.apply(self.predict_example, args=(tree,), axis=1)
#       return predictions


#   ''' Method for making predictions ''' 
#   def predict(self, test_df):
#     df_predictions = {}
#     df_predictions_probability = {}
#     for i in range(len(self.forest)):
#         column_name = "tree_{}".format(i)
#         predictions = self.decision_tree_predictions(test_df, tree=self.forest[i])

#         df_predictions[column_name] = predictions

#         if (self.ml_task == 'classification'):
#           probabilities = self.predict_proba(test_df, self.forest[i], self.each_tree_with_class_probabilities[i])
#           df_predictions_probability[column_name] = probabilities


#     df = pd.DataFrame(df_predictions)#.reset_index(drop = True)

#     if self.ml_task == 'classification':
#       df['Prediction'] = [df.iloc[j].mode()[0] for j in range(len(df))]
#       return df, self.final_probabilities(pd.DataFrame(df_predictions_probability))
#     else:
#       df['Prediction'] = [df.iloc[j].mean() for j in range(len(df))]    
#       return df


#   ''' making probability of the predictions using this tree '''
#   def predict_example_probability(self, example, tree):
#     question = list(tree.keys())[0]
#     feature_name, comparison_operator, value = question.split(" ")[:3]

#     # ask question
#     if comparison_operator == "<=":
#         if example[feature_name] <= float(value):
#             answer = tree[question][0]
#         else:
#             answer = tree[question][1]
#     # feature is categorical
#     else:
#         if str(example[feature_name]) == value:
#             answer = tree[question][0]
#         else:
#             answer = tree[question][1]
#     # base case
#     if not isinstance(answer, dict):
#         return int(answer.split()[2])
            
#     # recursive part
#     else:
#         residual_tree = answer
#         return self.predict_example_probability(example, residual_tree)


#   ''' This method returns the predicted probabilities for all classes '''
#   def predict_proba(self, X, complete_tree, leaf_node_class_proba):
#     leaf_nodes_for_predictions = np.array(X.apply(self.predict_example_probability, axis = 1, args = (complete_tree, )))
#     probabilities = [] 
    
    
#     for i in leaf_nodes_for_predictions:
#       for key, value in leaf_node_class_proba.items():
#         if i == key:
#           probabilities.append(list(value))

#     return probabilities

#   def final_probabilities(self, df):
#     columns = df.columns.to_list()
#     values = []
#     for i in range(len(df)):
#       value = [0 for i in range(self.n_classes)]
#       row = df.iloc[i]
#       for j in columns:
#         value = [sum(x) for x in zip(value, row[j])]
#       value = [round(i/100, 5) for i in value]
#       values.append(value)
#     df['final_prob'] = values
#     return df