# Assignment 1 - Computational Learning Course -  Apr-2023

<br>Roei Zaady	318747946</br>
<br>Omer Yanai	 024093866</br>

## Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

###############
from sklearn.base import BaseEstimator
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
###############

# Seed NumPy's global RNG: MyBaggingID3.fit draws its bootstrap rows and feature subsets with np.random.choice.
np.random.seed(1234) 

## 1,2,3,4 - Entropy, dataset split, information gain and best split

In [2]:
# 1 - Calculate entropy
def compute_entropy(y):
    """Return the base-2 entropy of a collection of binary (0/1) labels.

    Parameters
    ----------
    y : array-like or scalar
        Labels of the samples in a node. The fraction of ones is computed as
        ``sum(y) / len(y)``, so the labels are expected to be 0/1.
        A bare scalar (Python or NumPy) is treated as a single value.

    Returns
    -------
    float
        ``-p1*log2(p1) - (1-p1)*log2(1-p1)`` where ``p1`` is the fraction of
        ones; ``0.0`` for a scalar, an empty collection, or whenever ``p1``
        is not strictly between 0 and 1 (pure node).
    """
    labels = np.asarray(y)
    # A single value or an empty node carries no uncertainty.
    if labels.ndim == 0 or labels.shape[0] == 0:
        return 0.0
    p1 = labels.sum() / labels.shape[0]  # fraction of samples labelled 1
    if not ((p1 > 0) and (p1 < 1)):
        return 0.0
    return float(-p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1))


# 2 - Split dataset
def split_dataset(X, node_indices, feature):
    """Partition a node's sample indices by the value of one binary feature.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix; only rows ``node_indices`` of column ``feature`` are read.
    node_indices : sequence of int
        Row indices of the samples that belong to the node.
    feature : int
        Column of ``X`` to split on.

    Returns
    -------
    (left_indices, right_indices) : tuple of lists
        Indices whose feature value equals 1 (left) and 0 (right).
        Rows holding any other value appear in neither list.
    """
    rows = np.array(node_indices)
    column = X[node_indices, feature]
    goes_left = column == 1
    goes_right = column == 0
    return list(rows[goes_left]), list(rows[goes_right])


# 3 - calculate information gain
def compute_information_gain(X, y, node_indices, feature):
    """Return the information gain of splitting a node on one binary feature.

    The gain is the entropy of the node's labels minus the size-weighted
    entropies of the two children produced by ``split_dataset``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Labels; indexed with lists of row indices.
    node_indices : sequence of int
        Row indices of the samples that belong to the node.
    feature : int
        Column of ``X`` to evaluate.

    Returns
    -------
    float
        The information gain; ``0.0`` for an empty node (previously a
        ``ZeroDivisionError``).
    """
    node_size = len(node_indices)
    if node_size == 0:
        # Nothing to split, and dividing by the node size below would fail.
        return 0.0
    left, right = split_dataset(X, node_indices, feature)
    parent_entropy = compute_entropy(np.array(y[node_indices]))
    children_entropy = (len(left) / node_size * compute_entropy(y[left])
                        + len(right) / node_size * compute_entropy(y[right]))
    return parent_entropy - children_entropy


# 4 - Get best split
def get_best_split(X, y, node_indices):
    """Return the column index of the feature with the highest information gain.

    Every column of ``X`` is scored with ``compute_information_gain`` on the
    samples in ``node_indices``; ties go to the lowest column index.

    Returns 0 when the node holds at most one sample or ``X`` has no columns,
    so callers cannot distinguish "feature 0 is best" from "nothing to split".
    """
    if len(y[node_indices]) <= 1:
        return 0
    gains = [compute_information_gain(X, y, node_indices, column)
             for column in range(X.shape[1])]
    if not gains:
        return 0
    return gains.index(max(gains))

## 5 - ID3 decision tree

In [3]:
# 5 - Building a Tree
class MyID3(BaseEstimator):
    """ID3 decision tree for binary (0/1) features and binary (0/1) labels.

    Every node of the tree is itself a ``MyID3`` instance. On a node,
    ``max_depth`` is the number of further levels that may still be grown
    below it (``None`` means unlimited); children are created with
    ``max_depth - 1``.

    Attributes set by ``build_node`` (per node)
    -------------------------------------------
    left_node_, right_node_ : MyID3 or None
        Children for feature value 1 (left) and 0 (right); both None on a leaf.
    values_ : list of int
        ``[number of samples labelled 0, number of samples labelled 1]``.
    entropy_ : float
        Entropy of the node's labels.
    feature_split_on_ : int or None
        Best-gain column, relative to the columns this node received (each
        split removes the used column before recursing). None when no split
        was evaluated; it may also be set on a leaf whose best gain was 0.
    """

    # Must live on the class to be visible to tooling; it used to be a dead
    # local variable inside __init__.
    _estimator_type = 'classifier'

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def build_node(self, X, y, node_indices):
        """Grow the (sub)tree rooted at this node and return ``self``.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_remaining_features)
            Feature matrix restricted to the columns not yet used on the path
            from the root.
        y : ndarray of shape (n_samples,)
            Labels of all samples.
        node_indices : list of int
            Rows of ``X``/``y`` that reached this node.
        """
        self.right_node_ = None  # child for feature value 0
        self.left_node_ = None  # child for feature value 1
        self.feature_split_on_ = None
        node_labels = y[node_indices]
        self.entropy_ = compute_entropy(node_labels)
        # [num of zeros, num of ones]
        self.values_ = [int(np.count_nonzero(node_labels == 0)),
                        int(np.count_nonzero(node_labels == 1))]

        # Depth budget: None = unlimited, otherwise children get one level less.
        if self.max_depth is None:
            child_depth = None
        elif self.max_depth > 0:
            child_depth = self.max_depth - 1
        else:
            return self  # budget exhausted -> leaf

        # Pure node -> leaf.
        if self.values_[0] == 0 or self.values_[1] == 0:
            return self

        # Every feature was already consumed on the way down (identical rows
        # with conflicting labels): nothing left to split on. Without this
        # guard the gain computation below indexes column 0 of an empty matrix.
        if X.shape[1] == 0:
            return self

        self.feature_split_on_ = get_best_split(X, y, node_indices)
        if compute_information_gain(X, y, node_indices, self.feature_split_on_) == 0:
            return self  # no feature separates the classes -> leaf

        left_indices, right_indices = split_dataset(X, node_indices, self.feature_split_on_)
        if len(left_indices) > 0 and len(right_indices) > 0:
            # A binary feature is fully used after one split, so drop it for
            # the children (predict_proba mirrors this by popping the value).
            remaining = [column for column in range(X.shape[1])
                         if column != self.feature_split_on_]
            child_X = X[:, remaining]
            self.left_node_ = MyID3(max_depth=child_depth)
            self.left_node_.build_node(child_X, y, left_indices)
            self.right_node_ = MyID3(max_depth=child_depth)
            self.right_node_.build_node(child_X, y, right_indices)
        return self

    def fit(self, X, y):
        """Validate the training data, build the tree and return ``self``.

        ``y`` may be a column vector; it is flattened before validation.
        """
        X, y = check_X_y(X, np.asarray(y).ravel())
        self.classes_ = unique_labels(y)
        self.build_node(X, y, list(range(X.shape[0])))
        return self

    def predict_proba(self, X):
        """Return class probabilities taken from the leaf a sample falls into.

        Parameters
        ----------
        X : array-like
            A single sample of shape ``(n_features,)`` or a matrix of shape
            ``(n_samples, n_features)``.

        Returns
        -------
        ndarray
            For a single sample: shape ``(2, 1)`` holding ``[[P(0)], [P(1)]]``
            (the layout MyBaggingID3 unpacks). For a matrix: shape
            ``(n_samples, 2)`` with one ``[P(0), P(1)]`` row per sample.
        """
        X = np.asarray(X)
        if X.ndim > 1:
            rows = [self.predict_proba(sample).ravel() for sample in X]
            return np.array(rows, dtype=float).reshape(-1, 2)

        remaining = list(X)  # private copy: used feature values are popped
        node = self
        while node.right_node_ is not None:
            feature = node.feature_split_on_
            node = node.right_node_ if remaining[feature] == 0 else node.left_node_
            # Children were trained without this column; keep indices aligned.
            remaining.pop(feature)

        zeros, ones = node.values_
        total = zeros + ones
        # The majority share is computed directly and the other as its complement.
        if zeros >= ones:
            p_zero = zeros / total
            p_one = 1 - p_zero
        else:
            p_one = ones / total
            p_zero = 1 - p_one
        return np.array([[p_zero], [p_one]])

    def predict(self, X):
        """Predict the most probable class index (0 or 1); ties go to 0.

        Returns an ``int`` for a single 1-D sample and a list of ``int``
        (one per row) for a 2-D matrix.
        """
        check_is_fitted(self)
        if np.ndim(X) == 1:
            return int(np.argmax(self.predict_proba(np.asarray(X))))
        X = check_array(X)
        return [int(np.argmax(self.predict_proba(sample))) for sample in X]

    def tree_mapper(self):  # get num of leaves in tree
        """Return the number of leaves in the subtree rooted at this node."""
        if self.right_node_ is None:
            return 1
        return self.right_node_.tree_mapper() + self.left_node_.tree_mapper()

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {'max_depth': self.max_depth}

## 6 - Bagging

In [4]:
# 6 - Bagging
class MyBaggingID3(BaseEstimator):
    """Bagging ensemble of ``MyID3`` trees.

    Parameters
    ----------
    n_estimators : int
        Number of trees to fit.
    max_samples : float or int
        Each tree is trained on ``int(n_samples * max_samples)`` rows drawn
        with replacement (at least one row).
    max_features : float or int
        When below 1, each tree sees ``int(n_features * max_features)`` columns
        drawn without replacement (at least one); otherwise all columns.
    max_depth : int or None
        Forwarded to every ``MyID3``.

    Attributes set by ``fit``
    -------------------------
    trees_array_ : list of MyID3
        The fitted trees.
    features_ : list of ndarray
        For each tree, the column indices of ``X`` it was trained on.
    """

    def __init__(self, n_estimators=3, max_samples=1, max_features=1, max_depth=None):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.max_depth = max_depth

    def fit(self, X, y):
        """Fit ``n_estimators`` trees on bootstrap samples and return ``self``.

        Sampling uses NumPy's global random state (``np.random.choice``).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self.trees_array_ = []
        self.features_ = []
        for _ in range(self.n_estimators):
            # Bootstrap rows; never let the fraction truncate to an empty sample.
            sample_size = max(1, int(n_samples * self.max_samples))
            row_idx = np.random.choice(n_samples, size=sample_size, replace=True)
            X_boot = X[row_idx, :]
            y_boot = y[row_idx]

            if self.max_features < 1:
                # Random feature subset; keep at least one column.
                n_picked = max(1, int(n_features * self.max_features))
                feature_idx = np.random.choice(n_features, n_picked, replace=False)
                X_boot = X_boot[:, feature_idx]
            else:
                feature_idx = np.arange(n_features)
            self.features_.append(feature_idx)

            tree = MyID3(max_depth=self.max_depth)
            tree.fit(X_boot, y_boot)
            self.trees_array_.append(tree)
        return self

    def predict_proba(self, X):
        """Average the per-tree class probabilities.

        Parameters
        ----------
        X : array-like
            A single sample of shape ``(n_features,)`` or a matrix of shape
            ``(n_samples, n_features)``.

        Returns
        -------
        list
            One ``[P(0), P(1)]`` pair per sample (a single pair for a 1-D
            input). Each probability keeps whatever array/scalar form the
            tree's ``predict_proba`` yields when unpacked.
        """
        X = np.asarray(X)
        samples = [X] if X.ndim == 1 else X
        # Average over the trees actually fitted, which is what is iterated.
        n_trees = len(self.trees_array_)
        scores = []
        for sample in samples:
            zeros, ones = 0, 0
            for tree, feature_idx in zip(self.trees_array_, self.features_):
                # Each tree only knows the columns it was trained on.
                zero_, one_ = tree.predict_proba(sample[feature_idx])
                zeros = zeros + zero_
                ones = ones + one_
            scores.append([zeros / n_trees, ones / n_trees])
        return scores

    def predict(self, X):
        """Predict the class index (0 or 1) with the highest averaged probability.

        Returns a scalar for a single 1-D sample and a list with one entry per
        row for 2-D input. A 2-D input with exactly one row now yields a
        one-element list rather than a bare scalar, so the output length
        always matches the number of rows. Ties go to class 0.
        """
        single_sample = np.ndim(X) == 1
        labels = [int(np.argmax(np.ravel(pair))) for pair in self.predict_proba(X)]
        if single_sample:
            return labels[0]
        return labels

## 7 - Experiment setup (Weights & Biases, datasets)

In [None]:
!pip install wandb -qU        # install Weights&Biasses for data collection and reporting.

import wandb
wandb.login()  #349ecabe67af04ba71e0596c9223822e18cd88d7     #aa5982516882260d70edad6ff1b9a5d9c4e348b9    Token for login.

In [6]:
def prepare_dataset(dataset):
    """Download one of the benchmark datasets and return it as binary features.

    Every feature column is one-hot encoded with ``pd.get_dummies``. Labels are
    integer-encoded with ``LabelEncoder``; the 'monk' labels are used as read.

    Parameters
    ----------
    dataset : str
        One of 'mushroom', 'haberman', 'breast-cancer-wisconsin', 'monk',
        'tic-tac-toe'.

    Returns
    -------
    X : ndarray of shape (n_samples, n_one_hot_columns)
    y : ndarray of shape (n_samples,)

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError at the return statement).
    """
    supported = ('mushroom', 'haberman', 'breast-cancer-wisconsin', 'monk', 'tic-tac-toe')
    if dataset not in supported:
        raise ValueError(f"Unknown dataset {dataset!r}; expected one of {supported}")

    if dataset == 'mushroom':   ### https://www.kaggle.com/datasets/uciml/mushroom-classification
        df = pd.read_csv('https://drive.google.com/uc?id=16iZDKB6GGc6_6UAoyf7Ng8L6Yn6FofQW&export=download')
        features = df.drop(['class'], axis=1)
        y = LabelEncoder().fit_transform(df['class'])

    elif dataset == 'haberman':    ### from UCI
        df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data", names=['A', 'B', 'C', 'D'])
        # The target column 'D' must not be one-hot encoded into X: doing so
        # leaks the label into the features.
        features = df.drop(['D'], axis=1)
        y = LabelEncoder().fit_transform(df['D'])

    elif dataset == 'breast-cancer-wisconsin':    ## from UCI
        df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", names=['id', 'Clump', 'Size', 'Shape', 'Adhesion', 'Epithelial', 'Nuclei', 'Chromatin', 'Normal', 'Mitoses', 'Class'])
        features = df.drop(['Class', 'id'], axis=1)
        y = LabelEncoder().fit_transform(df['Class'])

    elif dataset == 'monk':     ###  from UCI
        df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-2.test", delimiter=' ', names=['label', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'id'])
        features = df.drop(['label', 'id'], axis=1)
        y = df['label'].values  # used as-is, no LabelEncoder

    else:  # 'tic-tac-toe', from UCI
        df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data", names=['1', '2', '3', '4', '5', '6', '7', '8', '9', 'label'])
        features = df.drop(['label'], axis=1)
        y = LabelEncoder().fit_transform(df['label'])

    X = np.array(pd.get_dummies(features, columns=features.columns))
    return X, np.asarray(y).ravel()

## 7.2 - Evaluation

In [11]:
# Load the config file: one row per experiment, i.e. a combination of
# dataset, classifier and hyper-parameters.
df_runs = pd.read_csv(
    'https://drive.google.com/uc?id=1zHPqGjzDbLzjaswQA80OFHUsKRhqR3LH&export=download',
    encoding='UTF-8',
)

# Partial run: uncomment to evaluate only the subset of rows used in the report.
# df_for_table = df_runs.iloc[[5,15,27,37,41,46,53,58,9,19], :]     # partial list for report.
# df_for_table

# Full run: evaluate every configured row.
df_for_table = df_runs

In [None]:
#def run_and_log_classifier(project,name,params):
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Metrics computed on every cross-validation fold (train and test).
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score),
           'auc-roc': make_scorer(roc_auc_score)}
results = []
kfold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2)

# Many rows share a dataset; download and encode each one only once.
dataset_cache = {}

for run in range(df_for_table.shape[0]):
    row = df_for_table.iloc[run]

    # 'depth' holds either a number or the literal "None" (unlimited). Newer
    # pandas versions parse "None" as a missing value, so NaN is accepted too.
    raw_depth = row['depth']
    if pd.isna(raw_depth) or str(raw_depth) == 'None':
        depth = None
    else:
        depth = int(float(raw_depth))

    dataset_name = row['dataset']
    if dataset_name not in dataset_cache:
        dataset_cache[dataset_name] = prepare_dataset(dataset_name)
    X, y = dataset_cache[dataset_name]

    if row['classifier'] == 'MyBaggingID3':
        clf = MyBaggingID3(n_estimators=row['n_estimators'], max_depth=depth,
                           max_features=row['max_features'], max_samples=row['max_samples'])
    else:
        clf = BaggingClassifier(n_estimators=row['n_estimators'],
                                max_features=row['max_features'], max_samples=row['max_samples'],
                                estimator=DecisionTreeClassifier(criterion="entropy", max_depth=depth))

    res = cross_validate(estimator=clf,
                         X=np.array(X),
                         y=y,
                         cv=kfold,
                         scoring=scoring,
                         return_train_score=True,
                         error_score="raise")
    # for running mean values only
    # dic = pd.DataFrame(res).mean().to_dict()
    # tag = [str(x) for x in list(df_for_table.iloc[run,:])]
    # dic.update(dict(zip(df_for_table.columns,tag)))
    # run_name=f'{df_for_table.iloc[run]["dataset"]}_{df_for_table.iloc[run]["classifier"]}'
    # print(run_name)
    # wandb_run = wandb.init(project="CL_EX1", name=run_name)
    # wandb_run.log(dic)

    # for full run: log every fold's value of every metric
    tag = [str(x) for x in list(row)]
    run_name = f'{row["dataset"]}_{row["classifier"]}_{run}'
    print(run_name)
    config = dict(zip(df_for_table.columns, tag))
    wandb_run = wandb.init(project="CL_EX1_1", name=run_name, config=config)
    for key in res.keys():
        for r in range(len(res[key])):
            wandb_run.log({key: res[key][r]})
    # The original `wandb.finish` lacked parentheses and never closed the run.
    wandb_run.finish()

In [9]:
# ## get shape of each dataset
# for ds in ['tic-tac-toe','breast-cancer-wisconsin','mushroom','haberman','monk']:
#   x,y=prepare_dataset(ds)
#   print(ds,x.shape)