**Decision Trees and Random Forest**

In [77]:
import numpy as np
import pandas as pd

class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

class DecisionTreeClassifier:
  
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self.grow_tree(X, y)

    def grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self.best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self.grow_tree(X_left, y_left, depth + 1)
                node.right = self.grow_tree(X_right, y_right, depth + 1)
        return node

    def best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):  
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in range(self.n_classes_))
                gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_))
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2  
        return best_idx, best_thr

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

def RandomForest(X_Train,Y_Train,X_Test):
    numberSamples = X_Train.shape[0]
    featuresCount = X_Train.shape[1]
    featuresInDecisionTree = int((np.ceil(np.sqrt(featuresCount))))
    # featuresInDecisionTree = 12
    samplesInDecisionTree = int(np.ceil(0.8 * numberSamples))
    randomForestSize = 7
    randomForest = []
    for i in range(randomForestSize):
        selectedFeatures = np.random.randint(featuresCount, size=featuresInDecisionTree)
        selectedSamples = np.random.randint(numberSamples, size=samplesInDecisionTree)
        Xselected = X_Train[selectedSamples][:, selectedFeatures]
        Yselected = Y_Train[selectedSamples]
        tree = DecisionTreeClassifier(max_depth = 3)
        tree.fit(Xselected,Yselected)
        randomForest.append(tree)  
    predictions = {}
    for i in range(len(randomForest)):
        column = "decision tree" + str(i)
        predictions[column] = randomForest[i].predict(X_Test)
    predictions = pd.DataFrame(predictions)
    return np.asarray(predictions.mode(axis = 1)[0])

def normalization(X_train):
    X_train= (X_train - X_train.mean()) / (X_train.max() - X_train.min())
    return X_train

def ConfusionMatrix(y_true,y_pred):
    no_of_classes = len(np.unique(Y))
    array = y_true * no_of_classes + y_pred
    x = np.histogram(array, bins=range(min(array), (pow(no_of_classes, 2) + (min(array) + 1))))
    cm = np.asarray(x[0]).reshape(no_of_classes, no_of_classes)
    return cm

def Accuracy(y_true,y_pred):
    correct = 0
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            correct += 1
    accuracy = correct / float(len(y_true)) * 100.0
    return accuracy

def Recall(y_true,y_pred):
    cm = ConfusionMatrix(y_true, y_pred)
    recall = np.diag(cm) / np.sum(cm, axis=1) * 100.0
    return np.mean(recall)

def Precision(y_true,y_pred):
    cm = ConfusionMatrix(y_true, y_pred)
    precision = np.diag(cm) / np.sum(cm, axis=0) * 100.0
    return np.mean(precision)

def F1_Score(y_true,y_pred):
    precision = Precision(y_true,y_pred)
    recall = Recall(y_true,y_pred)
    score = 2.0 * ((precision * recall)/(precision + recall))
    return score/100

#main
datasets = ['project3_dataset1','project3_dataset2']
# datasets = ['project3_dataset1']
for i in datasets:
  df = pd.read_csv('/content/drive/My Drive/DM Proj3/'+i+'.txt', sep="\t", header=None)
  X = df.iloc[:, :-1]
  for i, j in X.iteritems():
        if j.dtype == np.object:
          X = pd.concat([X, pd.get_dummies(j, prefix=i)], axis = 1)
          X.drop([i],axis=1, inplace=True)
  Y = df.iloc[:,-1]
  X = normalization(X)
  X = np.asarray(X)
  Y = np.asarray(Y)
  train_dataset = np.array_split(X,10)
  ground_truth = np.array_split(Y,10)
  accuracy_dt = accuracy_rf = 0.0
  precision_dt = precision_rf = 0.0
  recall_dt = recall_rf = 0.0
  f1_score_dt = f1_score_rf = 0.0
  for i in range(10):
    X_Train = np.array(np.concatenate([y for (x,y) in enumerate(train_dataset,0) if x != i],axis=0))
    Y_Train =  np.array(np.concatenate([y for (x,y) in enumerate(ground_truth,0) if x != i],axis=0))
    X_Test = np.array(train_dataset[i])
    Y_Test = np.array(ground_truth[i])

    #Decision Tree
    clf = DecisionTreeClassifier(max_depth=3)
    clf.fit(X_Train,Y_Train)
    pred_labels_dt = clf.predict(X_Test)
    accuracy_dt += Accuracy(Y_Test,pred_labels_dt)
    precision_dt += Precision(Y_Test,pred_labels_dt)
    recall_dt += Recall(Y_Test,pred_labels_dt)
    f1_score_dt += F1_Score(Y_Test,pred_labels_dt)

    #Random Forest
    pred_labels_rf = RandomForest(X_Train,Y_Train,X_Test)
    accuracy_rf += Accuracy(Y_Test,pred_labels_rf)
    precision_rf += Precision(Y_Test,pred_labels_rf)
    recall_rf += Recall(Y_Test,pred_labels_rf)
    f1_score_rf += F1_Score(Y_Test,pred_labels_rf)
  print('Evaluation Results for Decision Tree')
  print('Accuracy:',accuracy_dt/10)
  print('Precision:',precision_dt/10)
  print('Recall:',recall_dt/10)
  print('F1 Score:',f1_score_dt/10)
  print('Evaluation Results for Random Forest')
  print('Accuracy:',accuracy_rf/10)
  print('Precision:',precision_rf/10)
  print('Recall:',recall_rf/10)
  print('F1 Score:',f1_score_rf/10)

Evaluation Results for Decision Tree
Accuracy: 92.78508771929825
Precision: 92.6219853341017
Recall: 91.62996027492564
F1 Score: 0.9211562352441531
Evaluation Results for Random Forest
Accuracy: 90.50438596491229
Precision: 90.08288346887953
Recall: 89.13928422013707
F1 Score: 0.8960554841670845
Evaluation Results for Decision Tree
Accuracy: 69.90286771507864
Precision: 66.67747826870945
Recall: 63.78491608742282
F1 Score: 0.6516852030222772
Evaluation Results for Random Forest
Accuracy: 66.22109158186865
Precision: 63.05605261631921
Recall: 54.53497797555523
F1 Score: 0.5830513822248238
