In [54]:
from random import seed
from random import randrange
from csv import reader
import numpy as np
from sklearn.model_selection import train_test_split
import math
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [64]:
class RandomForest:
    def __init__(self, sampleSize, n_trees,n_features, maxDepth,partitions,minSize):
        self.maxDepth = maxDepth
        self.partitions = partitions
        self.minSize = minSize
        self.sampleSize = sampleSize
        self.n_trees = n_trees
        self.n_features = n_features
        self.dt = DecisionTree(self.maxDepth,
                            self.partitions,
                            self.minSize,
                            self.n_features)
        
    def run(self,train_set, test_set):
        trees = list()
        np.random.seed(11)
        for i in range(self.n_trees):
            sample = self.subsample(train_set)
            tree = self.dt.buildDecisionTree(sample)
            trees.append(tree)
        predictions = [self.baggingPredict(trees, row) for row in test_set]
        actual = np.array(test_set)[:,len(test_set[0])-1]
        Accuracy,Precision,Recall,F1_Score = self.dt.evaluatePerformance(actual, predictions)
        return Accuracy,Precision,Recall,F1_Score
             
    # Make a prediction with a list of bagged trees
    def baggingPredict(self,trees, row):
        predictions = [self.dt.predict(tree, row) for tree in trees]
        return max(set(predictions), key=predictions.count)

    def subsample(self,dataset):
        sample = list()
        n_sample = round(len(dataset) * self.sampleSize)
        while len(sample) < n_sample:
            index = randrange(len(dataset))
            sample.append(dataset[index])
        return sample

In [65]:
class DecisionTree:
    def __init__(self, maxDepth,partitions,minSize,n_features = None):
        self.maxDepth = maxDepth
        self.partitions = partitions
        self.minSize = minSize
        self.n_features = n_features
         
    # Calculate accuracy percentage
    def calculateAccuracyMetrics(self,actual, predicted):
        correct = 0
        for i in range(len(actual)):
            if actual[i] == predicted[i]:
                correct += 1
        return correct / float(len(actual)) * 100.0
    
    def evaluatePerformance(self,y_test, predictedLabelList):
        data = {'Predicted': predictedLabelList ,
                'Actual':  y_test  
               }

        df = pd.DataFrame(data, columns=['Actual','Predicted'])

        confusion_matrix = pd.crosstab(df['Actual'], df['Predicted'])

        tn=confusion_matrix[0][0]
        fn=confusion_matrix[0][1]
        fp=confusion_matrix[1][0]
        tp=confusion_matrix[1][1]

        Accuracy=(tp+tn)/(tp+tn+fp+fn)*100
        Precision=(tp)/(tp+fp)
        Recall=(tp)/(tp+fn)
        F1_Score= 2*((Recall*Precision)/(Recall + Precision))
        #print("TP:",tp," TN:",tn," FN:",fn," FP:",fp)
        return Accuracy,Precision,Recall,F1_Score
    
    # Evaluate an algorithm using a cross validation split
    def fit(self,train_set,test_set):
        predicted = self.getDecisionTree(train_set, test_set)
        actual = np.array(test_set)[:,len(test_set[0])-1]
        Accuracy,Precision,Recall,F1_Score = self.evaluatePerformance(actual, predicted)
        return Accuracy,Precision,Recall,F1_Score
    
    # Split a dataset based on an attribute and an attribute value
    def doRandomSplit(self,index, value, dataset):
        left = []
        right = []
        _ = [left.append(row) if row[index] < value else right.append(row) for row in dataset]
        return left, right
    
    # Calculate the Gini index for a split dataset
    def calGiniIndex(self,groups, labels):
        # total gives the summation of all the instances combining all groups
        total = float(sum([len(group) for group in groups]))
        # initialise gini to zero
        gini = 0.0
        for group in groups:
            # gives the elements belonging to each of the group
            group_size = float(len(group))
            # ignores if the size is 0
            if group_size == 0:
                continue
            # initializing score to zero
            score = 0.0
            for label in labels:
                # calculate the score for each of the data split
                p = [row[-1] for row in group].count(label) / group_size
                score += p * p
            # weight the group score by its relative size
            gini += (1.0 - score) * (group_size / total)
        return gini
    
    # Select the best split point for a dataset
    def findBestSplit(self,dataset):
        # list pf different labels in the last column of the data
        labels = list(set(row[-1] for row in dataset))
        b_index, b_value, b_score, b_groups = 999, 999, 999, None
        if self.n_features == None:
            for index in range(len(dataset[0])-1):
                for row in dataset:
                    groups = self.doRandomSplit(index, row[index], dataset)
                    gini = self.calGiniIndex(groups, labels)
                    if gini < b_score:
                        #When selecting the best split and using it as a new node for the tree we will
                        #store the index of the chosen attribute, the value of that attribute by 
                        #which to split and the two groups of data split by the chosen split point
                        b_index, b_value, b_score, b_groups = index, row[index], gini, groups
        else:
            features = list()
            while len(features) < self.n_features:
                index = randrange(len(dataset[0])-1)
                if index not in features:
                    features.append(index)
            for index in features:
                for row in dataset:
                    groups = self.test_split(index, row[index], dataset)
                    gini = self.calGiniIndex(groups, labels)
                    if gini < b_score:
                        #When selecting the best split and using it as a new node for the tree we will
                        #store the index of the chosen attribute, the value of that attribute by 
                        #which to split and the two groups of data split by the chosen split point
                        b_index, b_value, b_score, b_groups = index, row[index], gini, groups
        return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}
    
    # Create a terminal node value
    def leaf(self,group):
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)
    
    # Create child splits for a node or make leaf
    def split(self,node,depth):
        # the two groups of data split by the node are extracted for use
        left, right = node['groups']
        # check for a no split
        # if either the left group or the right group is empty then we assign it as the leaf node
        if not left or not right:
            node['left'] = node['right'] = self.leaf(left + right)
            return
        # check for max depth if the max depth is reached we agian create the leaf node
        if depth >= self.maxDepth:
            node['left'], node['right'] = self.leaf(left), self.leaf(right)
            return
        # We check if the the group of rows is too small . If it is too small then we again assign it as a leaf node
        if len(left) <= self.minSize:
            node['left'] = self.leaf(left)
        else:
        # if non of the conditions satisfy we recursively split the node in a similar fashion.
            node['left'] = self.findBestSplit(left)
            self.split(node['left'],depth+1)
        # We check if the number of rows in each group is too small include it as aleaf node
        if len(right) <= self.minSize:
            node['right'] = self.leaf(right)
        else:
        # if non of the above conditions satisfy we recursively split the data again
            node['right'] = self.findBestSplit(right)
            self.split(node['right'], depth+1)
     
    # Split a dataset based on an attribute and an attribute value
    def test_split(self,index, value, dataset):
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    # Build a decision tree
    def buildDecisionTree(self,train):
        root = self.findBestSplit(train)
        self.split(root, 1)
        #print("\n ***********Decision Tree***********\n")
        #print('Best Split for Feature%d < %.3f Gini=%.3f' % ((root['index']+1), root['value'], root['gini']))
        #print("\n")
        #self.printTree(root,self.maxDepth)
        return root
    
    # Make a prediction with a decision tree
    def predict(self,node, row):
        #We must check if a child node is either a terminal value to be returned as the prediction, 
        #or if it is a dictionary node containing another level of the tree to be considered.
        if row[node['index']] < node['value']:
            # checks if the object is an instance of another instance
            if isinstance(node['left'], dict):
                return self.predict(node['left'], row)
            else:
                return node['left']
        else:
            # checks if the object is an instance of another instance
            if isinstance(node['right'], dict):
                return self.predict(node['right'], row)
            else:
                return node['right']
            
    # Classification and Regression Tree Algorithm
    def getDecisionTree(self,train, test):
        tree = self.buildDecisionTree(train)
        predictions = list()
        for row in test:
            prediction = self.predict(tree, row)
            predictions.append(prediction)
        return(predictions)
    
    # Print a decision tree
    def printTree(self,node, depth=0,level=0):
        if isinstance(node, dict):
            print('%s%s%d%s[Feature %d < %.3f]' % ((depth*' ','Level ',level+1,':', (node['index']+1), node['value'])))
            self.printTree(node['left'], depth+1,level+1)
            self.printTree(node['right'], depth+1,level+1)
        else:
            print('%s[%s]' % (((depth+7)*' ', node)))

In [66]:
# Split a dataset into n partitions so that we can calculate the cost based on different splits
def doCrossValidation(dataset,partitions):
    splits = list()
    datasetCopy = list(dataset)
    chunkSize = len(datasetCopy) // partitions
    leftOver = len(datasetCopy) % partitions
    start = 0
    for i in range(partitions):
        if i < leftOver:
            end = start + chunkSize + 1
        else:
            end = start + chunkSize
        splits.append(datasetCopy[start:end])
        start = end
    return splits 

In [67]:
def convertToFloat(sequence):
    for item in sequence:
        try:
            yield float(item)
        except ValueError as e:
            yield item

In [72]:
accuracy = []
precision = []
recall = []
fmeasure = []
seed(11)

original_data = pd.read_csv('project3_dataset1.txt', delimiter="\t", header=None)
#print(original_data.iloc[0])
obj_df = original_data.select_dtypes(include=['object']).copy()
obj_df = obj_df.apply(LabelEncoder().fit_transform)
for col in obj_df:
    original_data[col] = obj_df[col]
#print(original_data.iloc[0])
dataset = original_data.values

simpleRFData = dataset
numPartitions = 10
maxDepth = 10
minimumSize = 1
sampleSize = 1.0
n_features = int(math.sqrt(len(dataset[0])-1))
n_trees = 5

np.random.seed(11)
np.random.shuffle(simpleRFData)
lent = len(dataset)
trrows = math.floor(0.8*lent)
train_set = simpleRFData[0:trrows]
test_set = simpleRFData[trrows:len(simpleRFData)-1]

rf = RandomForest(sampleSize, n_trees,n_features, maxDepth,numPartitions,minimumSize)
Accuracy,Precision,Recall,F1_Score = rf.run(train_set,test_set)

recall.append(Recall)
precision.append(Precision)
fmeasure.append(F1_Score)
accuracy.append(Accuracy)

mean_accuracy = np.mean(accuracy)
mean_precision = np.mean(precision)
mean_recall = np.mean(recall)
mean_fmeasure = np.mean(fmeasure)

print('\n \nTrees: %d' % n_trees)
print("accuracy is : ",mean_accuracy)
print("precision is : ",mean_precision)
print("recall is : ",mean_recall)
print("fmeasure is : ",mean_fmeasure)



 
Trees: 5
accuracy is :  94.69026548672566
precision is :  0.972972972972973
recall is :  0.8780487804878049
fmeasure is :  0.923076923076923


# Random Forest using KFold

In [73]:
accuracy = []
precision = []
recall = []
fmeasure = []
seed(11)

numPartitions = 10
maxDepth = 10
minimumSize = 1
sampleSize = 1.0
n_features = int(math.sqrt(len(dataset[0])-1))
n_trees = 5

kfoldRFData = dataset
np.random.seed(11)
np.random.shuffle(kfoldRFData)
folds = doCrossValidation(kfoldRFData,numPartitions)

krf = RandomForest(sampleSize, n_trees,n_features, maxDepth,numPartitions,minimumSize)

for index,fold in enumerate(folds):
    print("\n\n********Fold "+str(index+1)+"*********")
    ktrain_set = list(folds)
    ktrain_set.pop(index)
    ktrain_set = sum(ktrain_set, [])
    ktest_set = fold
    Accuracy,Precision,Recall,F1_Score = krf.run(train_set,test_set)
    recall.append(Recall)
    precision.append(Precision)
    fmeasure.append(F1_Score)
    accuracy.append(Accuracy)
    print('\nTrees: %d' % n_trees)
    print("accuracy is : ",Accuracy)
    print("precision is : ",Precision)
    print("recall is : ",Recall)
    print("fmeasure is : ",F1_Score)
    
mean_accuracy = np.mean(accuracy)
mean_precision = np.mean(precision)
mean_recall = np.mean(recall)
mean_fmeasure = np.mean(fmeasure)

print("\n \nMean accuracy is : ",mean_accuracy)
print("Mean precision is : ",mean_precision)
print("Mean recall is : ",mean_recall)
print("Mean fmeasure is : ",mean_fmeasure)



********Fold 1*********

Trees: 5
accuracy is :  95.57522123893806
precision is :  0.9230769230769231
recall is :  0.9473684210526315
fmeasure is :  0.935064935064935


********Fold 2*********

Trees: 5
accuracy is :  96.46017699115043
precision is :  0.9722222222222222
recall is :  0.9210526315789473
fmeasure is :  0.9459459459459458


********Fold 3*********

Trees: 5
accuracy is :  96.46017699115043
precision is :  1.0
recall is :  0.8947368421052632
fmeasure is :  0.9444444444444444


********Fold 4*********

Trees: 5
accuracy is :  94.69026548672566
precision is :  0.9
recall is :  0.9473684210526315
fmeasure is :  0.9230769230769231


********Fold 5*********

Trees: 5
accuracy is :  96.46017699115043
precision is :  0.9722222222222222
recall is :  0.9210526315789473
fmeasure is :  0.9459459459459458


********Fold 6*********

Trees: 5
accuracy is :  96.46017699115043
precision is :  0.9722222222222222
recall is :  0.9210526315789473
fmeasure is :  0.9459459459459458


********F