In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from scipy.stats import mode

The dataset can be downloaded from here:

https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center

In [2]:
df = pd.read_csv(r".\dataset\transfusion.data",sep=',')

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [3]:
#Converting the data to numpy array.

data = np.array(df)
indices_list = [i for i in range(data.shape[0])]

## Create SubSample 

In [4]:
def Create_Subsamples(data,indices_list,ratio=1.0):
    
    #find the number of examples in a sample
    n_examples_per_samples = round(len(data)*ratio)
    
    # Randomly select n_examples_per_samples from the data
    sample_indices = random.sample(range(data.shape[0]),n_examples_per_samples)
    
    sample = data[sample_indices]
    
    return sample

In [5]:
# test this method for finding the mean of the data. 
test_data = np.array([[random.randrange(10,19)] for i in range(10)])

print("sample:",test_data)

True_mean = np.mean(test_data)
print("True Mean:",True_mean)

sample_sizes = random.sample(range(1,11),5)
ratio = 0.30
test_indices = [i for i in range(len(test_data))]

for i in sample_sizes:
    
    samples_mean = []
    
    for i in range(i):
        
        sample = Create_Subsamples(test_data,test_indices,ratio)
        samples_mean.append(np.mean(sample))
        
    print("For Sample Size of",i,"the mean of all samples is:",np.mean(samples_mean))

sample: [[11]
 [15]
 [13]
 [14]
 [10]
 [18]
 [10]
 [15]
 [14]
 [18]]
True Mean: 13.8
For Sample Size of 2 the mean of all samples is: 14.0
For Sample Size of 6 the mean of all samples is: 14.047619047619047
For Sample Size of 4 the mean of all samples is: 13.333333333333332
For Sample Size of 7 the mean of all samples is: 13.541666666666668
For Sample Size of 1 the mean of all samples is: 14.5


# Build the decision Tree

In [6]:
def split_data(value,column_index,data):
    left = data[data[:,column_index] <= value]
    right = data[data[:,column_index] > value]
    
    return left,right

In [7]:
def Gini_Index(groups,classes):
    total_samples = sum(len(group) for group in groups)
    gini_index = 0
    for group in groups:
        size = len(group)
        score = 0
        if size == 0:
            continue
        for class_label in classes:
            p = sum(group[:,-1]==class_label)/size
            score += p*p
        
        gini_index += (1 - score)*(size/total_samples)
    
    return gini_index

In [8]:
def getOptimalSplit_value(data):
    classes = np.unique(data[:,-1])
    best_index,best_gini,best_splitval,best_group = 9999,9999,9999,None
    for index in range(data.shape[1] - 1):
        for row in data:
            groups = split_data(row[index],index,data)
            gini_index = Gini_Index(groups,classes)
            if gini_index < best_gini:
                best_index,best_gini,best_splitval,best_group = index,gini_index,row[index],groups
    
    return {
        "index":best_index,
        "gini":best_gini,
        "value":best_splitval,
        "group":best_group
           }

In [9]:
# This can be used to return the most probable value that the node can return based what class type data does it have most
def terminal_nodes(groups):
    return int(mode(groups[:,-1])[0])

In [10]:
# Build the tree recursively 
def Recursive_split(node,depth,maxdepth,min_size):
    left,right = node['group']
    
    del(node['group'])
    
    if (left.size == 0) or (right.size == 0):
        array = np.array(list(left)+list(right))
        node['left'] = node['right'] = terminal_nodes(array)
        return
    
    if depth>=maxdepth:
        node['left'],node['right'] = terminal_nodes(left),terminal_nodes(right)
        return
    
    if len(left)<=min_size:
        node['left'] = terminal_nodes(left)
        
    else:
        node['left'] = getOptimalSplit_value(left)
        Recursive_split(node['left'],depth+1,maxdepth,min_size)
    
    if len(right)<=min_size:
        node['right'] = terminal_nodes(right)
        
    else:
        node['right'] = getOptimalSplit_value(right)
        Recursive_split(node['right'],depth+1,maxdepth,min_size)

In [11]:
def build_tree(data,max_depth,min_size):
    root = getOptimalSplit_value(data)
    Recursive_split(root,1,max_depth,min_size)
    return root

# Build Bagging Trees

In [12]:
def Build_Bagging_trees(data,indices_list,max_depth,min_size,ratio,ntrees):
    trees = []
    for i in range(ntrees):
        sample_data = Create_Subsamples(data,indices_list,ratio)
        tree = build_tree(sample_data,max_depth,min_size)
        trees.append(tree)
    
    return trees

In [13]:
max_depth = 5
min_size = 7
ratio = 0.40
n_trees = 5
trees = Build_Bagging_trees(data,indices_list,max_depth,min_size,ratio,n_trees)

## Predictions

In [14]:
def Predict(node,test_row):
    if test_row[node['index']] < node['value']:
        if isinstance(node['left'],dict):
            return Predict(node['left'],test_row)
        else:
            return node['left']
    else:
        if isinstance(node['right'],dict):
            return Predict(node['right'],test_row)
        else:
            return node['right']

In [15]:
def Get_Prediction(tree,data):
    Predictions = []
    for row in data:
        Prediction = Predict(tree,row)
        Predictions.append(Prediction)
    return np.array(Predictions)

In [16]:
def Bagging_predict(trees,test_data):
    Predictions_results = np.ones((len(test_data),1))
    for tree in trees:
        predictions = Get_Prediction(tree,test_data)
        Predictions_results = np.append(Predictions_results,predictions.reshape(len(predictions),1),axis=1)
    
    final_predictions_trimmed = Predictions_results[:,1:]
    
    prediction = mode(final_predictions_trimmed,axis=1)[0]
    
    return prediction

In [17]:
p = Bagging_predict(trees,data)

#Calculate the accuracy on the entire dataset
y = data[:,-1].reshape(len(data),1)

print("The accuracy on the entire dataset is given by:",np.mean(p == y)*100)

The accuracy on the entire dataset is given by: 74.59893048128342


## Finding Optimal values 

In this part i will be determining the optmial values for the number of trees using a validation dataset.

In the decision tree module(please refer to the Jupyter Notebook named Decision trees), i found the best accuracy occurs when

**max_depth** = 2, 
**min_size** = 5

In [18]:
train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.7*len(df)), int(.85*len(df))])

train, validate, test = np.array(train),np.array(validate),np.array(test)

when n_trees = 1, then its nothing but decision tree.


In [19]:
n_trees_set = [i for i in range(2,6)]

In [25]:
def Optimal_tree_number(train,validate,n_tree_set,max_depth,min_size):
    
    #make a list of indices in the train set
    indices_list = [i for i in range(train.shape[0])]
    
    # separate the labels
    y = validate[:,-1].reshape(len(validate),1)
    
    opt_no_trees = 999
    accuracy = -100
    
    for n_trees in n_tree_set:
        
        #build the trees
        trees = Build_Bagging_trees(train,indices_list,max_depth,min_size,ratio,n_trees)
                    
        #Predict the values on the validation set            
        p = Bagging_predict(trees,validate)
        
        #Calculate the accuracy model
        acc = np.mean(p == y)*100
        print("The accuracy with",n_trees," trees on the validation dataset is given by:",acc)
        
        if acc > accuracy:
            opt_no_trees,accuracy = n_trees,acc
            
    
    return opt_no_trees

In [21]:
max_depth = 2
min_size = 5
n_trees = Optimal_tree_number(train,validate,n_trees_set,max_depth,min_size)

print("The optimal number of trees are:",n_trees)

The accuracy with 2  trees on the validation dataset is given by: 81.25
The accuracy with 3  trees on the validation dataset is given by: 81.25
The accuracy with 4  trees on the validation dataset is given by: 85.71428571428571
The accuracy with 5  trees on the validation dataset is given by: 83.92857142857143
The optimal number of trees are: 4


In [24]:
#test the result on the test set.

# Build the model for max_depth = 2, min_size = 5 and n_trees = 4 with 0.40 ratio

max_depth = 2
min_size = 5
n_trees = 4
ratio = 0.40
indices_list = [i for i in range(train.shape[0])]
trees = Build_Bagging_trees(train,indices_list,max_depth,min_size,ratio,n_trees)

#Predict the values on the validation set            
p = Bagging_predict(trees,test)

y = test[:,-1].reshape(len(test),1)

#Calculate the accuracy model
acc = np.mean(p == y)*100
print("The accuracy with",n_trees,"trees on the Test dataset is given by:",acc)

The accuracy with 4 trees on the Test dataset is given by: 80.53097345132744
