In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from scipy.stats import mode
from sklearn.model_selection import train_test_split

The dataset can be downloaded from here:

https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center

In [2]:
df = pd.read_csv(r".\dataset\transfusion.data",sep=',')

data = np.array(df)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [3]:
def Calc_Indices_Samples(data):
    sample_indices_list = [i for i in range(data.shape[0])]
    
    return sample_indices_list

def Calc_Indices_features(data):
    features_indices_list = [i for i in range(data.shape[1] - 1)]
    
    return features_indices_list

In [4]:
features_indices_list = Calc_Indices_features(data)
sample_indices_list = Calc_Indices_Samples(data)

### Create Sub Features and Sub Samples

In [5]:
def Create_SubFeatures(features_indices_list,n_features):
    features_list = np.unique(random.sample(features_indices_list,n_features))
    
    return features_list

In [6]:
def Create_Subsamples(data,indices_list,ratio=1.0):
    
    #find the number of examples in a sample
    n_examples_per_samples = round(len(data)*ratio)
    
    # Randomly select n_examples_per_samples from the data
    sample_indices = random.sample(range(data.shape[0]),n_examples_per_samples)
    
    sample = data[sample_indices]
    
    return sample

### Build Random Forest

In [7]:
def split_data(value,column_index,data):
    left = data[data[:,column_index] <= value]
    right = data[data[:,column_index] > value]
    
    return left,right

In [8]:
def Gini_Index(groups,classes):
    total_samples = sum(len(group) for group in groups)
    gini_index = 0
    for group in groups:
        size = len(group)
        score = 0
        if size == 0:
            continue
        for class_label in classes:
            p = sum(group[:,-1]==class_label)/size
            score += p*p
        
        gini_index += (1 - score)*(size/total_samples)
    
    return gini_index

In [9]:
def getOptimalSplit_value(data,n_features):
    classes = np.unique(data[:,-1])
    best_index,best_gini,best_splitval,best_group = 9999,9999,9999,None
    features_indices_list = Calc_Indices_features(data)
    sub_features = Create_SubFeatures(features_indices_list,n_features)
    for index in sub_features:
        for row in data:
            groups = split_data(row[index],index,data)
            gini_index = Gini_Index(groups,classes)
            if gini_index < best_gini:
                best_index,best_gini,best_splitval,best_group = index,gini_index,row[index],groups
    
    return {
        "index":best_index,
        "gini":best_gini,
        "value":best_splitval,
        "group":best_group
           }

In [10]:
# This can be used to return the most probable value that the node can return based what class type data does it have most
def terminal_nodes(groups):
    return int(mode(groups[:,-1])[0])

In [11]:
# Build the tree recursively 
def Recursive_split(node,depth,maxdepth,min_size,n_features):
    left,right = node['group']
    
    del(node['group'])
    
    if (left.size == 0) or (right.size == 0):
        array = np.array(list(left)+list(right))
        node['left'] = node['right'] = terminal_nodes(array)
        return
    
    if depth>=maxdepth:
        node['left'],node['right'] = terminal_nodes(left),terminal_nodes(right)
        return
    
    if len(left)<=min_size:
        node['left'] = terminal_nodes(left)
        
    else:
        node['left'] = getOptimalSplit_value(left,n_features)
        Recursive_split(node['left'],depth+1,maxdepth,min_size,n_features)
    
    if len(right)<=min_size:
        node['right'] = terminal_nodes(right)
        
    else:
        node['right'] = getOptimalSplit_value(right,n_features)
        Recursive_split(node['right'],depth+1,maxdepth,min_size,n_features)

In [12]:
def build_tree(data,max_depth,min_size,n_features):
    root = getOptimalSplit_value(data,n_features)
    Recursive_split(root,1,max_depth,min_size,n_features)
    return root

In [13]:
def Build_RF(data,indices_list,max_depth,min_size,ntrees,n_features):
    trees = []
    for i in range(ntrees):
        sample_data = Create_Subsamples(data,indices_list)
        tree = build_tree(sample_data,max_depth,min_size,n_features)
        trees.append(tree)
    
    return trees

In [14]:
max_depth = 3
min_size = 5
n_trees = 7
n_features = int(np.sqrt(len(features_indices_list)))
trees = Build_RF(data,sample_indices_list,max_depth,min_size,n_trees,n_features)

### Predictions

In [15]:
def Predict(node,test_row):
    if test_row[node['index']] < node['value']:
        if isinstance(node['left'],dict):
            return Predict(node['left'],test_row)
        else:
            return node['left']
    else:
        if isinstance(node['right'],dict):
            return Predict(node['right'],test_row)
        else:
            return node['right']

In [16]:
def Get_Prediction(tree,data):
    Predictions = []
    for row in data:
        Prediction = Predict(tree,row)
        Predictions.append(Prediction)
    return np.array(Predictions)

In [17]:
def RF_predict(trees,test_data):
    Predictions_results = np.ones((len(test_data),1))
    for tree in trees:
        predictions = Get_Prediction(tree,test_data)
        Predictions_results = np.append(Predictions_results,predictions.reshape(len(predictions),1),axis=1)
    
    final_predictions_trimmed = Predictions_results[:,1:]
    
    prediction = mode(final_predictions_trimmed,axis=1)[0]
    
    return prediction

In [18]:
p = RF_predict(trees,data)

#Calculate the accuracy on the entire dataset
y = data[:,-1].reshape(len(data),1)

print("The accuracy on the entire dataset is given by:",np.mean(p == y)*100)

The accuracy on the entire dataset is given by: 77.00534759358288


### Train & Test Splits

In [19]:
X_train, X_test, y_train, y_test = train_test_split(data[:,0:-1],data[:,-1],test_size=0.3,random_state=30)

In [20]:
max_depth = 2
min_size = 5
n_trees = 7
n_features = int(np.sqrt(len(features_indices_list)))

train_data = np.append(X_train,y_train.reshape(y_train.shape[0],1),axis=1)
test_data = np.append(X_test,y_test.reshape(y_test.shape[0],1),axis=1)

In [21]:
trees = Build_RF(data,sample_indices_list,max_depth,min_size,n_trees,n_features)

In [22]:
p = RF_predict(trees,train_data)

In [23]:
# Accuracy on the train data:
print("The Accuracy on the training set is:",np.mean(p == y_train)*100)

The Accuracy on the training set is: 75.52581261950286


In [24]:
#Predictions on the test data
p = RF_predict(trees,test_data)

# The accuracy on the test set
print("The Accuracy on the testing set is:",np.mean(p == y_test)*100)

The Accuracy on the testing set is: 77.77777777777779
