In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import mode
import random

The dataset can be downloaded from here:

https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center

In [2]:
df = pd.read_csv(r"dataset/transfusion.data",sep=',')
data = np.array(df)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [3]:
def Initialize_weights(length):
    w = np.array([(1.0/length) for i in range(length)]).reshape(length,1)
    return w

In [4]:
def split_data(value,column_index,data):
    left = data[data[:,column_index] <= value]
    right = data[data[:,column_index] > value]
    
    return left,right

In [5]:
def Gini_Index(groups,classes):
    total_samples = sum(len(group) for group in groups)
    gini_index = 0
    for group in groups:
        size = len(group)
        score = 0
        if size == 0:
            continue
        for class_label in classes:
            p = sum(group[:,-1]==class_label)/size
            score += p*p
        
        gini_index += (1 - score)*(size/total_samples)
    
    return gini_index

In [6]:
def getOptimalSplit_value(data):
    classes = np.unique(data[:,-1])
    best_index,best_gini,best_splitval,best_group = 9999,9999,9999,None
    for index in range(data.shape[1] - 1):
        for row in data:
            groups = split_data(row[index],index,data)
            gini_index = Gini_Index(groups,classes)
            if gini_index < best_gini:
                best_index,best_gini,best_splitval,best_group = index,gini_index,row[index],groups
    
    return {
        "index":best_index,
        "gini":best_gini,
        "value":best_splitval,
        "group":best_group
           }

## Build Stump

In [7]:
# This can be used to return the most probable value that the node can return based what class type data does it have most
def terminal_nodes(groups):
    return int(mode(groups[:,-1])[0])

In [8]:
def Build_Stump(data):
    root = getOptimalSplit_value(data)
    left,right = root['group']
    del(root['group'])
    
    if (left.size == 0) or (right.size == 0):
        array = np.array(list(left)+list(right))
        root['left'] = root['right'] = terminal_nodes(array)
    
    else:
        root['left'],root['right'] = terminal_nodes(left),terminal_nodes(right)
    
    return root

In [9]:
def Predict(node,test_row):
    if test_row[node['index']] < node['value']:
        if isinstance(node['left'],dict):
            return Predict(node['left'],test_row)
        else:
            return node['left']
    else:
        if isinstance(node['right'],dict):
            return Predict(node['right'],test_row)
        else:
            return node['right']

In [10]:
def Get_Prediction(tree,data):
    Predictions = []
    for row in data:
        Prediction = Predict(tree,row)
        Predictions.append(Prediction)
    return np.array(Predictions)

In [11]:
def errors(data,p,w):
    perror = np.zeros((len(data),1))
    perror[p!=data[:,-1]] = 1
    error = np.sum(perror * w)/(np.sum(w))
    stage = np.log((1-error)/error)
    perror[perror == 0] = -1
    
    return perror,stage

In [12]:
def update_weights(w,perror,stage):
    wdash = w*np.exp(perror*stage)
    wdash = wdash/np.sum(wdash)
    return wdash

In [13]:
def get_cumulative_freq(w):
    cumulative_freq = np.zeros((len(w),1))
    for i in range(len(w)):
        if i == 0:
            cumulative_freq[i] = 0
        else:
            cumulative_freq[i] = np.sum(w[:(i+1),])
    
    return cumulative_freq

In [14]:
def get_index_ofdata(number,cumulative_freq):
    for i in range(len(cumulative_freq)-1):
        if (cumulative_freq[i][0] <= number < cumulative_freq[i+1][0]):
            return i

In [15]:
def Create_dataset(length,dataset,cumulative_freq):
    newdataset = []
    data_list = np.ndarray.tolist(dataset)
    for i in range(length):
        number = np.random.random(1)[0]
        index = get_index_ofdata(number,cumulative_freq)
        newdataset.append(data_list[index])
    
    return np.array(newdataset)

In [17]:
def AdaBoost_Training(data,no_stumps):
    w = Initialize_weights(len(data))
    trees = []
    stages = []
    dataset = data
    for each in range(no_stumps):
        tree = Build_Stump(dataset)
        trees.append(tree)
        pred = Get_Prediction(tree,dataset)
        perror,stage = errors(data,pred,w)
        stages.append(stage)
        w = update_weights(w,perror,stage)
        cumulative_freq = get_cumulative_freq(w)
        dataset = Create_dataset(len(dataset),dataset,cumulative_freq)
    return trees,stages

In [18]:
no_stumps = 100
trees,stages = AdaBoost(data,no_stumps)

In [19]:
def AdaBoost_Predict(trees,data,stages,length):
    model_predictions = np.ones((length,1))
    for i in range(len(trees)):
        pred = Get_Prediction(trees[i],data)
        pred[pred==0] = -1
        model_pred = stages[i] * pred
        model_predictions = np.append(model_predictions,model_pred.reshape(length,1),axis=1)
    
    model_predictions = model_predictions[:,1:]
    final_pred = np.sign(np.sum(model_predictions,axis=1))
    
    final_pred[final_pred == -1] = 0
    
    return final_pred

In [20]:
pred = AdaBoost_Predict(trees,data,stages,len(data))

In [21]:
print("The accuracy on the Training data is given by:",np.mean(pred == data[:,-1])*100)

The accuracy on the Training data is given by: 66.97860962566845
