In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
spam_data=pd.read_csv("https://web.stanford.edu/~hastie/ElemStatLearn//datasets/spam.data",sep=' ',header=None)
name_list=[]
for i in range(len(spam_data.columns)-1):
    i_str= str(i)
    name_list.append("A"+i_str)
name_list.append("label")
spam_data.columns=name_list
spam_data

Unnamed: 0,A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A48,A49,A50,A51,A52,A53,A54,A55,A56,label
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


### Function Definitions ###

In [3]:
def Is_Pure(data):
    
    label_column = data[:, -1] #extracting only the labels
    unique_labels = np.unique(label_column)
    if len(unique_labels) == 1:
        return True
    else:
        return False

In [4]:
def ClassForPureData(data):  #This function is used to return the class label of the pure data set or it can also be used when we need to pick majority class from a data
    
    label_column = data[:, -1] 
    classlabels,classcounts=np.unique(label_column,return_counts=True)
    majorityclassindex= np.argmax(classcounts)
    classi= classlabels[majorityclassindex]

    return classi

In [5]:
def splits(data,ran_ss):
    
    splits = {} #creating an empty dictionary to store the splits with key of the dictionary as attribute index and values as various splits
    row,col = data.shape #to get the number of columns to iterate over the columns
    
    column_indices= list(range(col-1)) #column indices as list
    
    if ran_ss and ran_ss<=len(column_indices):
        column_indices= random.sample(population=column_indices, k=ran_ss)
        
    for i in column_indices:        # excluding the last column which is the quality label
        splits[i] = []
        tot_col = data[:, i] #to extract the total column of ith attribute
        uni_val = np.unique(tot_col)  #to weed out the repeated or duplicate values of the column
        splits[i]=uni_val
    
    return splits

In [6]:
def split_data(data, column_to_split, value_to_split):
    
    split_column_values = data[:, column_to_split]

    left_data = data[split_column_values <= value_to_split]
    right_data = data[split_column_values > value_to_split]
    
    return left_data, right_data

In [7]:
def entropy(data):
    
    label_column = data[:, -1]
    classlabels, classcounts = np.unique(label_column, return_counts=True) #extract the counts of classes in dataset
    
    #numpy does element wise operations. So probabilities and entropy can be calculated as
    probab = classcounts/ classcounts.sum()
    entropy = sum(probab * -np.log2(probab))
     
    return entropy

In [8]:
def weighted_entropy(ldata, rdata):
    
    tot_data = len(ldata) + len(rdata)
    prob_ldata = len(ldata) / tot_data
    prob_rdata = len(rdata) / tot_data

    weigh_entro =  (prob_ldata * entropy(ldata) 
                      + prob_rdata * entropy(rdata))
    
    return weigh_entro

In [9]:
def best_split_entropy(data, potential_splits):
    
    arbit_entropy = 100
    global best_split_column
    global best_split_value
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_left, data_right = split_data(data, column_to_split=column_index, value_to_split=value)
            presentweigh_entropy = weighted_entropy(data_left, data_right)

            if presentweigh_entropy <= arbit_entropy:
                arbit_entropy = presentweigh_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

In [10]:
def dtree_algo_entropy(df, counter=0,ran_ss=None):
    
    # data preparations
    min_samples=7
    max_depth=20
    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df           
    
    
    # base cases
    if Is_Pure(data) or (len(data) < min_samples) or (counter == max_depth):
        classification = ClassForPureData(data)
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = splits(data,ran_ss)
        split_column, split_value = best_split_entropy(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        # instantiate sub-tree
        feature_name = COLUMN_HEADERS[split_column]
        question = "{} <= {}".format(feature_name, split_value)
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = dtree_algo_entropy(data_below, counter, ran_ss)
        no_answer = dtree_algo_entropy(data_above, counter, ran_ss)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base cases).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

In [11]:
def classify_datapoint(data_point, model):  #This function gets input as each data point(i.e all the 11 features) and says the classification
    ques = list(model.keys())[0]  #The tree is basically a dictionary with contains of a key and value basically. Here we are picking the first key which will be the dictionary
    col_name, operator, value = ques.split()

    if data_point[col_name] <= float(value):
        ans = model[ques][0]                   #note: general form of tree is tree={question,[positive_answer,negative_answer]}
    else:
        ans = model[ques][1]

    if isinstance(ans, dict): #checking if we narrow down at a class label or again a question in the form of dictionary
        residual_tree = ans
        return classify_datapoint(data_point, residual_tree)
    else:
        return ans

In [12]:
def decision_tree_predictions(test_df, model):
    predictions=test_df.apply(classify_datapoint, args=(model,),axis=1)
    return predictions

In [13]:
def calculate_accuracy(predictions, labels):
    predictions_correct = predictions == labels
    accuracy = predictions_correct.mean()
    
    return accuracy

### Functions definitions pertaining to Random Forest

In [14]:
def bootstrapping(train_df, n_bootstrap):
    index_list = train_df.index.tolist()
    bootstra_indices = random.sample(population=index_list, k=n_bootstrap)
    df_bootstrapped = train_df.loc[bootstra_indices]
    df_oob = train_df.drop(bootstra_indices)
    #print(df_bootstrapped.info(),df_oob.info())
    return df_bootstrapped,df_oob

In [15]:
def random_forest_algorithm(train_df, n_trees, n_bootstrap, n_features):
    forest = []
    for i in range(n_trees):
        df_bootstrapped,df_oob= bootstrapping(train_df, n_bootstrap)
        tree = dtree_algo_entropy(df_bootstrapped, ran_ss=n_features)
        forest.append(tree)
    
    return forest,df_oob

In [16]:
def random_forest_predictions(test_df, forest):
    df_predictions = {}
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(test_df, model=forest[i])
        df_predictions[column_name] = predictions

    df_predictions = pd.DataFrame(df_predictions)
    random_forest_predictions = df_predictions.mode(axis=1)[0]
    
    return random_forest_predictions

In [17]:
def splitTrainandTest(df, testdatasize):

    testdatasize = round(testdatasize * len(df))

    indices = df.index.tolist()
    test_points = random.sample(population=indices, k=testdatasize)

    test_set = df.loc[test_points]
    train_set = df.drop(test_points)
    
    return train_set, test_set

In [18]:
random.seed(0)
train_d,test_d= splitTrainandTest(spam_data, testdatasize=0.3)

### Model building and Testing

In [20]:
#m=20

forest1, df_oob1 = random_forest_algorithm(train_d, n_trees=5, n_bootstrap=1000, n_features=20)
predictions1 = random_forest_predictions(test_d, forest1)
oob_predictions1 = random_forest_predictions(df_oob1, forest1)
accuracy1 = calculate_accuracy(predictions1, test_d.label)
oob_accuracy1 = calculate_accuracy(oob_predictions1, df_oob1.label)
print("Accuracy using own implementation of random forest with m=20: ",accuracy1)
print("OOB error using own implementation of random forest with m=20: ",1-oob_accuracy1)

#Splitting data to pass it to sklearn function
X_train1 = train_d.iloc[:, :-1].values
Y_train1= train_d.iloc[:, -1].values.reshape(-1,1)
X_test1 = test_d.iloc[:, :-1].values
Y_test1= test_d.iloc[:, -1].values.reshape(-1,1)

#Random forest using sklearn library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
model1=RandomForestClassifier(n_estimators=5,criterion='entropy',max_depth=20, min_samples_split=7,max_features=20) #bootstrap default as True
model1.fit(X_train1,Y_train1.ravel())
Y_pred=model1.predict(X_test1)
rs=recall_score(Y_test1,Y_pred)
skscore=model1.score(X_test1,Y_test1)
print("Accuracy using sklearn implementation of random forest with m=20: ",skscore)
print("Sensitivity (a.k.a Recall) with m=20: ",rs)

Accuracy using own implementation of random forest with m=20:  0.9231884057971015
OOB error using own implementation of random forest with m=20:  0.047276001800990564
Accuracy using sklearn implementation of random forest with m=20:  0.9355072463768116
Sensitivity (a.k.a Recall) with m=20:  0.9111498257839721


In [21]:
#m=25

forest1, df_oob1 = random_forest_algorithm(train_d, n_trees=5, n_bootstrap=1000, n_features=25)
predictions1 = random_forest_predictions(test_d, forest1)
oob_predictions1 = random_forest_predictions(df_oob1, forest1)
accuracy1 = calculate_accuracy(predictions1, test_d.label)
oob_accuracy1 = calculate_accuracy(oob_predictions1, df_oob1.label)
print("Accuracy using own implementation of random forest with m=25: ",accuracy1)
print("OOB error using own implementation of random forest with m=25: ",1-oob_accuracy1)

#Splitting data to pass it to sklearn function
X_train1 = train_d.iloc[:, :-1].values
Y_train1= train_d.iloc[:, -1].values.reshape(-1,1)
X_test1 = test_d.iloc[:, :-1].values
Y_test1= test_d.iloc[:, -1].values.reshape(-1,1)

#Random forest using sklearn library
from sklearn.ensemble import RandomForestClassifier
model1=RandomForestClassifier(n_estimators=5,criterion='entropy',max_depth=20, min_samples_split=7,max_features=25) #bootstrap default as True
model1.fit(X_train1,Y_train1.ravel())
Y_pred=model1.predict(X_test1)
rs=recall_score(Y_test1,Y_pred)
skscore=model1.score(X_test1,Y_test1)
print("Accuracy using sklearn implementation of random forest with m=25: ",skscore)
print("Sensitivity (a.k.a Recall) with m=25: ",rs)

Accuracy using own implementation of random forest with m=25:  0.9173913043478261
OOB error using own implementation of random forest with m=25:  0.048626744709590275
Accuracy using sklearn implementation of random forest with m=25:  0.9282608695652174
Sensitivity (a.k.a Recall) with m=25:  0.9041811846689896


In [22]:
#m=30

forest1, df_oob1 = random_forest_algorithm(train_d, n_trees=5, n_bootstrap=1000, n_features=30)
predictions1 = random_forest_predictions(test_d, forest1)
oob_predictions1 = random_forest_predictions(df_oob1, forest1)
accuracy1 = calculate_accuracy(predictions1, test_d.label)
oob_accuracy1 = calculate_accuracy(oob_predictions1, df_oob1.label)
print("Accuracy using own implementation of random forest with m=30: ",accuracy1)
print("OOB error using own implementation of random forest with m=30: ",1-oob_accuracy1)

#Splitting data to pass it to sklearn function
X_train1 = train_d.iloc[:, :-1].values
Y_train1= train_d.iloc[:, -1].values.reshape(-1,1)
X_test1 = test_d.iloc[:, :-1].values
Y_test1= test_d.iloc[:, -1].values.reshape(-1,1)

#Random forest using sklearn library
from sklearn.ensemble import RandomForestClassifier
model1=RandomForestClassifier(n_estimators=5,criterion='entropy',max_depth=20, min_samples_split=7,max_features=30) #bootstrap default as True
model1.fit(X_train1,Y_train1.ravel())
Y_pred=model1.predict(X_test1)
rs=recall_score(Y_test1,Y_pred)
skscore=model1.score(X_test1,Y_test1)
print("Accuracy using sklearn implementation of random forest with m=30: ",skscore)
print("Sensitivity (a.k.a Recall) with m=30: ",rs)

Accuracy using own implementation of random forest with m=30:  0.9181159420289855
OOB error using own implementation of random forest with m=30:  0.05042773525438993
Accuracy using sklearn implementation of random forest with m=30:  0.922463768115942
Sensitivity (a.k.a Recall) with m=30:  0.8902439024390244


In [23]:
#m=35

forest1, df_oob1 = random_forest_algorithm(train_d, n_trees=5, n_bootstrap=1000, n_features=35)
predictions1 = random_forest_predictions(test_d, forest1)
oob_predictions1 = random_forest_predictions(df_oob1, forest1)
accuracy1 = calculate_accuracy(predictions1, test_d.label)
oob_accuracy1 = calculate_accuracy(oob_predictions1, df_oob1.label)
print("Accuracy using own implementation of random forest with m=35: ",accuracy1)
print("OOB error using own implementation of random forest with m=35: ",1-oob_accuracy1)

#Splitting data to pass it to sklearn function
X_train1 = train_d.iloc[:, :-1].values
Y_train1= train_d.iloc[:, -1].values.reshape(-1,1)
X_test1 = test_d.iloc[:, :-1].values
Y_test1= test_d.iloc[:, -1].values.reshape(-1,1)

#Random forest using sklearn library
from sklearn.ensemble import RandomForestClassifier
model1=RandomForestClassifier(n_estimators=5,criterion='entropy',max_depth=20, min_samples_split=7,max_features=35) #bootstrap default as True
model1.fit(X_train1,Y_train1.ravel())
Y_pred=model1.predict(X_test1)
rs=recall_score(Y_test1,Y_pred)
skscore=model1.score(X_test1,Y_test1)
print("Accuracy using sklearn implementation of random forest with m=35: ",skscore)
print("Sensitivity (a.k.a Recall) with m=35: ",rs)

Accuracy using own implementation of random forest with m=35:  0.9195652173913044
OOB error using own implementation of random forest with m=35:  0.04592525889239085
Accuracy using sklearn implementation of random forest with m=35:  0.9326086956521739
Sensitivity (a.k.a Recall) with m=35:  0.9111498257839721


In [24]:
#m=40

forest1, df_oob1 = random_forest_algorithm(train_d, n_trees=5, n_bootstrap=1000, n_features=40)
predictions1 = random_forest_predictions(test_d, forest1)
oob_predictions1 = random_forest_predictions(df_oob1, forest1)
accuracy1 = calculate_accuracy(predictions1, test_d.label)
oob_accuracy1 = calculate_accuracy(oob_predictions1, df_oob1.label)
print("Accuracy using own implementation of random forest with m=40: ",accuracy1)
print("OOB error using own implementation of random forest with m=40: ",1-oob_accuracy1)

#Splitting data to pass it to sklearn function
X_train1 = train_d.iloc[:, :-1].values
Y_train1= train_d.iloc[:, -1].values.reshape(-1,1)
X_test1 = test_d.iloc[:, :-1].values
Y_test1= test_d.iloc[:, -1].values.reshape(-1,1)

#Random forest using sklearn library
from sklearn.ensemble import RandomForestClassifier
model1=RandomForestClassifier(n_estimators=5,criterion='entropy',max_depth=20, min_samples_split=7,max_features=40) #bootstrap default as True
model1.fit(X_train1,Y_train1.ravel())
Y_pred=model1.predict(X_test1)
rs=recall_score(Y_test1,Y_pred)
skscore=model1.score(X_test1,Y_test1)
print("Accuracy using sklearn implementation of random forest with m=40: ",skscore)
print("Sensitivity (a.k.a Recall) with m=40: ",rs)

Accuracy using own implementation of random forest with m=40:  0.9239130434782609
OOB error using own implementation of random forest with m=40:  0.04547501125619091
Accuracy using sklearn implementation of random forest with m=40:  0.922463768115942
Sensitivity (a.k.a Recall) with m=40:  0.9006968641114983
