In [27]:
import numpy as np  # to calculate mean and standard deviation
import pandas as pd  # to load and manipulate data
import matplotlib.pyplot as plt # to draw graphs
import seaborn as sns
import random
from pprint import pprint

# Import data
#----------------------Iris dataset------------------------
data_iris = pd.read_csv("iris.data",header = None)   # reading the file and returning a dataframe
data_iris.columns = ["Sepal_Length","Sepal_Width","Petal_Length","Petal_Width","label"]    # here slen=sepal length, siwd= sepal width, plen= petal length, pwid= petal width and class


#----------------------Titanic dataset------------------------
data_titanic = pd.read_csv("train.csv")             #reading the titanic dataset in data_titanic
data_titanic["Label"] = data_titanic.Survived       #Data preparation
data_titanic = data_titanic.drop(["PassengerId","Survived","Name","Ticket","Cabin"],axis = 1)

#handling missing values 
median_age = data_titanic.Age.median()
mode_embarked = data_titanic.Embarked.mode()[0]
data_titanic = data_titanic.fillna({"Age":median_age, "Embarked":mode_embarked})


#----------------------Bike Sharing dataset------------------------
data_bike = pd.read_csv("bike.csv", parse_dates=["dteday"])       #reading the bike dataset in data_bike
data_bike = data_bike.drop(["instant","casual","registered"], axis=1) #Data preparation
data_bike = data_bike.rename({"dteday":"date"},axis=1)

date_column = data_bike.date

data_bike["day_of_year"] = date_column.dt.dayofyear
data_bike["day_of_month"] = date_column.dt.day

data_bike["quarter"] = date_column.dt.quarter
data_bike["week"] = date_column.dt.week

data_bike["is_month_end"] = date_column.dt.is_month_end
data_bike["is_month_start"] = date_column.dt.is_month_start
data_bike["is_quarter_end"] = date_column.dt.is_quarter_end
data_bike["is_quarter_start"] = date_column.dt.is_quarter_start
data_bike["is_year_end"] = date_column.dt.is_year_end
data_bike["is_year_start"] = date_column.dt.is_year_end

data_bike = data_bike.set_index("date")

data_bike["label"] = data_bike.cnt
data_bike = data_bike.drop("cnt", axis=1)

#----------------------Servo dataset------------------------
data_servo=pd.read_csv("servo.data", sep=',', names=["motor","screw","p_gain","v_gain","label"])   #reading the servo dataset in data_servo

# this function is used to split the dataset into train and test, and takes 2 arguments: the dataframe and the desired size of the test data
def train_and_test(df,test_size):
       
    to_list = df.index.tolist()             # convert the indexes of the dataframe to a list
    test_data = random.sample(to_list,test_size)     # randomly sample k indices for test data
    
    test_df = df.loc[test_data]
    train_df = df.drop(test_data)

    return train_df,test_df

# This function checks how homogenous is the data and returns a boolean value accordingly

def homogeneity(data):
    pred_col = data[:,-1]           # take just the label column which is the last(-1) in this case
    unique = np.unique(pred_col)   # taking only the unique values 

    if len(unique) == 1:
        return True
    else:
        return False
    
#This function classifies the data according to the labels

def terminal_node(data, ml_task):
    
    pred_col = data[:,-1]
    if ml_task == "classification":
        unique_class, counts = np.unique(pred_col, return_counts=True) #take the unique classes and their respective counts 

        index = counts.argmax()   #max count of which label? then stored in index
        node = unique_class[index]   #returns the class of that max value 
    
    #regression
    else:
        node = np.mean(pred_col)

    return node

def possible_splits(data, random_subspace=None):
    splits = {}
    _, n_columns = data.shape  # takes no of columns only
    index = list(range(n_columns - 1))
    
    if random_subspace and random_subspace <= len(index):
        index = random.sample(population=index, k=random_subspace)
    
    for i in index:   
        values = data[:,i]           # taking all the values from the iterated column 
        unique_values = np.unique(values)       # storing the unique values
        splits[i] = unique_values
                    
                    
    
    return splits

# this function splits the data on the specific column(attribute) and on a particular value
def splitting(data,column_to_split,value_to_splitat):
    split_column_values = data[:,column_to_split]                       #taking only the values of the column
    type_of_feature = feature_types[column_to_split]
    if type_of_feature == "categorical":
        data_below = data[split_column_values == value_to_splitat]
        data_above = data[split_column_values != value_to_splitat]   
        
    else:
        data_below = data[split_column_values <= value_to_splitat]    # compare the column values with our best/decided value
        data_above = data[split_column_values > value_to_splitat]    # and then split them accordingly.
        
        
    return data_below,data_above

#this function calculates the mean squared error for continuous values
def meansqerror(data):
    actual_values = data[:,-1]
    if len(actual_values) != 0:  # empty data
        prediction = np.mean(actual_values)
        mse = np.mean((actual_values - prediction)**2)
       
    else:
        mse = 0

    return mse

#this function calculates the entropy
def entropy(data):
    lbl_column = data[:,-1]
    _,counts = np.unique(lbl_column, return_counts=True)

    probability = counts / counts.sum()
    entropy1 = sum(probability * -np.log2(probability))
    
    return entropy1

#below given function calculates the chosen metric i.e mse or entropy for the whole data.
def calculate_overall_metric(data_below,data_above, metric_function):
    lb = len(data_below)
    la = len(data_above)
    data_points = lb + la

    p_data_below = lb / data_points
    p_data_above = la / data_points

    overall_metric = (p_data_below * metric_function(data_below)) + (p_data_above * metric_function(data_above))

    return overall_metric

#This function chooses the best split from the list of potential splits given to it on the basis of the metric function.
def best_split(data, potential_splits, ml_task):
    first_iteration = True   
    for col_index in potential_splits:
        for value in potential_splits[col_index]:
            data_below, data_above = splitting(data,col_index,value)
            if ml_task == "classification":
                current_metric = calculate_overall_metric(data_below,data_above,metric_function=entropy)
            #regression
            else:
                current_metric = calculate_overall_metric(data_below,data_above,metric_function=meansqerror)

            if first_iteration or current_metric <= best_overall_metric:
                first_iteration = False
                best_overall_metric = current_metric
                best_split_column = col_index
                best_split_value = value
    
    return best_split_column,best_split_value

#This function determines the type of feature whether categorical or continuous.
def determine_feature(df):
    feature_types = []
    threshold = 15
    
    for col in df.columns:
        unique_values = df[col].unique()
        example = unique_values[0]
        if (isinstance(example,str)) or (len(unique_values) <= threshold):
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")
            
    return feature_types


#This is the main decision tree algorithm which prints out the decision tree. It takes 6 parameters,
#out of which 4 are set to default. min samples are the number of leaf nodes and max dep is the number of subnodes.

def decision_tree_algorithm(df, ml_task, counter=0, min_sample = 2, max_dep = 5, random_subspace = None):
    
    #data preparation
    if counter == 0:
        global COLUMN_HEADERS, feature_types
        COLUMN_HEADERS = df.columns       #to take the actual names of columns
        feature_types = determine_feature(df)
        data = df.values
    
    else:
        data = df
    
    #base cases
    if (homogeneity(data)) or (len(data) < min_sample) or (counter == max_dep):
        node = terminal_node(data, ml_task)
        return node
    
    #recursive part
    else:
        counter += 1
    
        #helper function
        splits = possible_splits(data, random_subspace)
        split_column, split_value = best_split(data, splits, ml_task)
        data_below, data_above = splitting(data,split_column, split_value)
        
        #instantiate sub_tree
        feature_name = COLUMN_HEADERS[split_column]
        type_of_feature = feature_types[split_column]
        
        if type_of_feature == "continuous":
            question = "{} <= {}".format(feature_name,split_value)
        else:
            question = "{} = {}".format(feature_name,split_value)
        
        subtree = {question: []}
        
        #find answers
        if_yes = decision_tree_algorithm(data_below,ml_task, counter, min_sample, max_dep, random_subspace)
        if_no = decision_tree_algorithm(data_above,ml_task, counter, min_sample, max_dep, random_subspace)
        
        if if_yes == if_no:
            subtree = if_yes
        
        else:
            subtree[question].append(if_yes)
            subtree[question].append(if_no)
        
        return subtree
    
#this function gives the implementation of decision stumps i.e decision tree with only 1 subnode. 
def decision_stumps(df, ml_task):
    tree = decision_tree_algorithm(df, ml_task,max_dep = 1, random_subspace=None)
    
    return tree
    
#this function predicts any instance
def predict_example(example, tree):
    question = list(tree.keys())[0]
    feature, comp_op, value = question.split(" ")

    #ask question
    if comp_op == "<=":
        if example[feature] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    else:               # categorical feature
        if str(example[feature]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    #base case
    if not isinstance(answer,dict):
        return answer
    #recursive part
    else: 
        residual_tree = answer
        return predict_example(example,residual_tree)
    
    
#this function applies the predictions to all the decision tree.

def decision_tree_predictions(test_df, tree):
    predictions = test_df.apply(predict_example, args=(tree,), axis=1)
    return predictions

def calculate_accuracy(predictions, labels):
    predictions_correct = predictions == labels
    accuracy = predictions_correct.mean()
    
    return accuracy

# ideally r squared should be as close to 1 as possible
def r_squared(df, tree):
    labels = df.label
    mean = labels.mean()
    predictions = df.apply(predict_example, args=(tree,),axis=1)
    
    res = sum((labels - predictions) ** 2)
    tot = sum((labels - mean) ** 2)
    ans = 1 - res / tot
    
    return ans

def bootstrap_sampling(data, number):
    indices = np.random.randint(low=0, high=len(data), size= number)
    bootstrapped_data = data.iloc[indices]
    
    return bootstrapped_data


def baggingr(data, no_of_samples, no_of_bags):
    #bagged_data = {}
    dtree = []
    for i in range(no_of_bags):
        bootstrap_data = bootstrap_sampling(data, no_of_samples)
        tree = decision_tree_algorithm(data, "regression", max_dep=3)
        dtree.append(tree)
        
    return dtree
        
def baggingc(data, no_of_samples, no_of_bags):
    #bagged_data = {}
    dtree = []
    for i in range(no_of_bags):
        bootstrap_data = bootstrap_sampling(data, no_of_samples)
        tree = decision_tree_algorithm(bootstrap_data, "classification", max_dep=3)
        dtree.append(tree)
        
    return dtree

def random_forest_clas(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    forest = []
    for i in range(n_trees):
        df_bootstrapped = bootstrap_sampling(train_df, n_bootstrap)
        tree = decision_tree_algorithm(df_bootstrapped,"classification",0,dt_max_depth,n_features)
        forest.append(tree)
    
    return forest

def random_forest_reg(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    forest = []
    for i in range(n_trees):
        df_bootstrapped = bootstrap_sampling(train_df, n_bootstrap)
        tree = decision_tree_algorithm(df_bootstrapped,"regression",0,dt_max_depth,n_features)
        forest.append(tree)
    
    return forest

def rforest_predictions(test_df,forest):
    df_predictions = {}
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(test_df, tree=forest[i])
        df_predictions[column_name] = predictions
        
    df_predictions = pd.DataFrame(df_predictions)
    random_forest_predictions = df_predictions.mode(axis=1)[0]
   
    return random_forest_predictions


def calculate_r_squared_random(df, forest):
    labels = df.label
    mean = labels.mean()
    df_predictions = []
    for i in range(len(forest)):
        column_name = "tree_{}".format(i)
        predictions = decision_tree_predictions(df, tree=forest[i])
        ss_res = sum((labels - predictions) ** 2)
        ss_tot = sum((labels - mean) ** 2)
        r_squared = 1 - ss_res / ss_tot
        df_predictions.append(r_squared)
        
    aq = np.array(df_predictions).mean()

    return aq



        
if __name__ == '__main__':
            
    print("Available datasets are: Iris, Titanic, Bike, Servo.")
            
    iris_train, iris_test = train_and_test(data_iris, 20)
            
    titanic_train, titanic_test = train_and_test(data_titanic, 20)
            
    
    bike_train, bike_test = train_and_test(data_bike, 20)
            
    servo_train, servo_test = train_and_test(data_servo, 20)
                                    
    print(" 1. Decision Stumps \n 2. Decision Tree \n 3. Bagging \n 4. Random Forest")
    choice = int(input("Enter a choice:"))
    if choice == 1:
        ds = input("Which dataset would like the implementation of Decision Stumps for?")
        if ds == "Iris" or ds == "iris":
                    
            iris_stump = decision_stumps(iris_train, "classification")

            print("\n------------------------------Decision Stump For Iris------------------------------\n")
            pprint(iris_stump)

        elif ds =="Titanic" or ds == "titanic":

            titanic_stump = decision_stumps(titanic_train, "classification")

            print("\n------------------------------Decision Stump For Titanic------------------------------\n")
            pprint(titanic_stump)

        elif ds == "Bike" or ds == "bike":

            bike_stump = decision_stumps(bike_train, "regression")

            print("\n------------------------------Decision Stump For Bike Sharing------------------------------\n")
            pprint(bike_stump)

        else:

            servo_stump = decision_stumps(servo_train, "regression")

            print("\n------------------------------Decision Stump For Servo------------------------------\n")
            pprint(servo_stump)

    elif choice == 2:
        ds = input("Which dataset would like the implementation of Decision Trees for?")
        if ds == "Iris" or ds == "iris":
            iris_tree = decision_tree_algorithm(iris_train, "classification")
            iris_predictions = decision_tree_predictions(iris_test, iris_tree)
            tree_accuracy1 = calculate_accuracy(iris_predictions, iris_test.values[:,-1])

            print("\n------------------------------Decision Tree For Iris------------------------------\n")
            pprint(iris_tree)

            print("Accuracy for Decision Tree is :",tree_accuracy1)

        elif ds =="Titanic" or ds == "titanic":
            titanic_tree = decision_tree_algorithm(titanic_train, "classification")
            titanic_predictions = decision_tree_predictions(titanic_test, titanic_tree)
            tree_accuracy2 = calculate_accuracy(titanic_predictions, titanic_test.values[:,-1])

            print("\n------------------------------Decision Tree For Titanic------------------------------\n")
            pprint(titanic_tree)

            print("Accuracy for Decision Tree is :",tree_accuracy2)

        elif ds == "Bike" or ds == "bike":
            bike_tree = decision_tree_algorithm(bike_train, "regression", max_dep=4)
            tree_accuracy3 = r_squared(bike_test, bike_tree)

            print("\n------------------------------Decision Tree For Bike Sharing------------------------------\n")
            pprint(bike_tree)

            print("Accuracy for Decision Tree is :",tree_accuracy3)

        else:
            servo_tree = decision_tree_algorithm(servo_train, "regression")
            tree_accuracy4 = r_squared(servo_test, servo_tree)

            print("\n------------------------------Decision Tree For Servo------------------------------\n")
            pprint(servo_tree)

            print("Accuracy for Decision Tree is :",tree_accuracy4)
    elif choice == 3:
        ds = input("Which dataset would like the implementation of Bagging for?")
        if ds == "Iris" or ds == "iris":
            iris_bag = baggingc(iris_train, 25, 4)
        
            iris_predictions2 = rforest_predictions(iris_test, iris_bag)
            bag_accuracy1 = calculate_accuracy(iris_predictions2, iris_test.label)
        
            print("\n------------------------------Bagging For Iris------------------------------\n")
            pprint(iris_bag)
            print("Accuracy for Bagging is :",bag_accuracy1)
        
        elif ds =="Titanic" or ds == "titanic":
            titanic_bag = baggingc(titanic_train, 25, 4)
        
            titanic_predictions2 = rforest_predictions(titanic_test, titanic_bag)
            bag_accuracy2 = calculate_accuracy(titanic_predictions2, titanic_test.Label)
        
            print("\n------------------------------Bagging For Titanic------------------------------\n")
            pprint(titanic_bag)
            print("Accuracy for Bagging is :",bag_accuracy2)
        
        
        elif ds == "Bike" or ds == "bike":
            bike_bag = baggingr(bike_train, 25, 5)

            bag_accuracy3 = calculate_r_squared_random(bike_test, bike_bag)

            print("\n------------------------------Bagging For Bike Sharing------------------------------\n")
            pprint(bike_bag)
            print("Accuracy for Bagging is :",bag_accuracy3)

        else:
            servo_bag = baggingr(servo_train, 25, 4)

            bag_accuracy4 = calculate_r_squared_random(servo_test, servo_bag)

            print("\n------------------------------Bagging For Servo------------------------------\n")
            pprint(servo_bag)
            print("Accuracy for Bagging is :",bag_accuracy4)
            
    else:
        ds = input("Which dataset would like the implementation of Random Forest for?")
        if ds == "Iris" or ds == "iris":
            iris_forest = random_forest_clas(iris_train,3,30,3,5)

            iris_predictions3 = rforest_predictions(iris_test, iris_forest)
            forest_accuracy1 = calculate_accuracy(iris_predictions3, iris_test.label)

            print("\n------------------------------Random Forest For Iris------------------------------\n")
            pprint(iris_forest)
            print("Accuracy for Random Forest is :",forest_accuracy1)
        
        
        elif ds =="Titanic" or ds == "titanic":
            titanic_forest = random_forest_clas(titanic_train,3,30,3,5)

            titanic_predictions3 = rforest_predictions(titanic_test, titanic_forest)
            forest_accuracy2 = calculate_accuracy(titanic_predictions3, titanic_test.Label)


            print("\n------------------------------Random Forest For Titanic------------------------------\n")
            pprint(titanic_forest)
            print("Accuracy for Random Forest is :",forest_accuracy2)
            
        elif ds == "Bike" or ds == "bike":
            bike_forest = random_forest_reg(bike_train,6,30,3,5)

            forest_accuracy3 = calculate_r_squared_random(bike_test, bike_forest)

            print("\n------------------------------Random Forest For Bike Sharing------------------------------\n")
            pprint(bike_forest)
            print("Accuracy for Random Forest is :",forest_accuracy3)
        else:
            servo_forest = random_forest_reg(servo_train,3,30,3,5)
        

            forest_accuracy4 = calculate_r_squared_random(servo_test, servo_forest)

            print("\n------------------------------Random Forest For Servo------------------------------\n")
            pprint(servo_forest)
            print("Accuracy for Random Forest is :",forest_accuracy4)
        

            
            
            

Available datasets are: Iris, Titanic, Bike, Servo.
 1. Decision Stumps 
 2. Decision Tree 
 3. Bagging 
 4. Random Forest
Enter a choice:4
Which dataset would like the implementation of Random Forest for?bike

------------------------------Random Forest For Bike Sharing------------------------------

[{'atemp <= 0.39835': [{'windspeed <= 0.129987': [3443.0,
                                                  {'weekday = 0': [2466.6666666666665,
                                                                   1840.3333333333333]}]},
                       {'yr = 1': [{'hum <= 0.738333': [6834.142857142857,
                                                        4934.333333333333]},
                                   {'atemp <= 0.4564': [2534.5,
                                                        4528.571428571428]}]}]},
 {'yr = 1': [{'atemp <= 0.369938': [3587.5, {'week <= 36': [6754.0, 7919.2]}]},
             {'week <= 9': [1255.3333333333333,
                            {'week <