In [36]:
import pandas as pd
import numpy as np
from pprint import pprint
import random
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
pd.options.mode.chained_assignment = None
def check_purity(data):
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)
    if len(unique_classes) == 1:
        return True
    else:
        return False

def create_leaf(data, ml_task):
    label_column = data[:, -1]
    if ml_task == "regression":
        leaf = np.mean(label_column)
    # classfication
    else:
        unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
        index = counts_unique_classes.argmax()
        leaf = unique_classes[index]
    return leaf


def get_potential_splits(data, random_subspace):
    potential_splits = {}
    _, n_columns = data.shape
    column_indicis =list(range(n_columns-1))
    if random_subspace and random_subspace <= len(column_indicis):
        column_indicis = random.sample(population=column_indicis, k=random_subspace)

    for column_index in column_indicis:  # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)
        potential_splits[column_index] = unique_values

    return potential_splits


def split_data(data, split_column, split_value):
    split_column_values = data[:, split_column]

    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values > split_value]

    # feature is categorical
    else:
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]

    return data_below, data_above


def calculate_mse(data):
    actual_values = data[:, -1]
    if len(actual_values) == 0:  # empty data
        mse = 0
    else:
        prediction = np.mean(actual_values)
        mse = np.mean((actual_values - prediction) ** 2)
    return mse

def calculate_overall_metric(data_below, data_above, metric_function):
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n
    overall_metric = (p_data_below * metric_function(data_below)
                      + p_data_above * metric_function(data_above))
    return overall_metric

def class_counts(data):
    counts = {}
    for row in data:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts
def gini(data):
    counts = class_counts(data)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(data))
        impurity -= prob_of_lbl**2
    return impurity

def information_gain(left, right, current_uncertainty):
    p=float(len(left)) / (len(left)+len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

def determine_best_split(data, potential_splits, ml_task):
    first_iteration = True
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            if ml_task == "regression":
                current_overall_metric = calculate_overall_metric(data_below, data_above, metric_function=calculate_mse)

            # classification
            else:
                current_overall_metric = calculate_overall_metric(data_below, data_above,
                                                                  metric_function=gini)

            if first_iteration or current_overall_metric <= best_overall_metric:
                first_iteration = False
                best_overall_metric = current_overall_metric
                best_split_column = column_index
                best_split_value = value

    return best_split_column, best_split_value


def determine_type_of_feature(df):
    feature_types = []
    n_unique_values_treshold = 5
    for feature in df.columns:
        if feature != str(df.keys()[-1]):
            unique_values = df[feature].unique()
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")

    return feature_types


def decision_tree_algorithm(df, ml_task="classification", counter=0, min_samples=2, max_depth=5, random_subspace=None):
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS, FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df

        # base cases
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        leaf = create_leaf(data, ml_task)
        return leaf
    
    # recursive part
    else:
        counter += 1
        # helper functions
        potential_splits = get_potential_splits(data,random_subspace)
        split_column, split_value = determine_best_split(data, potential_splits, ml_task)
        data_below, data_above = split_data(data, split_column, split_value)
        # check for empty data
        if len(data_below) == 0 or len(data_above) == 0:
            leaf = create_leaf(data, ml_task)
            return leaf
        # determine question
        feature_name = COLUMN_HEADERS[split_column]
        type_of_feature = FEATURE_TYPES[split_column]
        if type_of_feature == "continuous":
            question = "{} <= {}".format(feature_name, split_value)
        # feature is categorical
        else:
            question = "{} = {}".format(feature_name, split_value)
        # instantiate sub-tree
        sub_tree = {question: []}
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, ml_task, counter, min_samples, max_depth,random_subspace)
        no_answer = decision_tree_algorithm(data_above, ml_task, counter, min_samples, max_depth,random_subspace)
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base case).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)

        return sub_tree
    
def predict_example(example, tree):
    if type(tree)==int or type(tree) == float or isinstance(tree,np.float64):
        return tree
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer

    # recursive part
    else:
        residual_tree = answer
        return predict_example(example, residual_tree)


def calculate_accuracy(df, tree):
    df["classification"] = df.apply(predict_example, args=(tree,), axis=1)
    df["classification_correct"] = df["classification"] == df.iloc[:,-1]
    accuracy = df["classification_correct"].mean()
    return accuracy

def score(predections,test):
    predections_correct = predections==test.iloc[:,-1]
    score = predections_correct.mean()
    return score

def grid_search(train_data,val_data,ml_task):
    i=0
    if ml_task=="regression":
        grid_search = {"max_depth": [], "min_samples": [], "MSE_train": [], "MSE_val": []}
        for max_depth in range(2, 10,2):
            for min_samples in range(2 ,10 ,2):
                tree = decision_tree_algorithm(train_data, ml_task="regression", max_depth=max_depth,
                                               min_samples=min_samples)

                train_pred = decision_tree_predictions(train_data, tree)
                MSE_train = metrics.mean_squared_error(train_data.iloc[:, -1], train_pred)
                val_pred = decision_tree_predictions(val_data, tree)
                MSE_val = metrics.mean_squared_error(val_data.iloc[:, -1], val_pred)

                grid_search["max_depth"].append(max_depth)
                grid_search["min_samples"].append(min_samples)
                grid_search["MSE_train"].append(MSE_train)
                grid_search["MSE_val"].append(MSE_val)
                i+=1
                print(f"Progress: Iteration {i}/16")
        grid_search = pd.DataFrame(grid_search)
        grid_search=grid_search.sort_values(by="MSE_val", ascending=True,ignore_index=True)
    else:
        grid_search = {"max_depth": [], "min_samples": [], "accurcy_train": [], "accurcy_val": []}
        for max_depth in range(2, 10,2):
            for min_samples in range(2 ,10 ,2):
                tree = decision_tree_algorithm(train_data,ml_task="classification", max_depth=max_depth,
                                               min_samples=min_samples)
                train_pred=decision_tree_predictions(train_data,tree)
                accurcy_train=score(train_pred,train_data)
                val_pred = decision_tree_predictions(val_data, tree)
                accurcy_val = score(val_pred, val_data)
                grid_search["max_depth"].append(max_depth)
                grid_search["min_samples"].append(min_samples)
                grid_search["accurcy_train"].append(accurcy_train)
                grid_search["accurcy_val"].append(accurcy_val)
                i+=1
                print(f"Progress: Iteration {i}/16")
        grid_search = pd.DataFrame(grid_search)
        grid_search=grid_search.sort_values("accurcy_val", ascending=False,ignore_index=True)
    return grid_search["max_depth"][0] , grid_search["min_samples"][0]

def decision_tree_predictions(test,tree):
    predictions = test.apply(predict_example, args=(tree,), axis=1)
    return predictions


# AdaBoost

In [37]:
def calculate_model_weight(error):
    return 0.5*np.log((1-error)/(error+0.000001))
def update_row_weights(row,alpha):
    if row[-3] == row['y_pred']:
        return row['weights'] * np.exp(-alpha)
    else:
        return row['weights'] * np.exp(alpha)
def create_new_dataset(df):
    indices = []
    for i in range(df.shape[0]):
        a = np.random.random()
        indices.append(df.index[(df['cumsum_upper'] >= a) & (a > df['cumsum_lower'])].tolist()[0])
    return indices
def AdaboostAlgo(data,nEstimators):
    df=data.copy()
    size=df.shape[1]
    MyModel=[]
    for x in range(0,nEstimators):
        df['weights'] = 1/df.shape[0]
        stump=decision_tree_algorithm(df.iloc[:,:-1],max_depth=1)
        df['y_pred'] = decision_tree_predictions(df.iloc[:,:-1],stump)
        errorCells=df.loc[df['y_pred']!=df.iloc[:,-3]]
        erorr=errorCells['weights'].sum()
        modelW=calculate_model_weight(erorr)
        df['updated_weights'] = df.apply(update_row_weights, args=(modelW,),axis=1)
        df['nomalized_weights'] = df['updated_weights']/df['updated_weights'].sum()
        df['cumsum_upper'] = np.cumsum(df['nomalized_weights'])
        df['cumsum_lower'] = df['cumsum_upper'] - df['nomalized_weights']
        index_values = create_new_dataset(df)
        df=df.iloc[index_values,:size]
        MyModel.append((stump,modelW))
    return MyModel
def AdaBoostPrediction(model,test):
    predections=[]
    sumPredetions=[0]*len(test)
    for x in model:
        predctions=decision_tree_predictions(test,x[0])
        for i,k in enumerate(predctions):
            if k == 1:
                sumPredetions[i]+=x[1]
            else:
                sumPredetions[i]-=x[1]
        predctions=[]
    return [(w>0)*1 for w in sumPredetions]
def ADBGridSearch(train,val):
    i=0
    grid_search = {"n_estimator": [],"accurcy_val": []}
    for nE in [5,6,7,8,10,25]:
        AdaBoostModel=AdaboostAlgo(train,nE)
        y_predAdaBoost=AdaBoostPrediction(AdaBoostModel,val)
        accurcy_val=score(y_predAdaBoost,val)
        grid_search["n_estimator"].append(nE)
        grid_search["accurcy_val"].append(accurcy_val)
        i+=1
        print(f"Progress: Iteration {i}/10")
    grid_search = pd.DataFrame(grid_search)
    grid_search=grid_search.sort_values("accurcy_val", ascending=False,ignore_index=True)
    print(grid_search)
    return grid_search["n_estimator"][0] 

# Classification area Type

## B=1 and P=0 

In [27]:
data = pd.read_csv("data-1.csv", header=0)
data["area_typeL"]=(data["area_type"]=='B')*1
data=data.drop(["area_type","Unnamed: 0"],axis='columns')
data.columns=['availability','bedrooms',"total_sqft","bath","balcony","ranked","price-in-rupees","area_typeL"]
test, train, val=data[10051:], data[:8040], data[8041:10050]
train_data=data[:10050]

### Decision Tree

In [9]:
max_d ,min_s=grid_search(train,val,ml_task="classification")
print("Grid Search Result: maxDipth="+str(max_d)+" minSamples="+str(min_s))
tree = decision_tree_algorithm(train_data,min_samples=min_s,max_depth=max_d)
y_predDT=decision_tree_predictions(test,tree)
accurcy=score(y_predDT,test)
print("My tree accurcy is :",accurcy)

Progress: Iteration 1/16
Progress: Iteration 2/16
Progress: Iteration 3/16
Progress: Iteration 4/16
Progress: Iteration 5/16
Progress: Iteration 6/16
Progress: Iteration 7/16
Progress: Iteration 8/16
Progress: Iteration 9/16
Progress: Iteration 10/16
Progress: Iteration 11/16
Progress: Iteration 12/16
Progress: Iteration 13/16
Progress: Iteration 14/16
Progress: Iteration 15/16
Progress: Iteration 16/16
Grid Search Result: maxDipth=8 minSamples=2
My tree accurcy is : 0.9060509554140127


### Adaboost

In [38]:
nEstimator=ADBGridSearch(train,val)
print("Grid Search Result: NEstimators="+str(nEstimator))
AdaBoostModel=AdaboostAlgo(train_data,nEstimator)

Progress: Iteration 1/10
Progress: Iteration 2/10
Progress: Iteration 3/10
Progress: Iteration 4/10
Progress: Iteration 5/10
Progress: Iteration 6/10
   n_estimator  accurcy_val
0            6     0.879542
1            7     0.851170
2           25     0.850174
3            5     0.848183
4            8     0.796416
5           10     0.707815
Grid Search Result: NEstimators=6


In [39]:
y_predAdaBoost=AdaBoostPrediction(AdaBoostModel,test)
accurcy=score(y_predAdaBoost,test)
print("Adaboost accurcy is :",accurcy)

Adaboost accurcy is : 0.8586783439490446


#  sensitivity and specificity

In [10]:
fp = 0
fn = 0
tp = 0
tn = 0

for actual_value, predicted_value in zip(test.iloc[:,-1], y_predDT):
    if predicted_value == actual_value:
        if predicted_value == 1: 
            tp += 1
        else:
            tn += 1
    else: 
        if predicted_value == 1: 
            fp += 1
        else:
            fn += 1
SensitivityDT=tp / (tp + fn)
SpecificityDT=tn / (tn+fp)

In [40]:
fp = 0
fn = 0
tp = 0
tn = 0

for actual_value, predicted_value in zip(test.iloc[:,-1], y_predAdaBoost):
    if predicted_value == actual_value:
        if predicted_value == 1: 
            tp += 1
        else:
            tn += 1
    else: 
        if predicted_value == 1: 
            fp += 1
        else:
            fn += 1
SensitivityAdaBoost=tp / (tp + fn)
SpecificityAdaBoost=tn / (tn+fp)

In [41]:
print("DecisionTree Sensitivity: ",SensitivityDT)
print("DecisionTree Specificity: ",SpecificityDT)
print("AdaBoost Sensitivity: ",SensitivityAdaBoost)
print("AdaBoost Specificity: ",SpecificityAdaBoost)

DecisionTree Sensitivity:  0.9514338575393154
DecisionTree Specificity:  0.6257142857142857
AdaBoost Sensitivity:  0.9875115633672525
AdaBoost Specificity:  0.06285714285714286


# Regression Price

In [15]:
data1 = pd.read_csv("data-1.csv", header=0)
data1=data1.drop(["Unnamed: 0"],axis='columns')
data1["area_type"]=(data1["area_type"]=='B')*1
data1.columns=['area_type','availability','bedrooms',"total_sqft","bath","balcony","ranked","price-in-rupees"]
test, train, val=data1[10051:], data1[:8040], data1[8041:10050]

In [16]:
max_d ,min_s=grid_search(train,val,ml_task="regression")
print("Grid Search Result: maxDipth="+str(max_d)+" minSamples="+str(min_s))
train_data=data1[:10050]
tree = decision_tree_algorithm(train_data,ml_task='regression',min_samples=min_s,max_depth=max_d)
y_pred=decision_tree_predictions(test,tree)
mse=metrics.mean_squared_error(test.iloc[:,-1],y_pred)
print("My tree MSE is :",mse)

Progress: Iteration 1/16
Progress: Iteration 2/16
Progress: Iteration 3/16
Progress: Iteration 4/16
Progress: Iteration 5/16
Progress: Iteration 6/16
Progress: Iteration 7/16
Progress: Iteration 8/16
Progress: Iteration 9/16
Progress: Iteration 10/16
Progress: Iteration 11/16
Progress: Iteration 12/16
Progress: Iteration 13/16
Progress: Iteration 14/16
Progress: Iteration 15/16
Progress: Iteration 16/16
Grid Search Result: maxDipth=6 minSamples=2
My tree MSE is : 65230071096394.2
