In [1]:
import numpy as np

def get_squared_error(y_actual,y_pred):
    """Return the element-wise squared difference between actual and predicted values.

    Both arguments are handed to numpy, so scalars and array-likes are
    accepted and broadcast against each other by ``np.subtract``.
    """
    residual = np.subtract(y_actual, y_pred)
    return np.square(residual)

def get_absolute_error(y_actual,y_pred):
    """Return the element-wise absolute difference between actual and predicted values.

    Both arguments are handed to numpy, so scalars and array-likes are
    accepted and broadcast against each other by ``np.subtract``.
    """
    residual = np.subtract(y_actual, y_pred)
    return np.absolute(residual)

def reg_stump_train(X_trn,y_trn):
    """Fit a one-split regression stump by exhaustive search.

    Every feature column is scanned. Candidate thresholds are the midpoints
    between consecutive *distinct* sorted values of the column, so a
    threshold never sits on a duplicated data point and never leaves one side
    empty. The (feature, threshold) pair that minimises the total sum of
    squared errors around each side's mean is selected. On ties the earliest
    feature and, within it, the smallest threshold wins.

    Parameters
    ----------
    X_trn : array-like, shape (n_samples, n_features)
        Training features.
    y_trn : array-like, n_samples values
        Training targets (flattened to 1-D).

    Returns
    -------
    dim : int
        Column index of the selected feature.
    thresh : float
        Split threshold; samples with ``X[:, dim] <= thresh`` go left.
    c_left : float
        Mean target of the left side.
    c_right : float
        Mean target of the right side.

    Raises
    ------
    ValueError
        If ``X_trn`` is not 2-D, if the sample counts of ``X_trn`` and
        ``y_trn`` differ, or if no usable split exists (for example fewer
        than two samples, every feature constant, or non-finite targets).
    """
    X_trn = np.asarray(X_trn, dtype=float)
    y_trn = np.asarray(y_trn, dtype=float).ravel()
    if X_trn.ndim != 2:
        raise ValueError("X_trn must be 2-D with shape (n_samples, n_features)")
    if y_trn.shape[0] != X_trn.shape[0]:
        raise ValueError("X_trn and y_trn must contain the same number of samples")

    # The winning split is tracked by integer index. Parsing the index back
    # out of a label such as "X10" by its last character breaks for >= 10 features.
    best_dim = None
    best_thresh = None
    best_sse = np.inf

    for feat_idx in range(X_trn.shape[1]):
        column = X_trn[:, feat_idx]
        distinct = np.unique(column)  # sorted, duplicates removed
        for cand in (distinct[:-1] + distinct[1:]) / 2.0:
            y_left = y_trn[column <= cand]
            y_right = y_trn[column > cand]
            # Guard against a midpoint that rounds onto an end value and
            # leaves one side empty (its mean would be NaN).
            if y_left.size == 0 or y_right.size == 0:
                continue
            sse = (np.sum((y_left - y_left.mean()) ** 2)
                   + np.sum((y_right - y_right.mean()) ** 2))
            # Strict "<" keeps the first minimum: earliest feature, smallest threshold.
            if sse < best_sse:
                best_dim, best_thresh, best_sse = feat_idx, cand, sse

    if best_dim is None:
        raise ValueError(
            "no valid split found: need at least one feature with two "
            "distinct values and finite targets"
        )

    chosen = X_trn[:, best_dim]
    c_left = np.mean(y_trn[chosen <= best_thresh])
    c_right = np.mean(y_trn[chosen > best_thresh])

    return best_dim, best_thresh, c_left, c_right

def reg_stump_predict(x,dim,c_left,c_right,thresh=None):
    """Predict the target of one sample with a trained regression stump.

    Parameters
    ----------
    x : indexable
        A single sample (one row of the feature matrix); only ``x[dim]`` is read.
    dim : int
        Index of the feature the stump splits on.
    c_left : float
        Value returned when ``x[dim] <= thresh``.
    c_right : float
        Value returned when ``x[dim] > thresh``.
    thresh : float, optional
        Split threshold. Pass it explicitly so the prediction does not depend
        on notebook state. When omitted, the module-level variable ``thresh``
        is used, which is what this function previously relied on implicitly.

    Returns
    -------
    ``c_left`` or ``c_right``.

    Raises
    ------
    NameError
        If ``thresh`` is not passed and no module-level ``thresh`` exists.
    """
    if thresh is None:
        # Backward-compatible fallback for callers that still pass four arguments.
        try:
            thresh = globals()["thresh"]
        except KeyError:
            raise NameError(
                "thresh was not passed to reg_stump_predict and no "
                "module-level 'thresh' is defined"
            ) from None
    return c_left if x[dim] <= thresh else c_right

def get_errors_dtree_data(X_data,y_data,dim,c_left,c_right):
    """Score a trained stump on a dataset, sample by sample.

    Rows of ``X_data`` are paired with entries of ``y_data`` (pairing stops at
    the shorter of the two). Each row is predicted with ``reg_stump_predict``
    and compared with its target.

    The split threshold is not an argument of this function;
    ``reg_stump_predict`` resolves ``thresh`` from module scope.

    Returns
    -------
    (squared_errors, absolute_errors, predictions) : tuple of three lists,
        each with one entry per scored sample, in input order.
    """
    scored = [
        (target, reg_stump_predict(sample, dim, c_left, c_right))
        for sample, target in zip(X_data, y_data)
    ]
    squared_errors = [get_squared_error(target, guess) for target, guess in scored]
    absolute_errors = [get_absolute_error(target, guess) for target, guess in scored]
    predictions = [guess for _, guess in scored]
    return squared_errors, absolute_errors, predictions

# Enter X_trn (first line: number of rows and columns; then one row of feature values per line)

In [2]:
# Header line is "<rows> <cols>"; each of the next <rows> lines is one
# whitespace-separated sample. m is parsed but not used to build the array.
n, m = [int(token) for token in input().split()]
X_trn = np.array([input().strip().split() for _ in range(n)], dtype=float)

5 2
0 1
2 3
4 2
7 2
10 0


In [3]:
# Display the training feature matrix that was read in.
X_trn

array([[ 0.,  1.],
       [ 2.,  3.],
       [ 4.,  2.],
       [ 7.,  2.],
       [10.,  0.]])

# Enter y_trn (first the number of rows, then one target value per line)

In [4]:
# Number of target values to read for the training set, followed by the
# values themselves, one per line.
nrows = int(input("Enter number of rows: "))
y_trn_in = []
for i in range(nrows):
    y_trn_in.append(float(input()))
y_trn = np.array(y_trn_in)

Enter number of rows: 5
1
2
3
8
12


In [5]:
# Display the training target vector that was read in.
y_trn

array([ 1.,  2.,  3.,  8., 12.])

# Get dimension, threshold, c_left and c_right of the best split 

In [6]:
# Fit the stump on the training data and report the learned parameters.
dim, thresh, c_left, c_right = reg_stump_train(X_trn, y_trn)
for _label, _value in (
    ("dim: ", dim),
    ("thresh: ", thresh),
    ("c_left: ", c_left),
    ("c_right: ", c_right),
):
    print(_label, _value)

dim:  0
thresh:  5.5
c_left:  2.0
c_right:  10.0


In [None]:
# Score the fitted stump on the training data; the mean of the per-sample
# squared errors is the training MSE.
train_mse_li, train_mae_li, pred_list = get_errors_dtree_data(
    X_trn, y_trn, dim, c_left, c_right
)
print(np.asarray(train_mse_li).mean())

# Read X_val

In [None]:
# Header line is "<rows> <cols>"; each of the next <rows> lines is one
# whitespace-separated sample. q is parsed but not used to build the array.
p, q = [int(token) for token in input().split()]
X_val = np.array([input().strip().split() for _ in range(p)], dtype=float)

# Read y_val

In [None]:
# Number of target values to read for the held-out set, followed by the
# values themselves, one per line.
rowc = int(input("Enter number of rows: "))
y_val_in = []
for i in range(rowc):
    y_val_in.append(float(input()))
y_val = np.array(y_val_in)

# Calculate test MSE on the held-out data (X_val, y_val)

In [None]:
# Score the fitted stump on the held-out data; the mean of the per-sample
# squared errors is the test MSE. Note that pred_list is rebound here.
test_mse_li, test_mae_li, pred_list = get_errors_dtree_data(
    X_val, y_val, dim, c_left, c_right
)
print(np.asarray(test_mse_li).mean())