In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset path used further down can be copied from the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
def intialise_parameters(lenw):
    """Create the starting parameters of the linear model.

    Parameters
    ----------
    lenw : int
        Number of weights (one per input feature).

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a ``(1, lenw)`` array drawn from NumPy's
        global standard-normal generator and ``b`` is the integer ``0``.
    """
    initial_bias = 0
    initial_weights = np.random.randn(1, lenw)
    return initial_weights, initial_bias

In [None]:
def forward_prop(X,w,b):
    """Return the linear prediction ``np.dot(w, X) + b``.

    ``X`` is multiplied on the left by ``w``, so features run along the
    first axis of ``X`` and the result has one column per column of ``X``.
    """
    weighted_sum = np.dot(w, X)
    return weighted_sum + b

In [None]:
def cost_function(z,y,reg_penalty='',penalty_factor=0,w=None):
    """Half mean-squared-error cost with an optional L1/L2 weight penalty.

    Parameters
    ----------
    z : ndarray
        Predictions, shaped like ``y``.
    y : ndarray
        Targets; ``y.shape[1]`` is used as the sample count ``m``.
    reg_penalty : str
        ``'L1'`` adds ``penalty_factor * sum(|w|)``, ``'L2'`` adds
        ``penalty_factor / (2 * m) * sum(w ** 2)``; any other value adds nothing.
    penalty_factor : float
        Regularisation strength.
    w : ndarray, optional
        Weights the penalty is computed from. Previously the function read a
        free name ``w``, i.e. whatever module-level ``w`` an earlier cell had
        left behind rather than the weights being trained. Pass the current
        weights here. When omitted, the module-level ``w`` is still used so
        existing calls keep working.

    Returns
    -------
    float
        Data cost plus penalty.

    Raises
    ------
    NameError
        If a penalty is requested, ``w`` is not passed and no module-level
        ``w`` exists.
    """
    m = y.shape[1]
    J = (1/(2*m))*np.sum(np.square(z-y),dtype=np.float64)

    if reg_penalty not in ('L1', 'L2'):
        return J

    if w is None:
        # Legacy fallback: behave like the old free-variable lookup.
        w = globals().get('w')
        if w is None:
            raise NameError("name 'w' is not defined; pass the weights via w=...")

    if reg_penalty=='L1':
        penalty = penalty_factor * np.sum(np.abs(w))
    else:
        penalty = (penalty_factor / (2 * m)) * np.sum(np.square(w))

    return J+penalty

In [None]:
def back_prop(X, y, z, reg_penalty="", penalty_factor=0, w=None):
    """Gradients of the (optionally regularised) half-MSE cost.

    Parameters
    ----------
    X : ndarray
        Inputs with features along the first axis (``X.T`` is multiplied by
        the residual row vector).
    y : ndarray
        Targets; ``y.shape[1]`` is used as the sample count ``m``.
    z : ndarray
        Predictions, shaped like ``y``.
    reg_penalty : str
        ``"L1"`` adds ``penalty_factor * sign(w)`` to ``dw``; ``"L2"`` adds
        ``penalty_factor / m * w``; any other value adds nothing.
    penalty_factor : float
        Regularisation strength.
    w : ndarray, optional
        Weights the regularisation gradient is evaluated at. Previously the
        function read a free name ``w`` (a stale module-level variable), so
        the penalty gradient did not follow the weights being trained. When
        omitted, the module-level ``w`` is still used so existing calls keep
        working.

    Returns
    -------
    tuple
        ``(dw, db)``: weight gradient and scalar bias gradient.

    Raises
    ------
    NameError
        If a penalty is requested, ``w`` is not passed and no module-level
        ``w`` exists.
    """
    m = y.shape[1]
    dz = (1/m)*(z-y)
    dw = np.dot(dz,X.T)
    db = np.sum(dz)

    if reg_penalty in ("L1", "L2"):
        if w is None:
            # Legacy fallback: behave like the old free-variable lookup.
            w = globals().get('w')
            if w is None:
                raise NameError("name 'w' is not defined; pass the weights via w=...")
        if reg_penalty=="L1":
            dw = dw + penalty_factor * np.sign(w)
        else:
            dw = dw + (penalty_factor / m) * w

    return dw,db


In [None]:
def gradient_descent_update(w,b,dw,db,learning_rate=0.01):
    """Take one gradient-descent step.

    Returns the pair ``(w - learning_rate * dw, b - learning_rate * db)``;
    the inputs themselves are not modified.
    """
    stepped_w = w - learning_rate*dw
    stepped_b = b - learning_rate*db
    return stepped_w, stepped_b

In [None]:
def linear_regresssion_model(X_train, y_train, X_val, y_val, learning_rate=0.01, epochs=50,stop_loss=0.01,metric='RMSE',reg_penalty="",penalty_factor=0):
    """Fit a linear regressor with batch gradient descent and track validation error.

    Parameters
    ----------
    X_train, X_val : array-like, one row per sample
        Feature matrices; they are transposed internally so features run
        along the first axis.
    y_train, y_val : array-like, one value per sample
        Targets; wrapped into ``(1, m)`` row vectors.
    learning_rate : float
        Gradient-descent step size.
    epochs : int
        Maximum number of passes over the training set.
    stop_loss : float
        Training stops early once the training cost drops below this value.
    metric : {'RMSE', 'MSE', 'MAE'}
        Validation metric recorded after every epoch.
    reg_penalty : str
        ``"L1"`` or ``"L2"`` to regularise the weights; anything else disables it.
    penalty_factor : float
        Regularisation strength.

    Returns
    -------
    tuple
        ``(z_val, y_val_arr, w, b, errors)``: validation predictions for the
        returned parameters, validation targets as a ``(1, m)`` array, final
        weights, final bias, and the per-epoch validation errors.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    if metric not in ("MAE", "MSE", "RMSE"):
        raise ValueError("metric must be 'MAE', 'MSE' or 'RMSE', got %r" % (metric,))

    X_train = np.asarray(X_train).T
    X_val = np.asarray(X_val).T
    y_train_arr = np.array([y_train])
    y_val_arr = np.array([y_val])

    m_train = y_train_arr.shape[1]
    w, b = intialise_parameters(X_train.shape[0])

    errors = []
    # Defined up front so the return value is valid even when epochs <= 0.
    z_val = forward_prop(X_val, w, b)

    for _ in range(epochs):
        z_train = forward_prop(X_train, w, b)

        # The penalty is evaluated on the weights being trained. The helpers
        # are called without a penalty so they cannot pick up an unrelated
        # module-level `w`.
        penalty, penalty_grad = _regularisation_terms(w, m_train, reg_penalty, penalty_factor)
        cost_train = cost_function(z_train, y_train_arr) + penalty
        dw, db = back_prop(X_train, y_train_arr, z_train)
        w, b = gradient_descent_update(w, b, dw + penalty_grad, db, learning_rate)

        # Evaluate before the stop check so z_val/errors always describe the
        # parameters that are returned.
        z_val = forward_prop(X_val, w, b)
        errors.append(_validation_error(z_val, y_val_arr, metric))

        if cost_train < stop_loss:
            print('reached stoploss')
            break

    return z_val, y_val_arr, w, b, errors


def _regularisation_terms(w, m, reg_penalty, penalty_factor):
    """Return ``(cost_penalty, weight_gradient)`` of the L1/L2 term at ``w``."""
    if reg_penalty == "L1":
        return penalty_factor * np.sum(np.abs(w)), penalty_factor * np.sign(w)
    if reg_penalty == "L2":
        return (penalty_factor / (2 * m)) * np.sum(np.square(w)), (penalty_factor / m) * w
    return 0.0, 0.0


def _validation_error(z_val, y_val_arr, metric):
    """Return the MAE, MSE or RMSE between predictions and targets."""
    m_val = y_val_arr.shape[1]
    residual = z_val - y_val_arr
    if metric == "MAE":
        return (1 / m_val) * np.sum(np.abs(residual), dtype=np.float64)
    mse = (1 / m_val) * np.sum(np.square(residual), dtype=np.float64)
    return mse if metric == "MSE" else np.sqrt(mse, dtype=np.float64)
        

In [None]:
# Load the WQI table; the CSV column 'Unnamed: 0' becomes the row index.
# NOTE(review): absolute Kaggle path — this cell only runs where that dataset is mounted.
dataset=pd.read_csv('/kaggle/input/water-quality-dataset-with-wqi-result/WQI Dataset.csv',index_col='Unnamed: 0')
print(dataset.shape)

In [None]:
# Preview the first rows of the loaded table.
dataset.head()

In [None]:

# Keep only the columns used in this analysis.
# NOTE(review): a later cell reads dataset['BOD_avg'] as the target, but that
# column is not in this list — confirm which column is the intended target.
selected_columns = ['Alkalinity-total (as CaCO3)', 'Ammonia-Total (as N)',
                   'BOD - 5 days (Total)', 'Chloride', 'Conductivity @25°C',
                   'Dissolved Oxygen', 'ortho-Phosphate (as P) - unspecified', 'pH',
                   'Temperature', 'Total Hardness (as CaCO3)', 'True Colour']

# .copy() makes the selection an independent frame, so the column assignment
# in the scaling cell writes to real data instead of a possible view
# (avoids pandas' SettingWithCopyWarning).
dataset = dataset[selected_columns].copy()

In [None]:
# Columns to z-score. 'BOD - 5 days (Total)' is the only selected column that
# is not listed here, so it keeps its original scale.
columns_to_scale = ['Alkalinity-total (as CaCO3)', 'Ammonia-Total (as N)', 'Chloride', 'Conductivity @25°C',
                   'Dissolved Oxygen', 'ortho-Phosphate (as P) - unspecified', 'pH',
                   'Temperature', 'Total Hardness (as CaCO3)', 'True Colour']


# NOTE(review): mean/std are computed on the full dataset, i.e. before any
# train/validation split, so validation rows influence the scaling — confirm
# that this is acceptable.
means = dataset[columns_to_scale].mean()
std_devs = dataset[columns_to_scale].std()
dataset[columns_to_scale] = (dataset[columns_to_scale] - means) / std_devs
dataset.head()

In [None]:
# Regression target.
# NOTE(review): 'BOD_avg' is not among the selected_columns the dataset was
# reduced to earlier, so on a fresh top-to-bottom run this lookup presumably
# raises KeyError — confirm the intended target column.
y=dataset['BOD_avg']
y.describe()

In [None]:
# Feature matrix: every remaining column except the target.
X=dataset.drop(columns=['BOD_avg'])
X.describe()

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
def train_test_split_custom(X, y, test_size=0.2, random_state=None):
    """Shuffle the samples and split them into a train and a test part.

    Parameters
    ----------
    X, y : pandas DataFrame/Series or array-like
        Features and targets with the same number of rows. pandas objects are
        indexed positionally with ``.iloc`` and keep their type; anything
        else is converted with ``np.asarray``.
    test_size : float
        Fraction of samples placed in the test part, between 0 and 1.
    random_state : int, optional
        When given, seeds NumPy's *global* generator before shuffling (this
        also affects later ``np.random`` calls).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the first
        ``int((1 - test_size) * n)`` shuffled samples form the train part.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    num_samples = len(X)
    if len(y) != num_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    # Set seed for reproducibility
    if random_state is not None:
        np.random.seed(random_state)

    # Shuffle indices
    indices = np.arange(num_samples)
    np.random.shuffle(indices)

    # Determine split index
    split_index = int((1 - test_size) * num_samples)
    train_idx, test_idx = indices[:split_index], indices[split_index:]

    def _take(data, idx):
        # pandas objects select rows by position; arrays are indexed directly.
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    return _take(X, train_idx), _take(X, test_idx), _take(y, train_idx), _take(y, test_idx)

In [None]:
# 80/20 train/validation split; random_state=5 seeds NumPy's global generator inside the helper.
X_train, X_val, y_train, y_val = train_test_split_custom(X, y, test_size=0.2, random_state=5)


In [None]:
# Baseline run without regularisation: learning rate 0.2, up to 700 epochs.
# errors_n receives the per-epoch validation error (metric defaults to RMSE).
z_val,y_val_arr,w,b,errors_n=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=700)
print(w)

In [None]:
# Validation RMSE per epoch of the baseline run.
plt.plot(errors_n)
plt.xlabel('Iterations/epochs')
plt.ylabel('RMSE')

# Title corrected: the run that produced errors_n used learning_rate=0.2, not 0.01.
plt.title('epochs :- 700  learning_rate 0.2' )
plt.show()

In [None]:
# Learning-rate sweep: train for up to 400 epochs at every learning rate from
# 0.02 to 0.65 (step 0.01) and keep the last recorded validation error of each run.
# NOTE(review): weights are re-initialised randomly on every run, so part of the
# curve's variation comes from the initialisation rather than the learning rate.
final_error = []
learningRates=[]

for i in range(2, 66):
    learningRate = (i)/100
    z_val,y_val_arr,w,b,errors=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate = learningRate,epochs=400)
    final_error.append(errors[-1])
    learningRates.append(learningRate)

plt.plot(learningRates,final_error)  
plt.xlabel('learning_rate')
plt.ylabel('RMSE')
plt.title('Learning Rate vs RMSE')
plt.show()

#### Optimal learning rate would be at 0.1 to 0.2

In [None]:
# Overlay validation predictions and targets, sample by sample.
# NOTE(review): when run top to bottom, z_val/y_val_arr come from the last
# iteration of the preceding sweep (learning rate 0.65), not from a chosen model.
plt.figure(figsize=(20, 5))
plt.plot(z_val[0], label='Predicted', color='blue')
plt.plot(y_val_arr[0], label='Actual', color='red')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Plot of Predicted and Actual Values (Validation Data)')
plt.legend()
plt.show()

In [None]:
# Number of validation samples.
len(y_val)

In [None]:
from sklearn import linear_model

In [None]:
# Reference model: scikit-learn's LinearRegression fitted on the same split,
# used below to sanity-check the from-scratch implementation.
linear_regression = linear_model.LinearRegression()
model = linear_regression.fit(X_train,y_train)
prediction = linear_regression.predict(X_val)


In [None]:
# Shape of the scikit-learn prediction array.
prediction.shape

In [None]:
# Mean absolute error of the scikit-learn predictions on the validation targets.
# NOTE(review): y_val is presumably a 1-D Series, for which `.T` is a no-op — confirm.
MAE_val_with_sklearn = (1/len(y_val))*np.sum(np.abs(prediction-y_val.T))

In [None]:
# Display the MAE computed in the previous cell.
MAE_val_with_sklearn

In [None]:
# Compare the scikit-learn predictions with the from-scratch model's
# predictions on the validation samples.
plt.figure(figsize=(30, 4))
plt.plot(prediction, label='Predicted_inbuilt', color='blue')
plt.plot(z_val[0], label='Predicted_', color='black',linestyle='--')

plt.xlabel('Index')
plt.ylabel('Value')
# Title corrected: only the two sets of predictions are drawn here; the actual
# values are not plotted.
plt.title('scikit-learn vs From-Scratch Predictions (Validation Data)')
plt.legend()
plt.show()

In [None]:
# Three runs with identical settings (learning rate 0.2, up to 400 epochs):
# L1 penalty, L2 penalty (both with factor 0.1) and no penalty.
# z_val, y_val_arr, w and b are overwritten by each call; only the error
# histories are kept under separate names.
z_val,y_val_arr,w,b,errors_l1=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=400,reg_penalty="L1",penalty_factor=0.1)
z_val,y_val_arr,w,b,errors_l2=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=400,reg_penalty="L2",penalty_factor=0.1)
z_val,y_val_arr,w,b,errors_n=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=400)

In [None]:
# Validation RMSE per epoch for the L1, L2 and unregularised runs.
plt.plot(errors_l1, label='L1 Penalty', color='blue')
plt.plot(errors_l2, label='L2 Penalty', color='green')
plt.plot(errors_n, label='No Penalty', color='red')

plt.xlabel('Iterations/epochs')
# Title corrected: the runs above used learning_rate=0.2 and penalty_factor=0.1
# (the old title claimed 0.01 and 0.4).
plt.title('Epochs: 400, Learning Rate: 0.2, L1,L2 Penalty Factor: 0.1')
plt.ylabel('RMSE')

plt.legend()
plt.show()

In [None]:
# Penalty-factor sweep: for every factor from 0.02 to 0.98 (step 0.01) train an
# L1 and an L2 model (learning rate 0.5, up to 100 epochs) and record the last
# validation error of each.
error_reg_l1 = []
error_reg_l2 = []
penalty_list=[]
for i in range(2, 99):
    penalty = (i)/100
    z_val,y_val_arr,w,b,errors_l1=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.5,epochs=100,reg_penalty="L1",penalty_factor=penalty)
    z_val,y_val_arr,w,b,errors_l2=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.5,epochs=100,reg_penalty="L2",penalty_factor=penalty)
    error_reg_l1.append(errors_l1[-1])
    error_reg_l2.append(errors_l2[-1])
    penalty_list.append(penalty)
plt.plot(penalty_list,error_reg_l1,label='L1 Penalty', color='blue')
plt.plot(penalty_list,error_reg_l2,label='L2 Penalty', color='green')
plt.title('Penalty Factor vs RMSE')
plt.xlabel('Penalty Factor')
plt.ylabel('RMSE')

plt.legend()
plt.show()


In [None]:
# Compare three validation fractions (20 %, 30 %, 40 %) with otherwise equal
# settings (learning rate 0.4, up to 100 epochs, same shuffle seed).
# The legend labels read validation-train, e.g. '20-80' is test_size=0.2.
# After this cell X_train/X_val/y_train/y_val hold the 40 % split.
X_train, X_val, y_train, y_val = train_test_split_custom(X, y, test_size=0.2, random_state=5)
z_val,y_val_arr,w,b,errors_80=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.4,epochs=100)
X_train, X_val, y_train, y_val = train_test_split_custom(X, y, test_size=0.3, random_state=5)
z_val,y_val_arr,w,b,errors_70=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.4,epochs=100)
X_train, X_val, y_train, y_val = train_test_split_custom(X, y, test_size=0.4, random_state=5)
z_val,y_val_arr,w,b,errors_60=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.4,epochs=100)
plt.plot(errors_80, label='20-80', color='blue')
plt.plot(errors_70, label='30-70', color='green')
plt.plot(errors_60, label='40-60', color='red')
plt.ylabel('RMSE')
plt.xlabel('No of iterations/epochs')
plt.title('Epochs vs RMSE')
plt.legend()
plt.show()

##### A 40-60 (validation-train) split looks like a good choice
##### Around 200 epochs appears to be sufficient
##### The L1 penalty factor does not appear to affect the RMSE

In [None]:
# L1-regularised run on the 40 % validation split (learning rate 0.1, up to
# 100 epochs, penalty factor 0.1); print the learned weights.
X_train, X_val, y_train, y_val = train_test_split_custom(X, y, test_size=0.4, random_state=5)
z_val,y_val_arr,w,b,errors_l1=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.1,epochs=100,reg_penalty="L1",penalty_factor=0.1)
print(w)

In [None]:
# Validation-fraction sweep: for test_size from 0.01 to 0.79 (step 0.01) train
# an unregularised, an L1 and an L2 model (learning rate 0.2, up to 100 epochs,
# penalty factor 0.1) and record the last validation error of each.
# The split variables are overwritten on every iteration and keep the
# test_size=0.79 split afterwards.
error_split = []
error_split_l1=[]
error_split_l2=[]
split_list=[]
for i in range(1, 80):
    split= (i)/100
    X_train, X_val, y_train, y_val = train_test_split_custom(X, y, test_size=split, random_state=5)
    z_val,y_val_arr,w,b,errors_norm=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=100)
    z_val,y_val_arr,w,b,errors_l1=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=100,reg_penalty="L1",penalty_factor=0.1)
    z_val,y_val_arr,w,b,errors_l2=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=100,reg_penalty="L2",penalty_factor=0.1)
    error_split.append(errors_norm[-1])
    error_split_l1.append(errors_l1[-1])
    error_split_l2.append(errors_l2[-1])
    split_list.append(split)
plt.plot(split_list,error_split,label='No Penalty', color='blue')
plt.plot(split_list,error_split_l1,label='L1 penallty',color='red')
plt.plot(split_list,error_split_l2,label='L2 penallty',color='green')
plt.xlabel('Test Split')
plt.ylabel('RMSE')
plt.title('Test Split')
plt.legend()
plt.show()


In [None]:
class MyPCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Features are standardised (zero mean, unit standard deviation) before the
    covariance is computed. After ``fit`` the instance exposes:

    * ``mean`` / ``scale`` - per-feature statistics used for standardisation
    * ``components`` - one principal axis per row, strongest first
    * ``explained_variance_ratio`` - list with the variance share of each kept axis
    * ``cum_explained_variance`` - cumulative sum of those shares
    """

    def __init__(self, n_components):
        """Store how many principal axes ``fit`` should keep."""
        self.n_components = n_components

    def fit(self, X):
        """Learn the principal axes of ``X`` (rows are samples) and return ``self``."""
        # Standardize data
        X = X.copy()
        self.mean = np.mean(X, axis = 0)
        scale = np.asarray(np.std(X, axis = 0), dtype=float)
        # A constant feature has zero spread; dividing by 1 keeps it at 0
        # after centring instead of turning the column into NaN.
        self.scale = np.where(scale == 0, 1.0, scale)
        X_std = (X - self.mean) / self.scale

        # The covariance matrix is symmetric, so eigh is the right solver:
        # it guarantees real eigenvalues/eigenvectors, whereas eig may return
        # complex values because of round-off. atleast_2d covers the
        # single-feature case, where np.cov returns a 0-d array.
        cov_mat = np.atleast_2d(np.cov(np.asarray(X_std, dtype=float).T))
        eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

        # Fix the sign of every eigenvector (column) so that its entry with
        # the largest absolute value is positive; this makes the axes unique.
        max_abs_idx = np.argmax(np.abs(eig_vecs), axis=0)
        signs = np.sign(eig_vecs[max_abs_idx, np.arange(eig_vecs.shape[1])])
        eig_vecs = eig_vecs * signs[np.newaxis, :]

        # Order the axes by absolute eigenvalue, largest first.
        order = np.argsort(np.abs(eig_vals))[::-1]
        eig_vals_sorted = np.abs(eig_vals)[order]
        eig_vecs_sorted = eig_vecs[:, order].T

        self.components = eig_vecs_sorted[:self.n_components,:]

        # Explained variance ratio
        total_variance = np.sum(eig_vals)
        self.explained_variance_ratio = [v/total_variance for v in eig_vals_sorted[:self.n_components]]

        self.cum_explained_variance = np.cumsum(self.explained_variance_ratio)

        return self

    def transform(self, X):
        """Standardise ``X`` with the fitted statistics and project it onto the kept axes."""
        X = X.copy()
        X_std = (X - self.mean) / self.scale
        X_proj = X_std.dot(self.components.T)

        return X_proj
    

# ---------------------------------------------------------
# Fit a 5-component PCA on the full feature matrix, report what it learned and
# project the data onto those components.
my_pca = MyPCA(n_components = 5).fit(X)

print('Components:\n', my_pca.components)
print('Explained variance ratio from scratch:\n', my_pca.explained_variance_ratio)
print('Cumulative explained variance from scratch:\n', my_pca.cum_explained_variance)

X_proj = my_pca.transform(X)
print('Transformed data shape from scratch:', X_proj.shape)


In [None]:
# Linear regression on the PCA-projected features (40 % validation split,
# learning rate 0.2, up to 200 epochs).
# NOTE(review): the next cell repeats these two statements after refitting the
# same PCA — this cell looks redundant.
X_train, X_val, y_train, y_val = train_test_split_custom(X_proj, y, test_size=0.4, random_state=5)
z_val,y_val_arr,w,b,errors_norm=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=200)


In [None]:
# Self-contained version of the PCA experiment: fit a 5-component PCA, project
# the features, split 60/40 and train the linear model (learning rate 0.2, up
# to 200 epochs).
my_pca = MyPCA(n_components = 5).fit(X)
X_proj = my_pca.transform(X)
X_train, X_val, y_train, y_val = train_test_split_custom(X_proj, y, test_size=0.4, random_state=5)
z_val,y_val_arr,w,b,errors_norm=linear_regresssion_model(X_train, y_train, X_val, y_val,learning_rate=0.2,epochs=200)


In [None]:
# Validation RMSE per epoch of the PCA-based run.
plt.plot(errors_norm)
plt.xlabel('Iterations/epochs')
plt.ylabel('RMSE')

# Title corrected: the run that produced errors_norm used epochs=200 and
# learning_rate=0.2 (the old title claimed 100 and 0.01).
plt.title('epochs :- 200  learning_rate 0.2' )
plt.show()

##### Let's keep up to 5 PCA features and do the polynomial analysis on them

In [None]:
# Train the linear model on 1 to 9 principal components and overlay the
# per-epoch validation RMSE curves. errors_list collects the last RMSE of each
# run (it was previously created but never filled).
errors_list = []
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'cyan']

for i in range(1, 10):
    my_pca = MyPCA(n_components=i).fit(X)
    X_proj = my_pca.transform(X)
    X_train, X_val, y_train, y_val = train_test_split_custom(X_proj, y, test_size=0.4, random_state=5)
    z_val, y_val_arr, w, b, errors_norm = linear_regresssion_model(X_train, y_train, X_val, y_val, learning_rate=0.2, epochs=200)
    errors_list.append(errors_norm[-1])
    plt.plot(errors_norm, label=f'n_components={i}', color=colors[i-1])

# Label the figure so it can be read on its own.
plt.xlabel('Iterations/epochs')
plt.ylabel('RMSE')
plt.title('Validation RMSE by number of PCA components')
plt.legend()
plt.show()


In [None]:
class SVR:
    """Linear regressor with an epsilon-insensitive loss, trained by batch sub-gradient descent.

    Residuals whose absolute value is at most ``epsilon`` contribute neither
    loss nor gradient. After ``fit`` the learned parameters are available as
    ``W`` (one weight per feature) and ``intercept_`` (array of length 1).
    """

    def __init__(self, epsilon=0.5):
        """Store the half-width ``epsilon`` of the insensitive tube."""
        self.epsilon = epsilon
        self.W = None
        self.intercept_ = None

    @staticmethod
    def _as_2d(X):
        """Return ``X`` as an array with one row per sample; 1-D input becomes one column."""
        X = np.asarray(X)
        return X.reshape(-1, 1) if X.ndim == 1 else X

    def _compute_loss(self, X, y):
        """Return ``||W|| / 2`` plus the mean epsilon-insensitive error.

        Diagnostic only: ``fit`` follows the gradient of the error term alone
        and does not include the norm term in its updates.
        """
        X = self._as_2d(X)
        y_pred = np.dot(X, self.W) + self.intercept_
        error = np.maximum(0, np.abs(y_pred - y) - self.epsilon)
        loss = np.linalg.norm(self.W) / 2 + np.mean(error)
        return loss

    def fit(self, X, y, epochs=100, learning_rate=0.01):
        """Train on ``X`` (samples in rows, or a 1-D single feature) and targets ``y``.

        Weights and intercept are initialised from NumPy's global
        standard-normal generator, then updated for ``epochs`` full-batch
        steps of size ``learning_rate``. Returns ``self``.

        Raises
        ------
        ValueError
            If ``X`` contains no samples.
        """
        # predict() already accepted 1-D input; fit() now does as well
        # instead of failing inside np.dot.
        X = self._as_2d(X)
        y = np.asarray(y).reshape(-1)
        n_samples, feature_len = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit SVR on an empty data set")

        self.W = np.random.randn(feature_len)
        self.intercept_ = np.random.randn(1)

        for _ in range(epochs):
            error = np.dot(X, self.W) + self.intercept_ - y

            # Sub-gradient of the epsilon-insensitive loss: sign of the
            # residual outside the tube, zero inside it. Computed once and
            # shared by both parameter updates.
            active = np.where(np.abs(error) > self.epsilon, np.sign(error), 0)

            # Update weights
            self.W = self.W - learning_rate * np.dot(X.T, active) / n_samples
            self.intercept_ = self.intercept_ - learning_rate * np.sum(active) / n_samples

        return self

    def predict(self, X):
        """Return ``X . W + intercept_`` for every row of ``X`` (1-D input is one feature)."""
        X = self._as_2d(X)
        y_pred = np.dot(X, self.W) + self.intercept_
        return y_pred

In [None]:
def train_test_split_custom_with_array(X, y, test_size=0.2, random_state=None):
    """Shuffle-split pandas inputs and return the four parts as NumPy arrays.

    ``X`` and ``y`` are indexed positionally through ``.iloc``. When
    ``random_state`` is given it seeds NumPy's global generator first. The
    first ``int((1 - test_size) * len(X))`` shuffled rows form the train part,
    the rest the test part. Returns ``(X_train, X_test, y_train, y_test)``
    taken from each selection's ``.values``.
    """
    # Seed the global generator only when a seed was supplied.
    if random_state is not None:
        np.random.seed(random_state)

    total = len(X)
    order = np.arange(total)
    np.random.shuffle(order)

    cut = int((1 - test_size) * total)
    train_rows = order[:cut]
    test_rows = order[cut:]

    X_train = X.iloc[train_rows].values
    X_test = X.iloc[test_rows].values
    y_train = y.iloc[train_rows].values
    y_test = y.iloc[test_rows].values
    return X_train, X_test, y_train, y_test


In [None]:
def compute_rmse(y_test,y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_test : array-like
        True values.
    y_pred : array-like
        Predicted values; must contain as many elements as ``y_test``.
        Both inputs are flattened, so an ``(n, 1)`` column and an ``(n,)``
        vector compare element by element.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_test) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    actual = np.asarray(y_test, dtype=float).reshape(-1)
    predicted = np.asarray(y_pred, dtype=float).reshape(-1)
    if actual.size == 0:
        raise ValueError("compute_rmse needs at least one sample")
    if actual.shape != predicted.shape:
        raise ValueError("y_test and y_pred must have the same number of elements")
    # Vectorised replacement for the former per-element Python loop.
    return np.sqrt(np.mean((predicted - actual) ** 2))

In [None]:
# 80/20 split returned as NumPy arrays for the SVR experiments (shuffle seed 24).
X_train,X_test,y_train,y_test=train_test_split_custom_with_array(X, y, test_size=0.2, random_state=24)

In [None]:
# Baseline SVR: epsilon 0.5, 100 epochs, learning rate 0.01; report test RMSE.
# `model` is rebound here; the earlier cell used the same name for the
# scikit-learn estimator.
model=SVR(epsilon=0.5)
model.fit(X_train,y_train,epochs=100,learning_rate=0.01)
y_pred=model.predict(X_test)
print(compute_rmse(y_test,y_pred))

In [None]:
# Epsilon sweep: train one SVR per epsilon from 0.01 to 0.80 (step 0.01) with
# 100 epochs and learning rate 0.01, recording the test RMSE of each.
# NOTE(review): every model starts from fresh random weights, so the curve also
# reflects initialisation noise.
error_epsilon = []
epsilon_values = []
d_epsilon = 0.01
while d_epsilon <= 0.8:
    model = SVR(epsilon=d_epsilon)
    model.fit(X_train, y_train, epochs=100, learning_rate=0.01)
    y_pred = model.predict(X_test)
    error = compute_rmse(y_test, y_pred)
    error_epsilon.append(error)
    epsilon_values.append(d_epsilon)
    d_epsilon = round(d_epsilon + 0.01, 2)  # Increment by 0.01 and round to 2 decimal places

# Plotting
plt.plot(epsilon_values, error_epsilon, marker='o', linestyle='-')
plt.title('RMSE vs. Epsilon')
plt.xlabel('Epsilon Value')
plt.ylabel('RMSE')
plt.grid(True)
plt.show()


In [None]:
# Learning-rate sweep: train one SVR (epsilon 0.5, 100 epochs) per learning
# rate from 0.01 to 0.70 (step 0.01) and record the test RMSE of each.
error_lr = []
learning_rates = []
d_lr = 0.01
while d_lr <= 0.7:
    model = SVR(epsilon=0.5)
    model.fit(X_train, y_train, epochs=100, learning_rate=d_lr)
    y_pred = model.predict(X_test)
    error = compute_rmse(y_test, y_pred)
    error_lr.append(error)
    learning_rates.append(d_lr)
    d_lr = round(d_lr + 0.01, 2)  # rounding keeps the step from accumulating float error

plt.plot(learning_rates, error_lr, marker='o', linestyle='-')
plt.title('RMSE vs. Learning Rate')
plt.xlabel('Learning Rate')
plt.ylabel('RMSE')
plt.grid(True)
plt.show()




In [None]:
# Epoch sweep: train one SVR (epsilon 0.5, learning rate 0.01) for 50, 100, ...,
# 500 epochs and record the test RMSE of each. Every run restarts from new
# random weights rather than continuing the previous one.
error_epoch = []
epochs_values = []
d_epoch = 50
while d_epoch <= 500:
    model = SVR(epsilon=0.5)
    model.fit(X_train, y_train, epochs=d_epoch, learning_rate=0.01)
    y_pred = model.predict(X_test)
    error = compute_rmse(y_test, y_pred)
    error_epoch.append(error)
    epochs_values.append(d_epoch)
    d_epoch += 50  # Increment by 50 for the next iteration

# Plotting
plt.plot(epochs_values, error_epoch, marker='o', linestyle='-')
plt.title('RMSE vs. Number of Epochs')
plt.xlabel('Number of Epochs')
plt.ylabel('RMSE')
plt.grid(True)
plt.show()




##### with pca 

In [None]:
# SVR on PCA features: for 1 to 10 components, project the data, split 60/40
# and train an SVR (epsilon 0.5, 250 epochs, learning rate 0.20); collect the
# validation RMSE per component count.
# The module-level name `rmse` keeps the value of the last iteration.
rmse_list = []

for n_features in range(1, 11):
    my_pca = MyPCA(n_components=n_features).fit(X)
    X_proj = my_pca.transform(X)
    X_train, X_val, y_train, y_val = train_test_split_custom_with_array(X_proj, y, test_size=0.4, random_state=5)
    model = SVR(epsilon=0.5)
    model.fit(X_train, y_train, epochs=250, learning_rate=0.20)
    y_pred = model.predict(X_val)
    rmse = compute_rmse(y_val, y_pred)
    rmse_list.append(rmse)

print(rmse_list)


In [None]:
# x-axis values matching the ten entries of rmse_list; this rebinds n_features
# from the loop counter of the previous cell to a list.
n_features = list(range(1, 11))

# Plot RMSE values
plt.plot(n_features, rmse_list, marker='o', linestyle='-')
plt.xlabel('Number of Features')
plt.ylabel('RMSE')
plt.title('RMSE vs Number of Features')
plt.grid(True)
plt.show()

In [None]:
class Node():
    """A single node of the regression tree.

    A decision node carries ``feature_index``, ``threshold``, the ``left`` and
    ``right`` children and the variance reduction ``var_red`` of its split.
    A leaf carries only ``value``. Every attribute defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        """Store the given fields unchanged on the instance."""
        # leaf payload
        self.value = value

        # split description of a decision node
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.var_red = var_red

In [None]:
class DecisionTreeRegressor():
    """Regression tree that picks splits by maximum variance reduction.

    Every distinct value of every feature is tried as a threshold
    (``feature <= threshold`` goes left). Leaves predict the mean target of
    the training rows that reached them.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        ''' Store the stopping conditions; the tree itself is built by fit(). '''

        # initialize the root of the tree
        self.root = None

        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.column_names = None  # To store column names

    def build_tree(self, dataset, curr_depth=0):
        ''' Recursively build the (sub)tree for `dataset`, whose last column is the target. '''

        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        # split until stopping conditions are met
        # (the root is depth 0 and the comparison is inclusive, so nodes at
        # depth == max_depth may still split)
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # get_best_split returns an empty dict when no threshold separates
            # the rows (e.g. all feature values identical); treat that as
            # "no gain" instead of raising KeyError.
            if best_split.get("var_red", 0)>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["var_red"])

        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' Return a dict describing the split with the largest variance reduction.

        The dict has the keys feature_index, threshold, dataset_left,
        dataset_right and var_red; it is empty when every candidate threshold
        leaves one side without rows.
        '''

        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        y = dataset[:, -1]
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    left_y, right_y = dataset_left[:, -1], dataset_right[:, -1]
                    # compute variance reduction of this candidate
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red

        # return best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' Partition the rows into (feature <= threshold, feature > threshold). '''

        # Boolean masks replace the former row-by-row Python loop. Two
        # explicit comparisons are used so rows with NaN in this feature still
        # land in neither part.
        column = dataset[:, feature_index]
        dataset_left = dataset[column<=threshold]
        dataset_right = dataset[column>threshold]
        return dataset_left, dataset_right

    def variance_reduction(self, parent, l_child, r_child):
        ''' Parent variance minus the size-weighted variance of the two children. '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction

    def calculate_leaf_value(self, Y):
        ''' Prediction of a leaf: the mean of the targets that reached it. '''

        val = np.mean(Y)
        return val

    def print_tree(self, columns, tree=None, indent=" "):
        ''' Print the tree; `columns` maps feature indices to display names.

        Raises ValueError when called before fit().
        '''

        if tree is None:
            tree = self.root
        if tree is None:
            raise ValueError("the tree has not been fitted yet")

        if tree.value is not None:
            print(tree.value)

        else:
            print(columns[tree.feature_index], "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(columns, tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(columns, tree.right, indent + indent)

    def fit(self, X, Y, column_names=None):
        ''' Train the tree on features X (n, d) and targets Y ((n, 1) or (n,)).

        `column_names` is optional; when omitted and X is a DataFrame its
        column labels are stored. fit() used to read a module-level variable
        named `columns`, which raised NameError unless a cell happened to
        define it first.
        '''

        if column_names is None and hasattr(X, "columns"):
            column_names = list(X.columns)

        X = np.asarray(X)
        # accept a flat target vector as well as a column vector
        Y = np.asarray(Y).reshape(len(X), -1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        self.column_names = column_names

    def make_prediction(self, x, tree):
        ''' Walk from `tree` down to a leaf for the single sample `x`. '''

        if tree.value is not None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

    def predict(self, X):
        ''' Return a list with one prediction per row of X. '''

        # np.asarray makes row iteration work for DataFrames too (iterating a
        # DataFrame directly yields its column labels).
        predictions = [self.make_prediction(x, self.root) for x in np.asarray(X)]
        return predictions


In [None]:
# 80/20 array split for the decision-tree experiments (shuffle seed 24).
X_train,X_test,y_train,y_test=train_test_split_custom_with_array(X, y, test_size=0.2, random_state=24)

In [None]:
# Show only the first rows; displaying the whole training array floods the output.
X_train[:5]

In [None]:
# Baseline tree: min_samples_split=3, max_depth=3 on the 80/20 split.
X_train,X_test,y_train,y_test=train_test_split_custom_with_array(X, y, test_size=0.2, random_state=24)
regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
# fit() concatenates X and Y column-wise, so the targets must be a column vector.
y_train_reshaped =y_train.reshape(-1,1)
# `columns` must exist at module level before fit(): the method body reads it.
columns=list(X.columns)

regressor.fit(X_train,y_train_reshaped)
regressor.print_tree(columns)

In [None]:
# Test RMSE of the baseline tree.
y_pred=regressor.predict(X_test)
compute_rmse(y_test,y_pred)

In [None]:
# Sweep min_samples_split from 2 to 9 at max_depth=3 and record the test error.
# Despite its name, errors_mse stores RMSE values (see compute_rmse).
errors_mse=[]
X_train,X_test,y_train,y_test=train_test_split_custom_with_array(X, y, test_size=0.2, random_state=24)
for i in range(2,10):
    regressor = DecisionTreeRegressor(min_samples_split=i, max_depth=3)
    y_train_reshaped =y_train.reshape(-1,1)
    columns=list(X.columns)
    regressor.fit(X_train,y_train_reshaped)
    regressor.print_tree(columns)
    y_pred=regressor.predict(X_test)
    errors_mse.append(compute_rmse(y_test,y_pred))
    print(i)


In [None]:
# Plotting the error_mse
plt.plot(range(2, 10), errors_mse, marker='o')
plt.xlabel('min_samples_split')
plt.ylabel('RMSE')
plt.title('RMSE vs min_samples_split')
plt.grid(True)
plt.show()

In [None]:
# Sweep max_depth from 2 to 6 at min_samples_split=7 and record the test error.
# errors_mse is reset here and again stores RMSE values.
errors_mse=[]
X_train,X_test,y_train,y_test=train_test_split_custom_with_array(X, y, test_size=0.2, random_state=24)
for i in range(2,7):
    regressor = DecisionTreeRegressor(min_samples_split=7, max_depth=i)
    y_train_reshaped =y_train.reshape(-1,1)
    columns=list(X.columns)
    regressor.fit(X_train,y_train_reshaped)
    regressor.print_tree(columns)
    y_pred=regressor.predict(X_test)
    errors_mse.append(compute_rmse(y_test,y_pred))
    print(i)

In [None]:
# Plotting the error_mse
plt.plot(range(2, 7), errors_mse, marker='o')
plt.xlabel('depth of tree')
plt.ylabel('RMSE')
plt.title('RMSE vs depth of tree')
plt.grid(True)
plt.show()

In [None]:
# Decision tree on PCA features: for 1 to 10 components, project the data,
# split 80/20, fit a tree (min_samples_split=3, max_depth=3) and collect the
# test RMSE per component count.
rmse_list = []

for n_features in range(1, 11):
    my_pca = MyPCA(n_components=n_features).fit(X)
    X_proj = my_pca.transform(X)
    X_train,X_test,y_train,y_test=train_test_split_custom_with_array(X_proj, y, test_size=0.2, random_state=24)
    regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
    y_train_reshaped =y_train.reshape(-1,1)
    # The tree is trained on principal components, so label the printed splits
    # by component instead of by the original column names.
    columns=[f'PC{k + 1}' for k in range(X_train.shape[1])]
    regressor.fit(X_train,y_train_reshaped)
    regressor.print_tree(columns)
    y_pred=regressor.predict(X_test)
    # Bug fix: this loop used to append the stale `rmse` left over from the SVR
    # cell, so rmse_list held the same unrelated value ten times while the
    # tree's own error went into errors_mse.
    rmse = compute_rmse(y_test,y_pred)
    rmse_list.append(rmse)

print(rmse_list)

In [None]:
# x-axis values matching the ten entries of rmse_list; this rebinds n_features
# from the loop counter of the previous cell to a list.
n_features = list(range(1, 11))

# Plot RMSE values
plt.plot(n_features, rmse_list, marker='o', linestyle='-')
plt.xlabel('Number of Features')
plt.ylabel('RMSE')
plt.title('RMSE vs Number of Features with PCA')
plt.grid(True)
plt.show()
