# Download datasets for the tasks

In [23]:
from IPython.display import clear_output

In [24]:
! pip install gdown

clear_output()

In [25]:
! mkdir dataset1 dataset2 dataset3

mkdir: cannot create directory ‘dataset1’: File exists
mkdir: cannot create directory ‘dataset2’: File exists
mkdir: cannot create directory ‘dataset3’: File exists


In [26]:
# Download Dataset 1

! gdown 1By83K1gVOS6GUoHN2h8pZZjCqitLTsQN
! gdown 1mBWsE9ewIV7rHQ8Dk6LgkGMT32JnQAZC
! gdown 1jhXQkY_J4263gXnFyLXs9hxw5uDp0cNL
! gdown 11YIfbn-ygv-bKl12xU6eT5xGI5AcTJhi

! mv /content/Test-20.csv /content/dataset1/Test-20.csv
! mv /content/Train-20-Sample-1.csv /content/dataset1/Train-20-Sample-1.csv
! mv /content/Train-20-Sample-2.csv /content/dataset1/Train-20-Sample-2.csv
! mv /content/Val-20.csv /content/dataset1/Val-20.csv

clear_output()

In [27]:
# Download Dataset 2

! gdown 154ZWvPbKwPJPWV8QO2mOAHBtxAFZxVy7
! gdown 17n1hG0ifJFuRY12XUCdsfqN6HBiH3YXD
! gdown 1LwyB0aCKu3WPAa_ztk3fE7afooUDmBAB
! gdown 1XDB7wb3W6zI7JadIAkFO6Y7-2OXxGa11

! mv /content/test_20.csv /content/dataset2/test_20.csv
! mv /content/train200_20.csv /content/dataset2/train200_20.csv
! mv /content/train50_20.csv /content/dataset2/train50_20.csv
! mv /content/val_20.csv /content/dataset2/val_20.csv

clear_output()

In [28]:
# Download Dataset 3

! gdown 1-V7GeVgTvGf1hwQUnPnpgXI6I0SmjO1L
! gdown 1HIJqAlIZqLuvuu2MUuLOF2_S-glCwqx1
! gdown 117YVhUEJgzkS3b4NRHeoMwhuKn6wsAjk
! gdown 15WV-ccQYy-tnpSERYPEyquS85vew4u9s
! gdown 1W_NKsoyhSpVwWPquPu-rMBaqz-F6c4vU
! gdown 10bWojbbstJ1EPguMacZGg4ZmWBNAO6wG

! mv /content/train_data.csv /content/dataset3/train_data.csv
! mv /content/train_label.csv /content/dataset3/train_label.csv
! mv /content/val_data.csv /content/dataset3/val_data.csv
! mv /content/val_label.csv /content/dataset3/val_label.csv
! mv /content/test_data.csv /content/dataset3/test_data.csv
! mv /content/test_label.csv /content/dataset3/test_label.csv

clear_output()

# Imports



> Importing numpy and pandas for data handling

> Importing matplotlib and mpl_toolkits for plotting the observations

> Importing random for random initialization of cluster centers for Gaussian basis

In [29]:
# Import necessary libraries
import random  # for generating pseudo-random numbers
import pandas as pd  # for data manipulation using DataFrames
import numpy as np  # for numerical operations
import math  # for mathematical functions

from sklearn.preprocessing import PolynomialFeatures  # for creating polynomial features
from sklearn.metrics import mean_squared_error  # for calculating mean squared error
from sklearn.model_selection import train_test_split  # for splitting the dataset
import matplotlib.pyplot as plt  # for creating visualizations
from mpl_toolkits.mplot3d import Axes3D  # for 3D plots
from matplotlib.patches import Patch  # for custom legends
import warnings  # for handling warnings

In [30]:
# Import necessary libraries
import random  # for generating pseudo-random numbers
import pandas as pd  # for data manipulation using DataFrames
import numpy as np  # for numerical operations
import math  # for mathematical functions

from sklearn.model_selection import train_test_split  # for splitting the dataset
import matplotlib.pyplot as plt  # for creating visualizations
from mpl_toolkits.mplot3d import Axes3D  # for 3D plots
from matplotlib.patches import Patch  # for custom legends
import warnings  # for handling warnings

In [31]:
# Notebook-wide verbosity switch: when True, the fit / fit_cv / k-means
# routines of the regression classes print extra diagnostic information.
debug = False

# Classes for Polynomial and Gaussian Basis



The class `RegressionHelper`
> Common functions are defined here.

1.   Plotting functions
2.   Getting the RMS Error
3.   Finding the best fit parameters

> The RegressionHelper class provides functionality for polynomial regression, including obtaining optimal model parameters, calculating root mean square (RMS) errors, and generating various plots for analysis.

> Key methods include `get_best_fit_parameters` for obtaining optimal regression parameters, `calculate_rms_error` for RMS error calculation, and several plot generation methods like `plot_polynomial_with_training_points`, `plot_features_3d`, and `plot_scatter`.

> The class incorporates regularization, handles polynomial feature transformations, and allows visualization of the fitted polynomial surface and scatter plots for model evaluation.



In [32]:
class RegressionHelper():
    """
    Shared helpers for the linear-in-parameters regression models.

    Bundles the closed-form regularized least-squares solver, the RMS error
    metric, and the matplotlib routines that save diagnostic plots as JPEG
    files in the current working directory.
    """

    def get_best_fit_parameters(self, design_matrix, y, lambda_):
        """
        Calculate the best fit parameters for regularized linear regression.

        Parameters:
        - design_matrix: The design matrix (phi) representing the input features.
        - y: Target values of the data.
        - lambda_: Regularization parameter.

        Returns:
        - w*: The best fit parameters, i.e. the solution of
          (phi.t * phi + lambda * I) * w = phi.t * y

        Raises:
        - numpy.linalg.LinAlgError: if (phi.t * phi + lambda * I) is singular.
        """
        design_matrix_transpose = np.transpose(design_matrix)
        D = design_matrix.shape[1]
        regularized_gram = np.matmul(design_matrix_transpose, design_matrix) + lambda_ * np.identity(D)

        # Solve the normal equations directly instead of forming the explicit
        # inverse: same solution, but cheaper and numerically more accurate.
        return np.linalg.solve(regularized_gram, np.matmul(design_matrix_transpose, y))


    def calculate_rms_error(self, design_matrix, y, w):
        """
        Calculate the Root Mean Squared (RMS) error for the given model.

        Computed with NumPy only, so it does not depend on the `squared`
        keyword of sklearn's `mean_squared_error`.

        Parameters:
        - design_matrix: The design matrix representing the input features.
        - y: Target values of the data, shape (n,) or (n, k).
        - w: Model parameters.

        Returns:
        - RMS error as a float. With several output columns the RMS error is
          computed per column and then averaged.

        Raises:
        - ValueError: if targets and predictions do not have matching shapes.
        """
        y_true = np.asarray(y, dtype=float)
        y_pred = np.asarray(np.matmul(design_matrix, w), dtype=float)

        # Treat 1-D arrays as a single output column so that (n,) and (n, 1)
        # inputs are compared element-wise instead of broadcasting to (n, n).
        if y_true.ndim == 1:
            y_true = y_true.reshape(-1, 1)
        if y_pred.ndim == 1:
            y_pred = y_pred.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            raise ValueError(f"Shape mismatch between targets {y_true.shape} and predictions {y_pred.shape}")

        per_output_rms = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
        return float(np.mean(per_output_rms))

    ### Methods for generating graphs

    def plot_polynomial_with_training_points(self, coefficients, degree, X_train, y_train, title, t_rms, v_rms, coefficients_without_reg=None):
        """
        Plot a polynomial curve along with training points and save it as a JPEG.

        Parameters:
        - coefficients: Coefficients of the polynomial (index i multiplies x^i).
        - degree: Degree of the polynomial.
        - X_train: Training feature values.
        - y_train: Training target values.
        - title: Title of the plot (also used in the output file name).
        - t_rms: Train RMS error.
        - v_rms: Validation RMS error.
        - coefficients_without_reg: Coefficients of the polynomial without regularization (optional).
        """

        def calculate_polynomial_value(x, degree, coefficients):
            """Evaluate sum_i coefficients[i] * x^i for i = 0..degree."""
            return sum(coefficients[i] * (x ** i) for i in range(degree + 1))

        X_min, X_max = np.min(X_train), np.max(X_train)
        y_min, y_max = np.min(y_train), np.max(y_train)
        has_unregularized = coefficients_without_reg is not None

        X = []
        y = []
        y0 = []
        for x in np.linspace(round(X_min) - 1, round(X_max) + 1, 500):
            y_ = calculate_polynomial_value(x, degree, coefficients)
            # Drop curve points more than 100 units outside the range of the
            # training targets so a diverging polynomial does not flatten the plot.
            if y_min - 100 <= y_ <= y_max + 100:
                X.append(x)
                y.append(y_)
                y0.append(calculate_polynomial_value(x, degree, coefficients_without_reg) if has_unregularized else 0)

        plt.figure(figsize=(10, 6))
        plt.plot(X, y, color="blue", label="Approximated polynomial")
        if has_unregularized:
            plt.plot(X, y0, color="green", label="Approximated polynomial (without regularisation)")
        plt.scatter(X_train, y_train, color="red", label="Training points")
        plt.legend()
        plt.title(title + f"\nTrain RMS Error = {t_rms:.5f} Validation RMS Error = {v_rms:.5f}")

        plt.autoscale()

        plt.savefig("Line Plot: " + title+".jpeg", format="jpeg")

        # The figure is only written to disk; close it to free memory.
        plt.close()

    def plot_features_3d(self, polynomial_value, X, y, title, t_rms, v_rms):
        """
        Plot the fitted surface over the two input features x1, x2 together
        with the data points, and save it as a JPEG.

        Parameters:
        - polynomial_value: Function mapping a (1, 2) input array to the model prediction.
        - X: Input features (first two columns are used).
        - y: Target values.
        - title: Title of the plot (also used in the output file name).
        - t_rms: Train RMS error.
        - v_rms: Validation RMS error.
        """

        # The helper itself never defines optimal_w (subclasses do), so read it
        # defensively instead of raising AttributeError on an unfitted object.
        if getattr(self, "optimal_w", None) is None:
            print("Please fit the model first.")
            return

        X1 = X[:, 0]
        X2 = X[:, 1]

        fig = plt.figure(figsize=(10, 6))
        ax = fig.add_subplot(111, projection='3d')

        ax.scatter(X1, X2, y, c='red', marker='o', label='Data points')

        # Create a meshgrid of x1 and x2 values spanning the data with a margin of 1
        grid_size = 100
        X1_list = np.linspace(round(np.min(X1)) - 1, round(np.max(X1)) + 1, grid_size)
        X2_list = np.linspace(round(np.min(X2)) - 1, round(np.max(X2)) + 1, grid_size)

        X1_, X2_ = np.meshgrid(X1_list, X2_list)
        feature_values = np.zeros((grid_size, grid_size))

        for i in range(grid_size):
            for j in range(grid_size):
                inp = np.reshape(np.array([X1_[i, j], X2_[i, j]]), (-1, 2))
                # The callback returns a one-element array; extract the scalar
                # explicitly rather than relying on implicit array-to-scalar conversion.
                feature_values[i, j] = np.asarray(polynomial_value(inp)).item()

        ax.plot_surface(X1_, X2_, feature_values, cmap='viridis', alpha=0.8, label="Approximated polynomial")

        ax.set_xlabel('x1')
        ax.set_ylabel('x2')
        ax.set_zlabel('y')
        plt.title(title + f"\nTrain RMS Error = {t_rms:.5f} Validation RMS Error = {v_rms:.5f}")

        # Create a legend with proxy artists
        proxy_data = plt.Line2D([0], [0], linestyle="none", marker="o", c='red')
        proxy_surface = plt.Line2D([0], [0], linestyle="-", c='blue')

        ax.legend([proxy_surface, proxy_data], ['Approximated surface', 'Data points'], loc="upper right")

        plt.autoscale()

        fig.savefig("Surface Plot: " + title+".jpeg", format="jpeg")

        # The figure is only written to disk; close it to free memory.
        plt.close()

    def _save_scatter_plot(self, y_true, y_pred, heading, file_name):
        """Draw a true-vs-predicted scatter plot with the given heading and save it to file_name (JPEG)."""
        plt.figure(figsize=(10, 6))
        plt.scatter(y_true, y_pred, alpha=0.5)
        plt.title(heading)
        plt.xlabel("True values")
        plt.ylabel("Predicted values")
        plt.grid(True)

        plt.autoscale()

        plt.savefig(file_name, format="jpeg")

        # The figure is only written to disk; close it to free memory.
        plt.close()

    def plot_scatter(self, y_true, y_pred, title, t_rms, v_rms):
        """
        Plot a scatter plot of true vs predicted values and save it as a JPEG.

        Parameters:
        - y_true: True target values.
        - y_pred: Predicted values.
        - title: Title of the plot (also used in the output file name).
        - t_rms: Train RMS error.
        - v_rms: Validation RMS error.
        """
        heading = title + f"\nTrain RMS Error = {t_rms:.5f} Validation RMS Error = {v_rms:.5f}"
        self._save_scatter_plot(y_true, y_pred, heading, "Scatter Plot: " + title + ".jpeg")


    def plot_scatter_best_model(self, y_true, y_pred, title, e_rms, split):
        """
        Plot a scatter plot for the best model on one data split and save it as a JPEG.

        Parameters:
        - y_true: True target values.
        - y_pred: Predicted values.
        - title: Title of the plot (also used in the output file name).
        - e_rms: RMS error on this split.
        - split: Name of the split (e.g., "Train", "Validation", "Test"); shown in
          the title and prefixed to the output file name.
        """
        heading = title + f"\n{split} RMS Error = {e_rms:.5f}"
        self._save_scatter_plot(y_true, y_pred, heading, split + " Scatter Plot: " + title + ".jpeg")

> The `generate_polynomial_design_matrix` function efficiently creates a polynomial design matrix for a dataset with a specified degree, utilizing a combinatorial approach.

> The recursive `calculate_polynomial_features` function generates polynomial features for a single data point, avoiding redundant computations.

The PolynomialBasisRegression
> Functions specific to Polynomial basis are defined here


1.   Generating the polynomial basis Design Matrix
2.   Checking the optimal hyperparameters from a given list
3.   Getting approximations of the inputs from our model


> The PolynomialBasisRegression class extends a RegressionHelper class and is designed for polynomial regression tasks.

> It includes methods to generate polynomial design matrices, fit models with varying degrees and regularization parameters, perform cross-validation, and visualize the results through scatter plots.

> The class allows for training, validation, and testing of polynomial regression models with flexibility in hyperparameter tuning

In [33]:
class PolynomialBasisRegression(RegressionHelper):
    """
    Regularized least-squares regression on a polynomial basis.

    Holds the train / validation / test splits, builds polynomial design
    matrices, selects the degree and regularization parameter on the
    validation split, and saves diagnostic plots through RegressionHelper.
    """

    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """
        Initialize PolynomialBasisRegression with training, validation, and test datasets.
        X_train: Training input data with shape (N, d), where N is the number of samples and d is the number of features.
        y_train: Target output of the training data.
        X_val: Validation input data.
        y_val: Target output of the validation data.
        X_test: Test input data.
        y_test: Target output of the test data.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

        # State of the currently selected model; filled in by fit / fit_cv
        self.X_train_design_matrix = None
        self.X_val_design_matrix = None
        self.X_test_design_matrix = None
        self.lambda_ = None
        self.m = None
        self.optimal_w = None
        self.optimal_w0 = None  # parameters of the same degree fitted with lambda = 0


    def generate_polynomial_design_matrix(self, X, m):
        """
        Generate a polynomial design matrix for input data X up to a specified degree m.
        X: Input data with shape (N, d).
        m: Degree of the polynomial.
        Returns the design matrix with shape (N, D), where D = (m + d)! / (m! * d!) is the
        number of monomials of total degree <= m in d variables.
        """
        n, d = X.shape
        # Exact integer arithmetic: float division of factorials loses
        # precision (and eventually overflows) for larger m + d.
        D = math.factorial(m + d) // (math.factorial(m) * math.factorial(d))

        polynomial_design_matrix = np.ones((n, D))
        for i in range(n):
            polynomial_design_matrix[i, :] = self.calculate_polynomial_features(X[i, :].tolist(), m)

        return polynomial_design_matrix


    def calculate_polynomial_features(self, x, m):
        """
        Recursively calculate polynomial features for a single data point up to a specified degree.
        x: Input data point (list of feature values).
        m: Maximum total degree of the monomials.
        Returns a list with the value of every monomial of total degree <= m, ordered by
        increasing power of x[0] and, within that, by the recursive ordering of x[1:].
        """
        # Base case: one variable left -> its powers 0..m
        if len(x) == 1:
            return [x[0] ** power for power in range(m + 1)]

        features = []
        for power in range(m + 1):
            leading = x[0] ** power
            # The remaining variables may only use the degree budget that is left
            features.extend(leading * rest for rest in self.calculate_polynomial_features(x[1:], m - power))

        return features


    def fit(self, m, lambda_=0):
        """
        Fit the polynomial regression model and save a train scatter plot.
        m: Degree of the polynomial.
        lambda_: Regularization parameter (default is 0 for no regularization).
        """

        self.lambda_ = lambda_
        self.m = m
        self.X_train_design_matrix = self.generate_polynomial_design_matrix(self.X_train, self.m)
        self.optimal_w = self.get_best_fit_parameters(self.X_train_design_matrix, self.y_train, self.lambda_)

        if(debug):
            print("Design matrix shape: ", self.X_train_design_matrix.shape)
            print("estimated paramters: ")
            print(self.optimal_w)

        train_error = self.calculate_rms_error(self.X_train_design_matrix, self.y_train, self.optimal_w)

        # Store the validation matrix under the attribute generate_scatter_plots
        # reads; val_design_matrix is kept as an alias for existing callers.
        self.X_val_design_matrix = self.generate_polynomial_design_matrix(self.X_val, self.m)
        self.val_design_matrix = self.X_val_design_matrix
        # A test matrix left over from an earlier fit_cv may belong to another
        # degree; drop it so generate_scatter_plots rebuilds it for this model.
        self.X_test_design_matrix = None
        val_error = self.calculate_rms_error(self.X_val_design_matrix, self.y_val, self.optimal_w)

        title = f"Degree = {self.m}, Regularization Parameter = {self.lambda_}"
        self.plot_scatter(self.y_train, self.get_approximation(self.X_train_design_matrix, self.optimal_w), title, train_error, val_error)


    def fit_cv(self, m_list, lambda_list, plot=True):
        """
        Grid-search the degree and regularization parameter, keeping the combination
        with the lowest RMS error on the validation split (hold-out validation).
        Every combination's train / validation / test RMS error is written to a CSV file.
        m_list: List of degrees of polynomials to test.
        lambda_list: List of regularization parameters to test.
        plot: If True, plot results for each combination.
        Returns the best degree and regularization parameter.
        """

        best_model_error : float = float("inf")
        values_dict = {
            "Degree" : [],
            "Regularization Parameter" : [],
            "Training RMS Error" : [],
            "Validation RMS Error" : [],
            "Testing RMS Error" : [],
        }

        d = self.X_train.shape[1]

        for m in m_list:
            # The design matrices depend on the degree only, so build them once
            # per degree instead of once per (degree, lambda) pair.
            train_design_matrix = self.generate_polynomial_design_matrix(self.X_train, m)
            val_design_matrix = self.generate_polynomial_design_matrix(self.X_val, m)
            test_design_matrix = self.generate_polynomial_design_matrix(self.X_test, m)
            optimal_w0 = None  # unregularized fit for this degree, computed on first use

            for lambda_ in lambda_list:
                values_dict["Degree"].append(m)
                values_dict["Regularization Parameter"].append(lambda_)

                optimal_w = self.get_best_fit_parameters(train_design_matrix, self.y_train, lambda_)
                if optimal_w0 is None:
                    optimal_w0 = self.get_best_fit_parameters(train_design_matrix, self.y_train, 0)

                train_error = self.calculate_rms_error(train_design_matrix, self.y_train, optimal_w)
                values_dict["Training RMS Error"].append(train_error)

                # Find validation error and find the best model
                val_error = self.calculate_rms_error(val_design_matrix, self.y_val, optimal_w)
                values_dict["Validation RMS Error"].append(val_error)

                test_error = self.calculate_rms_error(test_design_matrix, self.y_test, optimal_w)
                values_dict["Testing RMS Error"].append(test_error)

                if(val_error < best_model_error):
                    best_model_error = val_error
                    self.X_train_design_matrix = train_design_matrix
                    self.X_val_design_matrix = val_design_matrix
                    self.X_test_design_matrix = test_design_matrix
                    self.optimal_w = optimal_w
                    self.optimal_w0 = optimal_w0
                    self.lambda_ = lambda_
                    self.m = m

                if(plot):
                    title = f"Training Data Size = {self.X_train.shape[0]}, Degree = {m}, Regularization Parameter = {lambda_}"

                    if(d == 1):
                        self.plot_polynomial_with_training_points(optimal_w.tolist(), m, self.X_train, self.y_train, title, train_error, val_error, optimal_w0.tolist())
                    elif(d == 2):
                        # Bind the current degree and weights explicitly so the
                        # callback does not depend on the loop variables.
                        def polynomial_value(x, m=m, optimal_w=optimal_w):
                            design_matrix = self.generate_polynomial_design_matrix(x, m)
                            return np.matmul(design_matrix, optimal_w)

                        self.plot_features_3d(polynomial_value, self.X_train, self.y_train, title, train_error, val_error)
                        self.plot_scatter(self.y_train, self.get_approximation(train_design_matrix, optimal_w), title, train_error, val_error)
                    else:
                        self.plot_scatter(self.y_train, self.get_approximation(train_design_matrix, optimal_w), title, train_error, val_error)

        # NOTE(review): GaussianBasisRegression.fit_cv writes to the same file
        # name, so running both for equally sized training sets overwrites this CSV.
        csv_name = f"Train_{self.X_train.shape[0]}_HyperParameters_vs_ERMS.csv"
        values_df = pd.DataFrame(values_dict)
        values_df.to_csv(csv_name, index=False)

        if(debug):
            print("Best m: ", self.m, " best lambda: ", self.lambda_)
        return self.m, self.lambda_


    def generate_scatter_plots(self):
        """
        Generate scatter plots for the training, validation, and test datasets
        using the currently selected model (set by fit or fit_cv).
        """
        title = f"Training Data Size = {self.X_train.shape[0]}, Degree = {self.m}, Regularization Parameter = {self.lambda_}"
        self.plot_scatter_best_model(self.y_train, self.get_approximation(self.X_train_design_matrix, self.optimal_w), title, self.calculate_rms_error(self.X_train_design_matrix, self.y_train, self.optimal_w), "Train")

        # Validation and test matrices are built lazily when they are missing
        if(self.X_val_design_matrix is None):
            self.X_val_design_matrix = self.generate_polynomial_design_matrix(self.X_val, self.m)
        self.plot_scatter_best_model(self.y_val, self.get_approximation(self.X_val_design_matrix, self.optimal_w), title, self.calculate_rms_error(self.X_val_design_matrix, self.y_val, self.optimal_w), "Validation")

        if(self.X_test_design_matrix is None):
            self.X_test_design_matrix = self.generate_polynomial_design_matrix(self.X_test, self.m)
        self.plot_scatter_best_model(self.y_test, self.get_approximation(self.X_test_design_matrix, self.optimal_w), title, self.calculate_rms_error(self.X_test_design_matrix, self.y_test, self.optimal_w), "Test")


    def get_approximation(self, design_matrix, w):
        """
        Calculate the approximation using the learned parameters and design matrix.
        design_matrix: Design matrix of input data.
        w: Model parameters.
        Returns the predicted values (design_matrix @ w).
        """
        return np.matmul(design_matrix, w)

The GaussianBasisRegression
> Functions specific to Gaussian basis are defined here


1.   Generating the gaussian basis Design Matrix
2.   Checking the optimal hyperparameters from a given list
3.   Defining the K-Means clustering methods

> The GaussianBasisRegression class extends the RegressionHelper and focuses on regression tasks using Gaussian basis functions.

> It provides functionalities for k-means clustering to determine cluster means, generating Gaussian design matrices, and fitting models with varying numbers of clusters, covariance values, and regularization parameters.

> The class supports cross-validation for selecting optimal hyperparameters, and it includes methods for visualizing scatter plots of the training, validation, and testing data.


In [34]:
class GaussianBasisRegression(RegressionHelper):
    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """
        Initialize GaussianBasisRegression with training, validation, and test datasets.
        X_train: Training input data with shape (N, d), where N is the number of samples and d is the number of features.
        y_train: Target output of the training data.
        X_val: Validation input data.
        y_val: Target output of the validation data.
        X_test: Test input data.
        y_test: Target output of the test data.
        """

        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

        self.X_train_design_matrix = None
        self.X_val_design_matrix = None
        self.X_test_design_matrix = None

        self.K = None
        self.sigma_ = None
        self.lambda_ = None
        self.optimal_w = None
        self.cluster_means = None


    def k_means_clustering(self, X, K, max_iters=300):
        """
        Perform k-means clustering on input data X to find K cluster centroids.
        X: Input data.
        K: Number of clusters.
        max_iters: Maximum number of iterations.
        Returns the cluster centroids.
        """
        n, d = X.shape

        # Randomly initialize cluster centroids
        centroids = X[np.random.choice(n, K, replace=False)]

        for _ in range(max_iters):
            # Assign each data point to the nearest cluster
            distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
            labels = np.argmin(distances, axis=1)

            # Update cluster centroids
            new_centroids = np.array([X[labels == k].mean(axis=0) if np.sum(labels == k) > 0 else centroids[k] for k in range(K)])

            # Check for convergence
            if np.all(new_centroids == centroids):
                break

            centroids = new_centroids

            if(_ == max_iters - 1):
                if(debug):
                    print("max iters reached")

        return centroids


    def generate_gaussian_design_matrix(self, X, K, sigma_):
        """
        Generate a Gaussian design matrix for input data X using K clusters and a specified standard deviation (sigma_).
        X: Input data.
        K: Number of clusters.
        sigma_: Standard deviation.
        Returns the cluster means and the Gaussian design matrix.
        """
        n = X.shape[0]
        d = X.shape[1]
        cluster_means = self.k_means_clustering(X, K)

        gaussian_design_matrix = np.zeros((n, K));
        for i in range(n):
            for j in range(K):
                gaussian_design_matrix[i, j] = np.exp(-1 * pow((np.linalg.norm((X[i, :] - np.reshape(cluster_means[j], (-1))), axis=0) / (sigma_ + 1e-6)), 2))

        return cluster_means, gaussian_design_matrix


    def calculate_gaussian_design_matrix(self, X, cluster_means, sigma_):
        """
        Calculate the Gaussian design matrix for input data X using pre-computed cluster means and a specified standard deviation (sigma_).
        X: Input data.
        cluster_means: Cluster means.
        sigma_: Standard deviation.
        Returns the Gaussian design matrix.
        """
        n = X.shape[0]
        d = X.shape[1]
        K = len(cluster_means)

        gaussian_design_matrix = np.zeros((n, K));
        for i in range(n):
            for j in range(K):
                gaussian_design_matrix[i, j] = np.exp(-1 * pow((np.linalg.norm((X[i, :] - np.reshape(cluster_means[j], (-1))), axis=0) / sigma_), 2))

        return gaussian_design_matrix


    def fit(self, K, sigma_, lambda_=0):
        """
        Fit the Gaussian basis regression model.
        K: Number of clusters.
        sigma_: Standard deviation (covariance).
        lambda_: Regularization parameter (default is 0 for no regularization).
        """

        self.K = K
        self.sigma_ = sigma_
        self.lambda_ = lambda_
        cluster_means, self.X_train_design_matrix = self.generate_gaussian_design_matrix(self.X_train, self.K, sigma_)

        if(debug):
            print("Shape of gaussian_design_matrix: ", self.X_train_design_matrix.shape)
            print("Design matrix:")
            print(self.X_train_design_matrix)


        self.optimal_w = self.get_best_fit_parameters(self.X_train_design_matrix, self.y_train, self.lambda_)
        train_error = self.calculate_rms_error(self.X_train_design_matrix, self.y_train, self.optimal_w)

        self.val_design_matrix = self.calculate_gaussian_design_matrix(self.X_val, cluster_means, sigma_)
        val_error = self.calculate_rms_error(self.val_design_matrix, self.y_val, self.optimal_w)

        title = f"Number of Clusters = {self.K}, Covariance = {self.sigma_}, Regularization Parameter = {self.lambda_}"
        self.plot_scatter(self.y_train, self.get_approximation(self.X_train_design_matrix, self.optimal_w), title, train_error, val_error)

        if(debug):
            print(self.optimal_w)


    def fit_cv(self, K_list, lambda_list, sigma_list, plot=True):
        """
        Perform cross-validation to find the best combination of number of clusters (K), regularization parameter, and standard deviation (sigma_).
        K_list: List of numbers of clusters to test.
        lambda_list: List of regularization parameters to test.
        sigma_list: List of standard deviations to test.
        plot: If True, plot results for each combination.
        Returns the best values for K, sigma_, and lambda_.
        """

        best_model_error : float = 1e9

        values_dict = {
            "Number of clusters" : [],
            "Covariance" : [],
            "Regularization Parameter" : [],
            "Training RMS Error" : [],
            "Validation RMS Error" : [],
            "Testing RMS Error" : [],
        }

        for K in K_list:
            for sigma_ in sigma_list:
                for lambda_ in lambda_list:
                    values_dict["Number of clusters"].append(K)
                    values_dict["Covariance"].append(sigma_)
                    values_dict["Regularization Parameter"].append(lambda_)

                    cluster_means, train_design_matrix = self.generate_gaussian_design_matrix(self.X_train, K, sigma_)
                    optimal_w = self.get_best_fit_parameters(train_design_matrix, self.y_train, lambda_)
                    train_error = self.calculate_rms_error(train_design_matrix, self.y_train, optimal_w)
                    values_dict["Training RMS Error"].append(train_error)

                    val_design_matrix = self.calculate_gaussian_design_matrix(self.X_val, cluster_means, sigma_)
                    val_error = self.calculate_rms_error(val_design_matrix, self.y_val, optimal_w)
                    values_dict["Validation RMS Error"].append(val_error)

                    test_design_matrix = self.calculate_gaussian_design_matrix(self.X_test, cluster_means, sigma_)
                    test_error = self.calculate_rms_error(test_design_matrix, self.y_test, optimal_w)
                    values_dict["Testing RMS Error"].append(test_error)

                    if(val_error < best_model_error):
                        best_model_error = val_error
                        self.X_train_design_matrix = train_design_matrix
                        self.X_val_design_matrix = val_design_matrix
                        self.X_test_design_matrix = test_design_matrix
                        self.optimal_w = optimal_w
                        self.cluster_means = cluster_means
                        self.K = K
                        self.sigma_ = sigma_
                        self.lambda_ = lambda_


                    if(plot):
                        title = f"Training Data Size = {self.X_train.shape[0]}, Number of Clusters = {K}, Covariance = {sigma_}, Regularization Parameter = {lambda_}"
                        self.plot_scatter(self.y_train, self.get_approximation(train_design_matrix, optimal_w), title, train_error, val_error)

        csv_name = f"Train_{self.X_train.shape[0]}_HyperParameters_vs_ERMS.csv"
        values_df = pd.DataFrame(values_dict)
        values_df.to_csv(csv_name, index=False)

        if(debug):
            print("Best m: ", self.m, " best lambda: ", self.lambda_)

        return self.K, self.sigma_, self.lambda_


    def generate_scatter_plots(self):
        """
        Generate scatter plots for the training, validation, and test datasets.
        """
        title = f"Number of Clusters = {self.K}, Covariance = {self.sigma_}, Regularization Parameter = {self.lambda_}"
        self.plot_scatter_best_model(self.y_train, self.get_approximation(self.X_train_design_matrix, self.optimal_w), title, self.calculate_rms_error(self.X_train_design_matrix, self.y_train, self.optimal_w), "Train")

        self.plot_scatter_best_model(self.y_val, self.get_approximation(self.X_val_design_matrix, self.optimal_w), title, self.calculate_rms_error(self.X_val_design_matrix, self.y_val, self.optimal_w), "Validation")

        if(self.X_test_design_matrix is None):
            self.X_test_design_matrix = self.calculate_design_matrix(self.X_test, self.cluster_means, self.sigma_)
        self.plot_scatter_best_model(self.y_test, self.get_approximation(self.X_test_design_matrix, self.optimal_w), title, self.calculate_rms_error(self.X_test_design_matrix, self.y_test, self.optimal_w), "Test")


    def get_approximation(self, design_matrix, w):
        """
        Evaluate the model on a design matrix.
        design_matrix: Design matrix of the inputs to predict for.
        w: Model parameters (weights) to apply.
        Returns the matrix product of design_matrix and w.
        """
        predictions = np.matmul(design_matrix, w)
        return predictions

# Task 1

## Polynomial Curve fitting for Dataset 1

### Loading the dataset

In [35]:
# Validation and test splits are shared by both training samples
val_data = pd.read_csv("/content/dataset1/Val-20.csv")
test_data = pd.read_csv("/content/dataset1/Test-20.csv")

# Two training samples of Dataset 1 (Sample-1 -> train_data_10, Sample-2 -> train_data_100)
train_data_10, train_data_100 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/dataset1/Train-20-Sample-1.csv",
        "/content/dataset1/Train-20-Sample-2.csv",
    )
)

In [36]:
train_data_10.head()

Unnamed: 0.1,Unnamed: 0,input,output
0,0,-2.0,57.0
1,1,-1.555556,19.367932
2,2,-1.111111,4.615303
3,3,-0.666667,0.802469
4,4,-0.222222,0.672001


In [37]:
train_data_100.head()

Unnamed: 0.1,Unnamed: 0,input,output
0,0,-1.095477,4.351505
1,1,-0.351759,0.570138
2,2,0.572864,4.420734
3,3,0.693467,6.31988
4,4,-1.015075,3.185732


In [38]:
val_data.head()

Unnamed: 0.1,Unnamed: 0,input,output
0,0,-1.658291,25.503054
1,1,0.190955,1.5258
2,2,-0.251256,0.643356
3,3,0.552764,4.164552
4,4,1.155779,22.416845


In [39]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,input,output
0,0,-0.834171,1.518354
1,1,-1.19598,6.286141
2,2,-1.316583,9.461611
3,3,0.050251,1.108618
4,4,1.698492,74.263946


### Modifying the dataset shape as per the model requirements

In [40]:
# Inputs become a single-column matrix of shape (N, 1); targets stay a flat vector
X_train_10 = train_data_10["input"].to_numpy().reshape(-1, 1)
y_train_10 = train_data_10["output"].to_numpy()

print("X_train shape: ", X_train_10.shape)
print("y_train shape: ", y_train_10.shape)

X_train shape:  (10, 1)
y_train shape:  (10,)


In [41]:
# Inputs become a single-column matrix of shape (N, 1); targets stay a flat vector
X_train_100 = train_data_100["input"].to_numpy().reshape(-1, 1)
y_train_100 = train_data_100["output"].to_numpy()

print("X_train shape: ", X_train_100.shape)
print("y_train shape: ", y_train_100.shape)

X_train shape:  (100, 1)
y_train shape:  (100,)


In [42]:
# Inputs become a single-column matrix of shape (N, 1); targets stay a flat vector
X_val = val_data["input"].to_numpy().reshape(-1, 1)
y_val = val_data["output"].to_numpy()

print("X_val shape: ", X_val.shape)
print("y_val shape: ", y_val.shape)

X_val shape:  (50, 1)
y_val shape:  (50,)


In [43]:
# Inputs become a single-column matrix of shape (N, 1); targets stay a flat vector
X_test = test_data["input"].to_numpy().reshape(-1, 1)
y_test = test_data["output"].to_numpy()

print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_test shape:  (50, 1)
y_test shape:  (50,)


### Calling the Polynomial Regressor Function

$λ = [0, 0.01, 0.1, 1, 10, 100]$

$m = [1, 2, 3, 6, 9]$

In [44]:
regressor_10 = PolynomialBasisRegression(X_train_10, y_train_10, X_val, y_val, X_test, y_test)

In [45]:
m_list = [1, 2, 3, 6, 9]
lambda_list = [0, 0.01, 0.1, 1, 10, 100]
best_m_10, best_lambda_10 = regressor_10.fit_cv(m_list, lambda_list)

In [46]:
print(f"Best m and lambda: ({best_m_10}, {best_lambda_10})")

Best m and lambda: (6, 0)


### Saving the images and outputs

In [47]:
! mkdir Task1
! mkdir Task1/Train_10
! mkdir Task1/Train_100

In [48]:
! mv *.jpeg /content/Task1/Train_10

In [49]:
! mv Train_10* /content/Task1/Train_10

### Repeating the same for 100 sample inputs

In [50]:
regressor_100 = PolynomialBasisRegression(X_train_100, y_train_100, X_val, y_val, X_test, y_test)

best_m_100, best_lambda_100 = regressor_100.fit_cv(m_list, lambda_list)

In [51]:
print(f"Best m and lambda: ({best_m_100}, {best_lambda_100})")

Best m and lambda: (6, 0)


### Saving the images and outputs

In [52]:
! mv *.jpeg /content/Task1/Train_100

In [53]:
! mv Train_100* /content/Task1/Train_10

In [54]:
! zip -r Task1.zip Task1

clear_output()

# Task 2

## Linear Regression using Polynomial Basis Functions for Dataset 2

### Loading the dataset

In [55]:
train_data_50 = pd.read_csv("/content/dataset2/train50_20.csv")
train_data_200 = pd.read_csv("/content/dataset2/train200_20.csv")

val_data = pd.read_csv("/content/dataset2/val_20.csv")

test_data = pd.read_csv("/content/dataset2/test_20.csv")

In [56]:
train_data_50.head()

Unnamed: 0,x1,x2,y
0,4.196034,-1.861206,-86.743762
1,4.195306,-2.878746,-87.670003
2,-1.857585,-2.905802,11.751357
3,-0.524578,-4.360768,0.533864
4,-1.659228,-3.440815,8.373797


In [57]:
train_data_200.head()

Unnamed: 0,x1,x2,y
0,2.566228,-3.984255,-17.056241
1,-1.904468,-1.716479,10.266735
2,4.221313,-4.247278,-88.204123
3,1.567227,-4.266392,-1.34057
4,-0.636488,-3.041608,2.994853


In [58]:
val_data.head()

Unnamed: 0,x1,x2,y
0,-3.254541,2.809744,49.884895
1,-4.291853,-1.333328,115.721208
2,-0.997809,4.77896,2.66978
3,0.769221,-0.655799,1.742669
4,3.54917,-3.495514,-52.700054


In [59]:
test_data.head()

Unnamed: 0,x1,x2,y
0,2.889509,-0.746918,-24.672114
1,2.599972,1.040721,-15.89824
2,-1.510051,-4.454603,6.539734
3,-0.461467,3.422101,1.094065
4,0.949684,1.48168,2.387349


### Modifying the dataset shape as per the model requirement

In [60]:
X_train_50 = np.reshape(train_data_50[['x1', 'x2']].to_numpy(), (-1, 2))  # Extract input features as a 2D array
y_train_50 = train_data_50['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_train shape:", X_train_50.shape)
print("y_train shape:", y_train_50.shape)

X_train shape: (50, 2)
y_train shape: (50,)


In [61]:
X_train_200 = np.reshape(train_data_200[['x1', 'x2']].to_numpy(), (-1, 2)) # Extract input features as a 2D array
y_train_200 = train_data_200['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_train shape:", X_train_200.shape)
print("y_train shape:", y_train_200.shape)

X_train shape: (200, 2)
y_train shape: (200,)


In [62]:
X_val = np.reshape(val_data[['x1', 'x2']].to_numpy(), (-1, 2))  # Extract input features as a 2D array
y_val = val_data['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (100, 2)
y_val shape: (100,)


In [63]:
X_test = np.reshape(test_data[['x1', 'x2']].to_numpy(), (-1, 2)) # Extract input features as a 2D array
y_test = test_data['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_test shape: (100, 2)
y_test shape: (100,)


### Calling the Polynomial Regressor Function

$λ = [0, 0.01, 0.1, 1, 10, 100]$

$m = [1, 2, 3, 6, 9]$

In [64]:
regressor_50 = PolynomialBasisRegression(X_train_50, y_train_50, X_val, y_val, X_test, y_test)

In [65]:
m_list = [1, 2, 3, 6, 9]
lambda_list = [0, 0.01, 0.1, 1, 10, 100]

best_m_50, best_lambda_50 = regressor_50.fit_cv(m_list, lambda_list)

In [66]:
print(f"Best m and lambda: ({best_m_50}, {best_lambda_50})")

Best m and lambda: (3, 1)


### Generating the scatter plots

In [67]:
regressor_50.generate_scatter_plots()

### Saving the images and outputs

In [68]:
! mkdir Task2

! mkdir Task2/Train_50
! mkdir Task2/Train_200

! mkdir Task2/Train_50/SurfacePlots
! mkdir Task2/Train_50/ScatterPlots

! mkdir Task2/Train_200/SurfacePlots
! mkdir Task2/Train_200/ScatterPlots

In [69]:
! mv Scatter* /content/Task2/Train_50/ScatterPlots
! mv Surface* /content/Task2/Train_50/SurfacePlots

In [70]:
! mv *.jpeg /content/Task2/Train_50

### Repeating the same process for 200 sample inputs

In [71]:
regressor_200 = PolynomialBasisRegression(X_train_200, y_train_200, X_val, y_val, X_test, y_test)

In [72]:
best_m_200, best_lambda_200 = regressor_200.fit_cv(m_list, lambda_list)

In [73]:
regressor_200.generate_scatter_plots()

In [74]:
print(f"Best m and lambda: ({best_m_200}, {best_lambda_200})")

Best m and lambda: (3, 10)


### Saving the images and outputs

In [75]:
! mv Scatter* /content/Task2/Train_200/ScatterPlots
! mv Surface* /content/Task2/Train_200/SurfacePlots

In [76]:
! mv *.jpeg /content/Task2/Train_200

In [77]:
! zip -r Task2.zip Task2/

clear_output()

# Task 3

## Linear Regression using Gaussian Basis Functions for Dataset 3

### Loading the dataset

In [78]:
data_column_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']
label_column_name = ['y']

# The files are read with header=None, so the column names are supplied explicitly.
# Feature files for the three splits:
train_data, val_data, test_data = (
    pd.read_csv(csv_path, header=None, names=data_column_names)
    for csv_path in (
        "/content/dataset3/train_data.csv",
        "/content/dataset3/val_data.csv",
        "/content/dataset3/test_data.csv",
    )
)

# Matching label files for the three splits:
train_label, val_label, test_label = (
    pd.read_csv(csv_path, header=None, names=label_column_name)
    for csv_path in (
        "/content/dataset3/train_label.csv",
        "/content/dataset3/val_label.csv",
        "/content/dataset3/test_label.csv",
    )
)

In [79]:
train_data.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7
0,-1.4719,-97.412,-37.469,-912.58,2.2587,5.0204,1.4235
1,2.5281,259.59,115.53,1376.4,-6.5413,-5.9796,-0.57653
2,-1.4719,-103.41,-35.469,-847.58,-0.84133,3.0204,0.42347
3,2.5281,155.59,65.531,1187.4,-4.1413,1.0204,-0.57653
4,-1.4719,-115.41,-37.469,-1014.6,-0.041327,-1.9796,0.42347


In [80]:
val_data.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7
0,0.52806,-48.412,15.531,-47.584,-1.7413,5.0204,1.4235
1,2.5281,110.59,40.531,902.42,-3.0413,1.0204,-0.57653
2,-1.4719,-73.412,-6.4694,-32.584,-1.0413,-0.97959,0.42347
3,0.52806,30.588,-4.4694,673.42,2.1587,0.020408,-0.57653
4,0.52806,55.588,0.53061,919.42,2.9587,-0.97959,-0.57653


In [81]:
test_data.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7
0,-1.4719,-78.412,-29.469,-731.58,-1.5413,-1.9796,0.42347
1,0.52806,-38.412,17.531,-170.58,-2.0413,-2.9796,1.4235
2,-0.47194,-63.412,-1.4694,-147.58,0.35867,2.0204,0.42347
3,0.52806,4.588,-14.469,-329.58,-0.54133,-5.9796,-0.57653
4,2.5281,67.588,5.5306,243.42,-2.0413,-0.97959,-0.57653


In [82]:
train_label.head()

Unnamed: 0,y
0,8.8541
1,-9.4459
2,13.854
3,-7.9459
4,2.5541


### Modifying the dataset shape as per the model requirement

In [83]:
# Features as an (N, 7) matrix; labels keep the (N, 1) column shape of the label frame
X_train = train_data.to_numpy().reshape(-1, 7)
y_train = train_label.to_numpy()

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (274, 7)
y_train shape: (274, 1)


In [84]:
X_val = np.reshape(val_data.to_numpy(), (-1, 7))
y_val = val_label.to_numpy()

print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (78, 7)
y_val shape: (78, 1)


In [85]:
X_test = np.reshape(test_data.to_numpy(), (-1, 7))
y_test = test_label.to_numpy()

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_test shape: (40, 7)
y_test shape: (40, 1)


### Calling the Gaussian Regressor Function

$K = [70, 80, 90, 100, 120]$

$σ = [30, 35, 40, 45]$

$λ = [0, 0.01, 0.1, 1, 10, 100]$

*After trial and error, we found that large values of K perform best on the given dataset*

In [86]:
regressor = GaussianBasisRegression(X_train, y_train, X_val, y_val, X_test, y_test)

In [87]:
K_list = [70, 80, 90, 100, 120]
sigma_list = [30, 35, 40, 45]
lambda_list = [0, 0.01, 0.1, 1, 10, 100]

best_k, best_sigma, best_lambda = regressor.fit_cv(K_list, lambda_list, sigma_list)

In [88]:
print(f"Best k, sigma, lambda: ({best_k}, {best_sigma}, {best_lambda})")

Best k, sigma, lambda: (120, 45, 0.1)


### Saving the images and outputs

In [89]:
! mkdir Task3

! mkdir Task3/Dataset2
! mkdir Task3/Dataset3

! mkdir Task3/Dataset3/All
! mkdir Task3/Dataset3/Best

In [90]:
! mv *.jpeg Task3/Dataset3/All

In [91]:
regressor.generate_scatter_plots()

In [92]:
! mv *.jpeg Task3/Dataset3/Best

# Task 3

## Linear Regression using Gaussian Basis Functions for Dataset 2

### Loading the dataset

In [93]:
train_data_50 = pd.read_csv("/content/dataset2/train50_20.csv")
train_data_200 = pd.read_csv("/content/dataset2/train200_20.csv")

val_data = pd.read_csv("/content/dataset2/val_20.csv")

test_data = pd.read_csv("/content/dataset2/test_20.csv")

### Modifying the dataset shape as per model requirements

In [94]:
X_train_50 = np.reshape(train_data_50[['x1', 'x2']].to_numpy(), (-1, 2))  # Extract input features as a 2D array
y_train_50 = train_data_50['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_train shape:", X_train_50.shape)
print("y_train shape:", y_train_50.shape)

X_train shape: (50, 2)
y_train shape: (50,)


In [95]:
X_train_200 = np.reshape(train_data_200[['x1', 'x2']].to_numpy(), (-1, 2)) # Extract input features as a 2D array
y_train_200 = train_data_200['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_train shape:", X_train_200.shape)
print("y_train shape:", y_train_200.shape)

X_train shape: (200, 2)
y_train shape: (200,)


In [96]:
X_val = np.reshape(val_data[['x1', 'x2']].to_numpy(), (-1, 2))  # Extract input features as a 2D array
y_val = val_data['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (100, 2)
y_val shape: (100,)


In [97]:
X_test = np.reshape(test_data[['x1', 'x2']].to_numpy(), (-1, 2)) # Extract input features as a 2D array
y_test = test_data['y'].to_numpy() # Extract target values as a 1D array

# Check the shapes
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_test shape: (100, 2)
y_test shape: (100,)


### Calling the Gaussian Regressor Function

$K = [5, 10, 15, 20]$

$σ = [5, 10, 15, 20]$

$λ = [0, 0.01, 0.1, 1, 10, 100]$


In [98]:
regressor_50_gaussian = GaussianBasisRegression(X_train_50, y_train_50, X_val, y_val, X_test, y_test)

In [99]:
K_list = [5, 10, 15, 20]
sigma_list = [5, 10, 15, 20]
lambda_list = [0, 0.01, 0.1, 1, 10, 100]

best_k, best_sigma, best_lambda = regressor_50_gaussian.fit_cv(K_list, lambda_list, sigma_list)

In [100]:
print(f"Best k, sigma, lambda: ({best_k}, {best_sigma}, {best_lambda})")

Best k, sigma, lambda: (15, 20, 0)


### Storing the plots and outputs

In [101]:
! mkdir Task3/Dataset2/Train_50
! mkdir Task3/Dataset2/Train_200

! mkdir Task3/Dataset2/Train_50/All
! mkdir Task3/Dataset2/Train_50/Best

! mkdir Task3/Dataset2/Train_200/All
! mkdir Task3/Dataset2/Train_200/Best

In [102]:
! mv *.jpeg Task3/Dataset2/Train_50/All

In [103]:
regressor_50_gaussian.generate_scatter_plots()

In [104]:
! mv *.jpeg Task3/Dataset2/Train_50/Best

### Repeating the same process for 200 sample inputs

In [105]:
regressor_200_gaussian = GaussianBasisRegression(X_train_200, y_train_200, X_val, y_val, X_test, y_test)

In [106]:
K_list = [5, 10, 15, 20]
sigma_list = [5, 10, 15, 20]
lambda_list = [0, 0.01, 0.1, 1, 10, 100]

best_k, best_sigma, best_lambda = regressor_200_gaussian.fit_cv(K_list, lambda_list, sigma_list)

In [107]:
print(f"Best k, sigma, lambda: ({best_k}, {best_sigma}, {best_lambda})")

Best k, sigma, lambda: (20, 20, 0)


### Saving the images and outputs

In [108]:
! mv *.jpeg Task3/Dataset2/Train_200/All

In [109]:
regressor_200_gaussian.generate_scatter_plots()

In [110]:
! mv *.jpeg Task3/Dataset2/Train_200/Best

In [111]:
! mv Train_200* Task3/Dataset2/Train_200
! mv Train_50* Task3/Dataset2/Train_50
! mv Train_274* Task3/Dataset3

In [112]:
! zip -r Task3.zip Task3

clear_output()

# Task 4

## Linear Regression using Polynomial Basis Functions for Dataset 4

### Dataset 4:

Estimate the median housing price in a California neighbourhood based on a given set of features that include:

1. Latitude
2. Longitude
3. Median housing age
4. Total rooms in neighbourhood
5. Total bedrooms
6. Population
7. Number of households
8. Median Income


This dataset is present by default in the Google Colab disk space, in the `sample_data` folder. Only features 3–8 are used as model inputs below; latitude and longitude are not selected.

### Loading the dataset

In [113]:
# Reproducible 500-row sample (drawn without replacement) of the Colab housing CSV
data = pd.read_csv("/content/sample_data/california_housing_test.csv").sample(n=500, replace=False, random_state=42)

# Manual per-feature divisors that bring the features to a similar scale;
# median_income is kept as-is (divisor 1).
feature_scales = {
    "housing_median_age": 10,
    "total_rooms": 1000,
    "total_bedrooms": 100,
    "population": 100,
    "households": 100,
    "median_income": 1,
}

# X and y are built as new frames rather than by assigning columns into slices of `data`.
# Assigning into a slice raises pandas' SettingWithCopyWarning, which the previous
# version silenced with clear_output(); no output needs to be cleared any more.
X = data[list(feature_scales)].div(pd.Series(feature_scales), axis="columns")
y = data[["median_house_value"]] / 1e5

In [114]:
X.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
1801,4.4,1.014,2.25,7.04,2.38,1.6554
1190,1.8,3.895,6.89,10.86,3.75,3.3672
1817,3.5,2.361,4.58,17.27,4.67,4.5281
251,3.6,1.751,4.38,11.75,4.19,3.0739
2505,3.9,1.613,3.8,11.13,3.56,2.825


In [115]:
y.head()

Unnamed: 0,median_house_value
1801,1.194
1190,1.336
1817,1.736
251,2.186
2505,2.767


### Splitting the data for cross_validation

In [116]:
# First split: 60% for training, 40% held out
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)

# Second split: the held-out part is halved into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, train_size=0.5, test_size=0.5, random_state=42
)

### Modifying the data shape as per the model requirement

In [117]:
# np.asarray accepts both the DataFrames produced by the split and the ndarrays this
# cell assigns back to the same names, so re-running the cell no longer fails on a
# missing .to_numpy() attribute.

# Feature splits -> (N, 6) matrices
X_train = np.asarray(X_train).reshape(-1, 6)
X_val = np.asarray(X_val).reshape(-1, 6)
X_test = np.asarray(X_test).reshape(-1, 6)

# Single-column target splits -> flat (N,) vectors
y_train = np.asarray(y_train).reshape(-1)
y_val = np.asarray(y_val).reshape(-1)
y_test = np.asarray(y_test).reshape(-1)

### Calling the Polynomial Basis Regressor

$m = [1, 2, 3, 4, 5, 8]$

$λ = [0, 0.01, 0.1, 1, 10, 100]$

In [118]:
regressor = PolynomialBasisRegression(X_train, y_train, X_val, y_val, X_test, y_test)

In [119]:
m_list = [1, 2, 3, 4, 5, 8]
lambda_list = [0, 0.01, 0.1, 1, 10, 100]

best_m, best_lambda = regressor.fit_cv(m_list, lambda_list)

In [120]:
print(f"Best m and lambda: ({best_m}, {best_lambda})")

Best m and lambda: (1, 100)


### Saving the images and outputs

In [121]:
! mkdir Task4Norm
! mkdir Task4Norm/All
! mkdir Task4Norm/Best

In [122]:
! mv *.jpeg Task4Norm/All

In [123]:
regressor.generate_scatter_plots()

In [124]:
! mv *.jpeg Task4Norm/Best

In [125]:
! zip -r Task4Norm.zip Task4Norm

clear_output()

In [126]:
! zip -r program_data.zip /content

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2023.09.29/ (stored 0%)
  adding: content/.config/logs/2023.09.29/13.23.16.546815.log (deflated 56%)
  adding: content/.config/logs/2023.09.29/13.22.57.260061.log (deflated 86%)
  adding: content/.config/logs/2023.09.29/13.22.20.814732.log (deflated 91%)
  adding: content/.config/logs/2023.09.29/13.23.15.769577.log (deflated 57%)
  adding: content/.config/logs/2023.09.29/13.23.06.117571.log (deflated 58%)
  adding: content/.config/logs/2023.09.29/13.22.47.612187.log (deflated 58%)
  adding: content/.config/.last_update_check.json (deflated 22%)
  ad