<a href="https://colab.research.google.com/github/RasheedKhan123/PrincipalComponentAnalysis/blob/main/LinearRegression_withPCA_and_withoutPCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

class principleComponentAnalysis:
    """Principal component analysis by eigendecomposition of ``x.T @ x``.

    The constructor only stores optional defaults. Every method receives the
    matrix it works on explicitly, so one instance can be reused on different
    data sets.
    """

    def __init__(self, x=None, y=None, m=None, var_retain=None):
        """Store the data and the default component-selection settings.

        Args:
            x: Independent variables (kept for reference only).
            y: Dependent variable (kept for reference only).
            m: Default number of components used by ``principle_components``
                when the call passes neither ``m`` nor ``var_retain``.
            var_retain: Default fraction of variance to retain, used the same
                way as ``m``.
        """
        self.x = x
        self.y = y
        self.m = m
        self.var_retain = var_retain

    def dimensionality(self, x):
        """Return the shape tuple of the array ``x``."""
        return x.shape

    def if_matrix_invertible(self, x):
        """Return whether ``x`` has a non-zero determinant.

        ``np.linalg.det`` raises ``LinAlgError`` for inputs that are not
        square; that case is reported as "not invertible" instead of being
        propagated.
        """
        try:
            determinant = np.linalg.det(x)
        except np.linalg.LinAlgError:
            return False
        return determinant != 0

    def scale(self, x):
        """Standardize ``x`` using its overall mean and standard deviation.

        Args:
            x: Array to standardize. The mean and standard deviation are taken
                over all elements, so pass one column at a time (for example
                through ``np.apply_along_axis``) for per-feature scaling.

        Returns:
            ``(x - x.mean()) / x.std()``.

        Raises:
            ValueError: If ``x`` is ``None``.
        """
        # Validate the argument that is actually used. The constructor's copy
        # of the data plays no role in this computation.
        if x is None:
            raise ValueError("Input data 'x' is not provided.")
        return (x - x.mean()) / x.std()

    def find_eigens(self, x):
        """Return ``(eigenvalues, eigenvectors)`` of the square matrix ``x``.

        Uses the general-purpose ``np.linalg.eig``; eigenvectors are the
        columns of the second return value.
        """
        eigenvalues, eigenvectors = np.linalg.eig(x)
        return eigenvalues, eigenvectors

    def principle_components(self, x, m=None, var_retain=None):
        """Project ``x`` onto its leading principal directions.

        The directions are the eigenvectors of ``x.T @ x`` ordered by
        decreasing eigenvalue. ``x`` is not centered here, so that matrix is
        proportional to the covariance matrix only when the columns of ``x``
        already have zero mean (for example after ``scale``).

        Args:
            x: Data matrix with one sample per row and one feature per column.
            m: Number of leading components to keep. Takes precedence over
                ``var_retain`` when both are given.
            var_retain: Fraction in ``(0, 1]`` of the total eigenvalue mass to
                retain; the smallest number of components reaching it is kept.

            When both arguments are ``None`` the values given to the
            constructor are used instead.

        Returns:
            The projected data, one column per kept component. The sign of
            each column is arbitrary, as for any eigenvector-based projection.

        Raises:
            ValueError: If neither ``m`` nor ``var_retain`` is available, or
                if ``var_retain`` lies outside ``(0, 1]``.
        """
        if m is None and var_retain is None:
            # Fall back to the defaults stored on the instance.
            m, var_retain = self.m, self.var_retain
        if m is None and var_retain is None:
            raise ValueError("You must provide either 'm' or 'var_retain'.")
        if m is None and not 0 < var_retain <= 1:
            # An unreachable threshold would otherwise silently select a
            # single component (argmax over an all-False mask is 0).
            raise ValueError("'var_retain' must be in the interval (0, 1].")

        scatter = np.matmul(x.T, x)
        # x.T @ x is symmetric, so eigh applies: it guarantees real
        # eigenvalues and orthonormal eigenvectors, which eig does not.
        lambd, a = np.linalg.eigh(scatter)
        order = np.argsort(lambd)[::-1]  # largest eigenvalue first
        lambd = lambd[order]
        a = a[:, order]

        if m is None:
            cumulative_variance = np.cumsum(lambd) / np.sum(lambd)
            reached = cumulative_variance >= var_retain
            if reached.any():
                m = int(np.argmax(reached)) + 1
            else:
                # Round-off can leave the final ratio a hair below 1.0; keep
                # every component in that case.
                m = lambd.size

        return np.matmul(x, a[:, :m])


In [None]:
# Defining Linear Regression class
class LinearRegression:
    """Ordinary least-squares linear regression with an intercept term."""

    def __init__(self, X, y):
        """Store the independent variables ``X`` and the dependent variable ``y``."""
        self.X = X
        self.y = y
        # Filled in by compute_coeff: intercept first, then one weight per
        # column of the design matrix.
        self.coefficients = None

    def dimensionality(self, x):
        """Return the shape tuple of the array ``x``."""
        return x.shape

    def if_matrix_invertible(self, x):
        """Return whether ``x`` has a non-zero determinant.

        ``np.linalg.det`` raises ``LinAlgError`` for inputs that are not
        square; that case is reported as "not invertible" instead of being
        propagated.
        """
        try:
            determinant = np.linalg.det(x)
        except np.linalg.LinAlgError:
            return False
        return determinant != 0

    def scale(self, X):
        """Standardize ``X`` using its overall mean and standard deviation.

        Args:
            X: Array to standardize. The statistics are taken over all
                elements, so pass one column at a time (for example through
                ``np.apply_along_axis``) for per-feature scaling.

        Returns:
            ``(X - X.mean()) / X.std()``.

        Raises:
            ValueError: If ``X`` is ``None``.
        """
        # Validate the argument that is actually used, not the copy stored by
        # the constructor.
        if X is None:
            raise ValueError("Input data 'x' is not provided.")
        return (X - X.mean()) / X.std()

    def train_test_split(self, X, y, train_ratio):
        """Shuffle the rows and split them into training and testing parts.

        Args:
            X: 2-D feature matrix, one sample per row.
            y: Targets, indexable along the same first axis as ``X``.
            train_ratio: Fraction of the rows assigned to the training part;
                the training part holds ``int(n_rows * train_ratio)`` rows.

        Returns:
            ``(X_train, y_train, X_test, y_test)`` as new arrays. Rows of ``X``
            stay paired with their entries of ``y``.

        Side effects:
            Replaces ``self.X`` / ``self.y`` with the arguments and draws from
            NumPy's global random state.
        """
        self.X = X
        self.y = y
        order = np.arange(X.shape[0])
        np.random.shuffle(order)
        train_size = int(X.shape[0] * train_ratio)
        # Split the shuffled index once. Exactly train_size rows go to the
        # training part; fancy indexing already returns fresh arrays.
        train_idx, test_idx = order[:train_size], order[train_size:]
        return X[train_idx, :], y[train_idx], X[test_idx, :], y[test_idx]

    def compute_coeff(self, X, y):
        """Fit the least-squares coefficients for ``y ~ [1, X]``.

        Args:
            X: 2-D feature matrix without an intercept column.
            y: Targets with one entry (or row) per row of ``X``.

        Returns:
            The coefficient array, intercept first. It is also stored in
            ``self.coefficients``.

        Side effects:
            Replaces ``self.X`` with ``X``.
        """
        self.X = X
        ones = np.ones((X.shape[0], 1))  # intercept column
        X_new = np.hstack([ones, X])
        # Solve the least-squares problem directly rather than inverting
        # X_new.T @ X_new: forming the inverse squares the condition number
        # and fails outright on collinear features.
        coefficients, *_ = np.linalg.lstsq(X_new, y, rcond=None)
        self.coefficients = coefficients
        return coefficients

    def compute_loss(self, y, y_pred):
        """Return the mean squared error between 1-D ``y`` and ``y_pred``."""
        residual = y - y_pred
        return np.dot(residual, residual) / len(y)


In [None]:
def main():
    """Compare plain linear regression with regression on PCA components.

    Uses the California housing data: first fits the raw features, then fits
    the leading 6 principal components of the standardized degree-3 polynomial
    features, printing the in-sample mean squared error of each fit.
    """
    # Imported here so the classes above remain usable without scikit-learn.
    from sklearn.datasets import fetch_california_housing
    from sklearn.preprocessing import PolynomialFeatures

    # Load the data once. Nothing below modifies X or y in place, so the same
    # arrays serve every experiment.
    X, y = fetch_california_housing(return_X_y=True)

    lg = LinearRegression(X, y)
    pca = principleComponentAnalysis(X, y)

    # Print the dimensions of the loaded data
    print("DIMENSIONALITY-LinearRegression:", lg.dimensionality(x=X))
    print("DIMENSIONALITY-PCA:", pca.dimensionality(x=X))

    # Check and print whether the matrix is invertible
    is_invertible = lg.if_matrix_invertible(X)
    print("\nIS INVERTIBLE IN LinearRegression:", is_invertible)
    is_invertible = pca.if_matrix_invertible(X)
    print("\nIS INVERTIBLE IN PCA:", is_invertible)

    # Standardize column by column (axis=0) and show the first scaled row
    print("\nFIRST ROW AFTER SCALED IN LinearRegression:\n")
    print(np.apply_along_axis(lg.scale, axis=0, arr=X)[:1])
    print("\nFIRST ROW AFTER SCALED IN PCA:\n")
    print(np.apply_along_axis(pca.scale, axis=0, arr=X)[:1])

    # Experiment 1: regression on the raw, unscaled features.
    beta = lg.compute_coeff(X, y)
    X_raw = np.column_stack((np.ones(X.shape[0]), X))  # prepend intercept column
    y_raw_pred = np.dot(X_raw, beta)
    print(lg.compute_loss(y, y_raw_pred))

    # Experiment 2: expand to degree-3 polynomial features, standardize each
    # column, keep 6 principal components and regress on those.
    poly = PolynomialFeatures(degree=3, include_bias=False)
    X_poly = poly.fit_transform(X)
    X_poly = np.apply_along_axis(pca.scale, axis=0, arr=X_poly)

    X_pca = pca.principle_components(X_poly, m=6)
    print(X_pca)
    beta = lg.compute_coeff(X_pca, y)
    X_raw_pca = np.column_stack((np.ones(X_pca.shape[0]), X_pca))
    print(X_pca.shape)
    y_pca_pred = np.dot(X_raw_pca, beta)
    print(lg.compute_loss(y, y_pca_pred))


# If the script is being run as the main module, execute the main function
if __name__ == "__main__":
    main()


DIMENSIONALITY-LinearRegression: (20640, 8)
DIMENSIONALITY-PCA: (20640, 8)

IS INVERTIBLE IN LinearRegression: False

IS INVERTIBLE IN PCA: False

FIRST ROW AFTER SCALED IN LinearRegression:

[[ 2.34476576  0.98214266  0.62855945 -0.15375759 -0.9744286  -0.04959654
   1.05254828 -1.32783522]]

FIRST ROW AFTER SCALED IN PCA:

[[ 2.34476576  0.98214266  0.62855945 -0.15375759 -0.9744286  -0.04959654
   1.05254828 -1.32783522]]
0.5243209861846072
[[ 1.94515615  6.71743778  3.30807603 11.57833915  1.2853284   2.76127731]
 [ 2.5910673   2.24610702 -8.39291112  7.82031622  1.89645113  3.54818978]
 [ 3.02694541  8.61287512  4.84289515 12.20563945 -3.93793978  2.62567431]
 ...
 [-1.69929655 -1.53976998  1.47042045 -4.48742037  2.75399914  4.49381542]
 [-1.74290784 -1.04383443  2.18200422 -4.17075192  3.06295967  4.5349151 ]
 [-1.11030568 -1.30606462 -0.16200549 -3.81181345  2.54560317  4.53849902]]
(20640, 6)
0.705328419971186


### Verifying the answer

In [None]:
def standard_scaling(x):
    """Return ``x`` shifted by its mean and divided by its standard deviation.

    Both statistics are taken over all elements of ``x``.
    """
    mu = x.mean()
    sigma = x.std()
    return (x - mu) / sigma

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import PolynomialFeatures

    # Fetch the California housing dataset
X, y = fetch_california_housing(return_X_y=True)
# Standardize each feature column (axis=0 applies standard_scaling per column).
X = np.apply_along_axis(standard_scaling, axis=0, arr=X)
# Reference model: scikit-learn's LinearRegression, imported above. That import
# shadows the hand-written LinearRegression class defined earlier in this file.
model = LinearRegression()

# Fit on the full data set; no train/test split is performed in this cell.
model.fit(X, y)

# Predict on the same rows that were used for fitting.
y_pred = model.predict(X)

# In-sample mean squared error, to compare with the hand-written implementation.
mse = mean_squared_error(y, y_pred)
print (mse)

0.5243209861846072


In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
# Reference check for the PCA pipeline: rebuild the degree-3 polynomial features,
# standardize each column, keep 6 components with scikit-learn's PCA and refit.
# Relies on names from earlier cells: fetch_california_housing, standard_scaling,
# model and mean_squared_error.
X, y = fetch_california_housing(return_X_y=True)
poly = PolynomialFeatures(degree=3,include_bias=False)
X = poly.fit_transform(X)
X = np.apply_along_axis(standard_scaling, axis=0, arr=X)
pca = PCA(n_components=6)
X_pcaed_sk= pca.fit_transform(X)
# Uncomment to inspect the projected data: print(X_pcaed_sk)
model.fit(X_pcaed_sk, y)
# Predict on the same rows used for fitting (there is no held-out test set here).
y_pred = model.predict(X_pcaed_sk)
# In-sample mean squared error of the regression on the 6 components.
mse = mean_squared_error(y, y_pred)
print(mse)

0.7053284199711828
