In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import os
print(os.listdir("../content/sample_data"))
# read the data from file
housing = pd.read_csv("../content/sample_data/california_housing_train.csv")

['anscombe.json', 'README.md', 'california_housing_train.csv', 'california_housing_test.csv', 'mnist_train_small.csv', 'mnist_test.csv']


In [17]:
# initializing our inputs and outputs

# indep vars
X = []
x_means = []
numerical_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
for feature in numerical_features:
    X_i = housing[feature].values
    X.append(X_i)
    x_means.append(np.mean(X_i))

# dependend variable
Y = housing['median_house_value'].values

# mean of inputs and outputs
x_means = [np.mean(X_i) for X_i in X]
y_mean = np.mean(Y)  # Calculate mean of the dependent variable

# total number of values
n = len(X[0])

# using the formula to calculate the coefficients
coefficients = [0] * len(numerical_features)  # Initialize coefficients list with zeros
denominator = [0] * len(numerical_features)  # Initialize denominator list with zeros

for j in range(len(numerical_features)):
    for i in range(n):
        numerator = (X[j][i] - x_means[j]) * (Y[i] - y_mean)
        numerator += numerator
        denominator[j] += (X[j][i] - x_means[j]) ** 2

    coefficients[j] = numerator / denominator[j]

# calculating y-intercept
b0 = y_mean
for j in range(len(numerical_features)):
    b0 -= coefficients[j] * x_means[j]

# Print the coefficients
for i, feature in enumerate(numerical_features):
    print(f"{feature}: {coefficients[i]}")
print("b0:", b0)


longitude: 15.789833001990818
latitude: -14.265617536429758
housing_median_age: -1.9593289501204454
total_rooms: 0.002298224536473304
total_bedrooms: 0.0178683234887386
population: 0.006275516780709943
households: 0.020735921009521784
median_income: 3.164207197644173
b0: 209705.64269140907


In [18]:
# compute predicted values
predicted_values = b0 + np.dot(coefficients, X)

# calc residuals
residuals = Y - predicted_values

# square each residual to get the sqaured error
squared_errors = residuals ** 2

# compute MSE
mse = np.mean(squared_errors)

# calc TSS (total sum of squares)
tss = np.sum((Y - np.mean(Y)) ** 2)

# cal residual sum of squares (RSS)
rss = np.sum(residuals ** 2)

# calc r-sqaured score
r_squared = 1 - (rss/tss)

# Print MSE and R-squared
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2) Score:", r_squared)

Mean Squared Error (MSE): 13450096969.370235
R-squared (R2) Score: 0.0001000133791659863
