In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


data = pd.read_csv('boston_house_prices.csv')

# Features and target variable
X = data.drop(['MEDV'], axis=1).values
y = data['MEDV'].values

# Adding a column of ones to X to account for the intercept (bias term)
X = np.c_[np.ones(X.shape[0]), X]

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Gradient Descent implementation
def gradient_descent(X, y, learning_rate=0.01, n_iterations=1000):
    m = X.shape[0]  # Number of samples
    n = X.shape[1]  # Number of features (including the intercept term)
    theta = np.zeros(n)  # Initial coefficients (weights)
    
    for i in range(n_iterations):
        predictions = X.dot(theta)
        errors = predictions - y
        gradients = (1/m) * X.T.dot(errors)
        theta -= learning_rate * gradients
        
        # Optional: print the cost every 100 iterations
        if i % 100 == 0:
            cost = (1/(2*m)) * np.sum(errors ** 2)
            print(f"Iteration {i}: Cost {cost}")
    
    return theta

# Hyperparameters
learning_rate = 0.01
n_iterations = 1000

# Running gradient descent
theta = gradient_descent(X_train, y_train, learning_rate, n_iterations)

# Making predictions on the test set
y_pred = X_test.dot(theta)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Regression Coefficients: {theta[1:]}')  # Skip the first coefficient as it corresponds to the intercept
print(f'Intercept: {theta[0]}')
print(f'Mean Squared Error: {mse}')
print(f'R2 Score (Model Accuracy): {r2}')

# Plotting Actual vs Predicted values
plt.scatter(y_test, y_pred)
plt.plot([min(y_test), max(y_test)], [min(y_pred), max(y_pred)], color='red')
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Actual vs Predicted')
plt.show()


Iteration 0: Cost 301.06069209039543
Iteration 100: Cost nan
Iteration 200: Cost nan
Iteration 300: Cost nan
Iteration 400: Cost nan
Iteration 500: Cost nan
Iteration 600: Cost nan
Iteration 700: Cost nan
Iteration 800: Cost nan
Iteration 900: Cost nan


ValueError: Input contains NaN.