In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Read the CSV and separate the feature columns from the 'target' column.
data_path = Path('data/diabetes.csv')
df = pd.read_csv(data_path)
y = df['target'].to_numpy()
X = df.drop(columns=['target']).to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)


In [None]:
# Prepend a column of ones to each design matrix so that the first entry of
# theta_best plays the role of the intercept.
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_b  = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Ordinary least squares: find theta minimising ||X_train_b @ theta - y_train||^2.
# np.linalg.lstsq solves this directly rather than forming and inverting
# X^T X. Explicit inversion squares the condition number of the problem and
# fails (LinAlgError or meaningless coefficients) when feature columns are
# collinear; lstsq returns the same solution for full-rank input and the
# minimum-norm solution otherwise.
theta_best = np.linalg.lstsq(X_train_b, y_train, rcond=None)[0]

# Display the fitted coefficients (intercept first) as the cell output.
theta_best


In [None]:
# Apply the fitted coefficients to the bias-augmented test matrix.
y_pred = X_test_b @ theta_best

# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE (Closed‑Form): {mse:.4f}")


In [None]:
# Predicted-vs-true scatter; points lying on the dashed red diagonal are
# predictions that match the true target exactly.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("True Targets")
ax.set_ylabel("Predicted Targets")
ax.set_title("Least Squares Prediction Performance")

# Identity line spanning the observed range of the true targets.
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, 'r--')
plt.show()
