# Part 1: Linear Regression

For this assignment, we will implement Linear Regression as learned in class, first with an analytical (closed-form) approach and then with a gradient descent approach.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Fetch the diabetes regression data: feature matrix, target vector and column names
data = load_diabetes()
X, y = data.data, data.target
feature_names = data.feature_names
print("features:", X.shape)
print("labels:", y.shape)

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("training data:", X_train.shape, y_train.shape)
print("test data:", X_test.shape,  y_test.shape)

## Analytical Approach

For this part, you will use the analytical approach to implement a linear regression model, where the model weights are directly computed from the training data. Please refer to the course slides about how to obtain the weights.

In [None]:
# Build a column of ones for each split and prepend it to the inputs, so the
# first learned parameter acts as the bias (intercept) term
bias_train = np.ones((X_train.shape[0], 1))
bias_test = np.ones((X_test.shape[0], 1))

X_train_w_bias = np.hstack((bias_train, X_train))
X_test_w_bias = np.hstack((bias_test, X_test))

# Solve the least-squares problem min ||X w - y||^2 directly instead of forming
# inv(X^T X): the explicit inverse amplifies rounding error when features are
# correlated and raises LinAlgError when X^T X is singular, whereas lstsq
# returns the same solution for full-rank X and still works otherwise
weights_w_bias, *_ = np.linalg.lstsq(X_train_w_bias, y_train, rcond=None)

# Predict on the held-out inputs with the fitted parameters
y_pred = X_test_w_bias @ weights_w_bias

# Report the test error in the original target units
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


In [None]:
# Actual-vs-predicted scatter for the analytical model; a point on the dashed
# diagonal is a test sample that was predicted exactly
fig, ax = plt.subplots(figsize=(8, 3))
ax.scatter(y_test, y_pred, alpha=0.6, color="blue", label="Predictions")

# The reference line spans the range of the true targets
diag = [y_test.min(), y_test.max()]
ax.plot(diag, diag, 'r--', label="Ideal Fit")

ax.set_xlabel("Actual Diabetes Progression")
ax.set_ylabel("Predicted Diabetes Progression")
ax.set_title("Actual vs. Predicted Diabetes Progression")
ax.legend(loc="upper right")
plt.show()

In [None]:
# Split the solved parameter vector: entry 0 multiplies the column of ones
# (the bias / intercept), the remaining entries are the per-feature coefficients
bias = weights_w_bias[0]
weights = weights_w_bias[1:]
print("Weights:", weights)
print("Bias:", bias)

# Bar chart with one bar per feature coefficient
x = np.arange(weights.size)
fig, ax = plt.subplots(figsize=(8, 3))
ax.bar(x, weights, edgecolor="black", width=0.6)
ax.set_xticks(x)
ax.set_xticklabels(data.feature_names, rotation=45, ha="right")
ax.set_xlabel("Features")
ax.set_ylabel("Coefficient Value")
ax.set_title("Linear Regression Coefficients by Feature")
fig.tight_layout()
plt.show()

# Rank features by the absolute value of their coefficient and report the
# largest and the smallest
important = np.absolute(weights)
most_param = important.argmax()
least_param = important.argmin()
print ("Most important: feature = ", feature_names[most_param],", coef =", weights[most_param])
print ("Least important: feature = ", feature_names[least_param],", coef =", weights[least_param])


## Gradient Descent Approach

For this part, you will use a gradient descent approach to implement a linear regression model for the same diabetes dataset. Please refer to the course slides about the gradient calculation and update.

In [None]:
# Standardise the inputs; the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Standardise the training targets with their own mean and standard deviation
y_mean = np.mean(y_train)
y_std = np.std(y_train)
y_train_scaled = (y_train - y_mean) / y_std

# Gradient-descent hyper-parameters
lr = 1e-4       # Try options from 0.1 to 1e-4
num_steps = 15   # Try options from 5 to 20

# Start from an all-zero model: one weight per input feature plus a scalar bias
_, n_features = X_train_scaled.shape
weights = np.zeros(n_features)
bias = 0

In [None]:
# Full-batch gradient descent on the standardised training inputs
n_samples = X_train_scaled.shape[0]
for step in range(num_steps):
    # Forward pass in standardised units, then map back to original target units
    y_pred_scaled = X_train_scaled @ weights + bias
    y_pred = y_pred_scaled * y_std + y_mean

    # NOTE(review): the residual is in original target units, while the update
    # below omits the y_std chain-rule factor. Each step is therefore the
    # gradient of the original-unit MSE divided by y_std (same direction,
    # smaller step). y_train_scaled is not used here.
    error = y_train - y_pred
    grad_weights = -(2 / n_samples) * (X_train_scaled.T @ error)
    grad_bias = -(2 / n_samples) * error.sum()

    # Move the parameters against the gradient
    weights -= lr * grad_weights
    bias -= lr * grad_bias

    # Training error of the predictions made before this step's update
    mse = mean_squared_error(y_train, y_pred)
    print("Training error:", mse)

In [None]:
# The model outputs standardised targets: the training loop multiplies its
# predictions by y_std and adds y_mean before comparing them with y_train.
# The same mapping must be applied here, otherwise standardised predictions
# would be scored against y_test in original units.
y_pred_test_scaled = X_test_scaled @ weights + bias
y_pred = y_pred_test_scaled * y_std + y_mean

# Test error in original target units
mse = mean_squared_error(y_test, y_pred)
# Print the evaluation metrics
print("Mean Squared Error:", mse)

In [None]:
# Actual-vs-predicted scatter for the gradient-descent model
plt.figure(figsize=(8, 3))

# The model outputs standardised targets, so map the test predictions back to
# original units with the training mean / standard deviation. Without this the
# points would sit near zero while the x-axis is in original units.
y_pred_test = (X_test_scaled @ weights + bias) * y_std + y_mean

# Scatter plot of actual vs predicted
plt.scatter(y_test, y_pred_test, alpha=0.6, color="blue", label="Predictions")

# Predicted == Actual reference line, spanning both the true and predicted ranges
lo = min(y_test.min(), y_pred_test.min())
hi = max(y_test.max(), y_pred_test.max())
plt.plot([lo, hi], [lo, hi], "r--", linewidth = 2, label="Ideal(y=x)")

# Labels and title
plt.xlabel("Actual Diabetes Progression")
plt.ylabel("Predicted Diabetes Progression")
plt.title("Actual vs. Predicted Diabetes Progression")
plt.legend(loc = "upper right")
plt.tight_layout()
plt.show()


In [None]:
# Report the parameters learned by gradient descent
print("Weights:", weights)
print("Bias:", bias)

# Bar chart with one bar per feature weight
x = np.arange(weights.size)
fig, ax = plt.subplots(figsize=(8, 3))
ax.bar(x, weights, edgecolor="black", width=0.6)
ax.set_xticks(x)
ax.set_xticklabels(data.feature_names, rotation=45, ha="right")
ax.set_xlabel("Features")
ax.set_ylabel("Coefficient Value")
ax.set_title("Linear Regression Coefficients by Feature")
fig.tight_layout()
plt.show()

# Rank features by the absolute value of their weight and report the largest
# and the smallest
important = np.absolute(weights)
most_param = important.argmax()
least_param = important.argmin()
print ("Most important: feature = ", feature_names[most_param],", coef =", weights[most_param])
print ("Least important: feature = ", feature_names[least_param],", coef =", weights[least_param])

## Use a new dataset (California Housing) to train a linear regression model

* Load the dataset from fetch_california_housing [1 point]
* Train a linear regression model using gradient descent [4 points]
* Report the model's performance on the test set [2 points].
* For this dataset, how do the data split (try 20/80 and 50/50 training/test split ratios) and hyper-parameters (learning rate, training epochs) affect the outcome? Show and interpret your results [6 points].




In [None]:
from sklearn.datasets import fetch_california_housing


def _fit_linear_gd(X_scaled, y_scaled, lr, epochs):
    """Fit a linear model by full-batch gradient descent on the MSE loss.

    Args:
        X_scaled: 2-D array of training inputs (samples x features).
        y_scaled: 1-D array of training targets, one per row of X_scaled.
        lr: Learning rate (step size) applied to every update.
        epochs: Number of full-batch updates to perform.

    Returns:
        Tuple (weights, bias) after the final update; both start from zero.
    """
    n_samples, n_features = X_scaled.shape
    weights = np.zeros(n_features)
    bias = 0.0
    for _ in range(epochs):
        residual = y_scaled - (X_scaled @ weights + bias)
        # Gradient of mean((y - Xw - b)^2) with respect to w and b
        weights -= lr * (-(2 / n_samples) * (X_scaled.T @ residual))
        bias -= lr * (-(2 / n_samples) * np.sum(residual))
    return weights, bias


def _run_housing_experiment(X_train, X_test, y_train, y_test, lr, epochs, label):
    """Scale, train, evaluate and plot the model for one train/test split.

    Args:
        X_train, X_test: Input arrays of the training and test splits.
        y_train, y_test: 1-D target arrays of the training and test splits.
        lr: Learning rate passed to gradient descent.
        epochs: Number of gradient-descent updates.
        label: Split name shown in the printed metric and the plot title.

    Returns:
        Tuple (weights, bias, y_pred_test, mse): the parameters learned on the
        standardised data, the test predictions mapped back to original target
        units, and the test MSE in original units.
    """
    # Scalers are fitted on the training split only and reused for the test split
    x_scaler = StandardScaler()
    X_train_s = x_scaler.fit_transform(X_train)
    X_test_s = x_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train_s = y_scaler.fit_transform(y_train.reshape(-1, 1)).reshape(-1)

    weights, bias = _fit_linear_gd(X_train_s, y_train_s, lr, epochs)

    # Predict in standardised units, then undo the target scaling before scoring
    y_pred_test_s = X_test_s @ weights + bias
    y_pred_test = y_scaler.inverse_transform(y_pred_test_s.reshape(-1, 1)).reshape(-1)
    mse = mean_squared_error(y_test, y_pred_test)
    print(f"[{label}] Test MSE: {mse:.4f}")

    # Actual-vs-predicted scatter with the y = x reference line
    plt.figure(figsize=(8, 3))
    plt.scatter(y_test, y_pred_test, alpha=0.5, label="Predictions")
    lo = min(y_test.min(), y_pred_test.min())
    hi = max(y_test.max(), y_pred_test.max())
    plt.plot([lo, hi], [lo, hi], "r--", linewidth=2, label="Ideal (y=x)")
    plt.xlabel("Actual Median Value ($100k)")
    plt.ylabel("Predicted Median Value ($100k)")
    plt.title(f"California Housing — Actual vs Predicted ({label}) | lr={lr}, epochs={epochs}")
    plt.legend(loc="upper right")
    plt.tight_layout()
    plt.show()

    return weights, bias, y_pred_test, mse


# Load datasets
data = fetch_california_housing()
X = data.data
Y = data.target
print("features:", X.shape)
print("labels:", Y.shape)

# GD hyperparams (shared by both splits so only the split ratio differs)
lr = 0.05
epochs = 500

# Train/test splits. The ratios are training/test and test_size is the held-out
# fraction, so a 20/80 training/test split needs test_size=0.8 (test_size=0.2
# would be 80/20). Both splits use the same seed.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X, Y, test_size=0.8, random_state=42  # 20/80 training/test
)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X, Y, test_size=0.5, random_state=42  # 50/50 training/test
)

# Split 1: 20/80
w1, b1, y_pred_test_1, mse_1 = _run_housing_experiment(
    X_train_1, X_test_1, y_train_1, y_test_1, lr, epochs, "20/80"
)

# Split 2: 50/50
w2, b2, y_pred_test_2, mse_2 = _run_housing_experiment(
    X_train_2, X_test_2, y_train_2, y_test_2, lr, epochs, "50/50"
)


# Part 2: Getting Familiar with PyTorch

In this section, you will learn to use essential PyTorch functions.

Make sure you have the required libraries installed. Run the cell below to check and install them if needed.

In [None]:
# Check and install required libraries
!pip install numpy pandas matplotlib scikit-learn torch --quiet

In [None]:
import torch
# Seed PyTorch's global random number generator with a fixed value so that
# random tensors created after this call are reproducible across runs
torch.manual_seed(0)

## PyTorch Tensor Construction

Let's start with some basic PyTorch tensor operations.

In [None]:
# Task 1: Create tensors with data
t1 = torch.ones((5, 3))    # 5x3 tensor filled with ones
t2 = torch.zeros((5, 3))   # 5x3 tensor filled with zeros
t3 = torch.eye(3)          # 3x3 identity matrix
t4 = torch.rand((3, 4))    # 3x4 tensor of random values
t5 = torch.arange(0, 7)    # 1-D tensor holding the integers in [0, 7)

# Show every tensor on its own line(s), in creation order
print(t1, t2, t3, t4, t5, sep="\n")

In [None]:
import numpy as np

# Task 2: Convert tensors from existing data and to numpy arrays
values = [1, 2, 3, 4]
t1 = torch.tensor(values)              # Tensor built from a Python list
t2 = torch.tensor(np.array(values))    # Tensor built from a numpy array
t3 = torch.clone(t2)                   # Copy of the existing tensor t2
t4 = t3.numpy()                        # numpy array view of tensor t3

# Move t3 to the GPU when one is available; otherwise it stays on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
t5 = t3.to(device)

print(t1, t2, t3, t4, t5, sep="\n")

## Pivoting and Reshaping tensors
In the following section we cover common methods used to pivot and reshape tensors, namely:
1. Flatten
1. Squeeze
1. Reshape
1. Transpose

In [None]:
# Random 3x4x5 tensor used by every operation below
t = torch.rand(3, 4, 5)
print(t)

# Task 3: Pivot and reshape tensors
t1 = torch.flatten(t)            # Collapse all dimensions into one (60 elements)
t2 = torch.unsqueeze(t, 0)       # Insert a new dimension at position 0 -> 1x3x4x5
t3 = torch.squeeze(t)            # Drop size-1 dimensions; t has none, so the shape stays 3x4x5
t4 = t.reshape((12, 5))          # Reinterpret the same 60 elements as 12x5
t5 = torch.transpose(t, 0, 1)    # Swap dimensions 0 and 1 -> 4x3x5

print(t1, t2, t3, t4, t5, sep="\n")

## Tensor Stack and Repeat
1. Cat
2. Stack
3. Repeat

In [None]:
tx = torch.rand(size=(3,4))
ty = torch.rand(size=(3,4))

# Task 4: Stack and repeat tensors
# cat joins along an EXISTING dimension: two 3x4 inputs -> 3x8
t1 = torch.cat([tx, ty], dim=1)
# stack joins along a NEW dimension: two 3x4 inputs -> 2x3x4.
# (torch.cat with dim=0 would instead give a 6x4 concatenation, not a stack.)
t2 = torch.stack([tx, ty], dim=0)
# repeat tiles tx once along dimension 0 and three times along dimension 1 -> 3x12
t3 = tx.repeat(1, 3)

print(t1)
print(t2)
print(t3)

## Mathematical Operations
1. Point-wise/element-wise operations
1. Reduction operations
1. Vector/Matrix operations

In [None]:
# Two random 3x4 matrices and one length-4 vector drawn from a normal distribution
tx = torch.rand(3, 4)
ty = torch.rand(3, 4)
tz = torch.randn(4)

# Task 5: Math operations
t1 = torch.mul(tx, ty)   # Element-wise product of tx and ty (3x4)
t2 = tx @ tz             # Matrix-vector product of tx and tz (length 3)
t3 = ty.sum(dim=1)       # Sum of ty along dimension 1 (length 3)

print(t1, t2, t3, sep="\n")

## Autograd
(Some of the content is borrowed from the PyTorch website on autograd: https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html)
`torch.autograd` is PyTorch’s automatic differentiation engine that computes gradients.

In [None]:
# Task 6.1: Compute gradient for scalar inputs
x = torch.tensor([2.], requires_grad=True)
y = x.pow(2)

# y holds a single element, so backward() can be called on it directly
y.backward()

# d(x^2)/dx = 2x, so the second line compares x.grad with 2*x
print(x.grad)
print(2*x == x.grad)

# Task 6.2: Compute gradient for vectors
x = torch.tensor([2., 3.], requires_grad=True)
y = x.pow(2)

# y has two elements, so reduce it to a scalar before back-propagating
total = y.sum()
total.backward()

print(x.grad)
print(2*x == x.grad)

`torch.autograd` tracks operations on all tensors which have their `requires_grad` flag set to `True`.
The output tensor of an operation will require gradients even if only a single input tensor has `requires_grad=True`.

In [None]:
# Only z is created with gradient tracking enabled
x = torch.rand((5, 5))
y = torch.rand((5, 5))
z = torch.rand(5, 5, requires_grad=True)

a = torch.add(x, y)   # neither input tracks gradients
b = torch.add(x, z)   # one input (z) tracks gradients

# Task 6.3: Check the gradients of a and b
a_grad, b_grad = a.requires_grad, b.requires_grad
print(f"Does `a` require gradients?: {a_grad}")
print(f"Does `b` require gradients?: {b_grad}")

## Understand dataset creation

Write down what functions in torch.utils.data can be used to create a dataset from tensors and load each batch of data for training and testing. (reference https://docs.pytorch.org/docs/stable/data.html)

ANSWER

## Understand PyTorch models
Write down what basic torch.nn function can be used for a linear model. (reference https://docs.pytorch.org/docs/stable/nn.html)

ANSWER

## Understand optimizers
Write down at least two optimizers in torch.optim that are useful to update model weight parameters. (reference https://docs.pytorch.org/docs/main/optim.html)


## Understand loss function
Write down two loss functions in torch.nn that can be used for binary classification. (reference https://docs.pytorch.org/docs/stable/nn.html)

ANSWER