### 3.1. Gradient Descent on Salary Dataset

### 1. Imports

In [119]:
import numpy as np
import matplotlib.pyplot as plt
import math, copy

from sklearn.model_selection import train_test_split

### 2. Get and separate data

In [120]:
# Read the first two columns of the CSV as floats; the first row is skipped as a header.
data = np.loadtxt(
    'salary_data.csv',
    delimiter=',',
    skiprows=1,
    usecols=(0, 1),
)
# Column 0 becomes the model input X, column 1 the target y.
X, y = data[:, 0], data[:, 1]
for column in (X, y):
    print(column)

[ 1.1  1.3  1.5  2.   2.2  2.9  3.   3.2  3.2  3.7  3.9  4.   4.   4.1
  4.5  4.9  5.1  5.3  5.9  6.   6.8  7.1  7.9  8.2  8.7  9.   9.5  9.6
 10.3 10.5]
[ 39343.  46205.  37731.  43525.  39891.  56642.  60150.  54445.  64445.
  57189.  63218.  55794.  56957.  57081.  61111.  67938.  66029.  83088.
  81363.  93940.  91738.  98273. 101302. 113812. 109431. 105582. 116969.
 112635. 122391. 121872.]


### 3. Train test split

In [121]:
# Build nine train/test splits whose test fraction shrinks from 0.9 to 0.1.
# The same random_state is reused for every split so the shuffling is reproducible.
splits = []
for i in range(9):
    test_fraction = 1 - 0.1 * (i + 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_fraction,
        random_state=42,
    )
    print(f'Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}')
    splits.append((X_train, X_test, y_train, y_test))

Train size: 3, Test size: 27
Train size: 6, Test size: 24
Train size: 9, Test size: 21
Train size: 12, Test size: 18
Train size: 15, Test size: 15
Train size: 18, Test size: 12
Train size: 21, Test size: 9
Train size: 24, Test size: 6
Train size: 27, Test size: 3


### 4. Functions

In [122]:
def compute_cost(x, y, w, b):
    """Return the halved mean squared error of the linear model ``w * x + b``.

    Args:
        x: NumPy array of inputs; its first dimension is the sample count.
        y: Targets, indexed in parallel with ``x``.
        w: Model weight.
        b: Model bias.

    Returns:
        The sum of squared residuals divided by ``2 * m``, where ``m`` is
        ``x.shape[0]``.
    """
    n_samples = x.shape[0]
    squared_error_sum = 0

    for idx in range(n_samples):
        # Residual of the prediction for sample idx.
        residual = (np.dot(w, x[idx]) + b) - y[idx]
        squared_error_sum = squared_error_sum + residual ** 2

    return squared_error_sum / (2 * n_samples)

In [123]:
def compute_gradient(x, y, w, b):
    """Return the gradient of the halved-MSE cost for the model ``w * x + b``.

    Args:
        x: NumPy array of inputs; its first dimension is the sample count.
        y: Targets, indexed in parallel with ``x``.
        w: Model weight.
        b: Model bias.

    Returns:
        A ``(dj_dw, dj_db)`` tuple: the per-sample error times the input,
        and the per-sample error, each averaged over all samples.
    """
    n_samples = x.shape[0]
    grad_w_sum = 0
    grad_b_sum = 0

    for idx in range(n_samples):
        error = (np.dot(w, x[idx]) + b) - y[idx]
        grad_w_sum = grad_w_sum + error * x[idx]
        grad_b_sum = grad_b_sum + error

    return grad_w_sum / n_samples, grad_b_sum / n_samples

In [124]:
def gradient_descent(x, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, verbose=True, tol=1e-3):
    """Fit ``w`` and ``b`` by batch gradient descent with early stopping.

    Args:
        x: Training inputs, passed through to the cost/gradient callables.
        y: Training targets, passed through to the cost/gradient callables.
        w_in: Initial weight (deep-copied, so the caller's object is untouched).
        b_in: Initial bias.
        cost_function: Callable ``(x, y, w, b) -> cost``.
        gradient_function: Callable ``(x, y, w, b) -> (dj_dw, dj_db)``.
        alpha: Learning rate.
        num_iters: Maximum number of update steps.
        verbose: When True, print progress and the early-stopping notice.
            When False, nothing is printed.
        tol: Stop as soon as the absolute change in cost between two
            consecutive iterations drops below this value.

    Returns:
        ``(w, b, J_history, w_history)`` where ``J_history`` holds the cost
        after each of the first 100000 iterations and ``w_history`` holds the
        weight sampled roughly every 1% of ``num_iters``. Both histories are
        recorded regardless of ``verbose``.
    """
    max_cost_history = 100000  # cap on stored costs to prevent resource exhaustion
    # Sampling/printing interval; at least 1 so the modulo below is always valid.
    log_every = max(1, math.ceil(num_iters / 100))

    J_history = []
    w_history = []
    w = copy.deepcopy(w_in)  # avoid modifying the caller's w within the function
    b = b_in
    prev_cost = None

    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(x, y, w, b)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        # The cost is evaluated on every iteration (not only while the history
        # is being stored) so the convergence check never compares stale values.
        cost = cost_function(x, y, w, b)
        if i < max_cost_history:
            J_history.append(cost)

        if i % log_every == 0:
            # Record the weight even in quiet mode; only the printing is optional.
            w_history.append(w)
            if verbose:
                print(f"Iteration {i:4}: Cost {float(cost):8.2f}   ")

        # Early stopping using the change in cost since the previous iteration.
        if prev_cost is not None and abs(cost - prev_cost) < tol:
            if verbose:
                print(f"Early stopping at iteration {i}")
            break
        prev_cost = cost

    return w, b, J_history, w_history

### 5. Test gradient descent functions

In [125]:
# Use the split at index 6 (21 training / 9 test samples in the split summary) for the sanity checks in this section.
X_train, X_test, y_train, y_test = splits[6]

In [126]:
# Sanity-check compute_cost with some non-zero initial values for the parameters w, b
initial_w = 2
initial_b = 1

cost = compute_cost(X_train, y_train, initial_w, initial_b)
# Show the return type as well as the value
print(type(cost))
print(f'Cost at initial w: {cost:.3f}')

<class 'numpy.float64'>
Cost at initial w: 2863557882.699


In [127]:
# Compute and display the gradient with both w and b initialized to zero
initial_w = 0
initial_b = 0

tmp_dj_dw, tmp_dj_db = compute_gradient(X_train, y_train, initial_w, initial_b)
print('Gradient at initial w, b (zeros):', tmp_dj_dw, tmp_dj_db)

Gradient at initial w, b (zeros): -413134.0809523809 -71057.33333333333


In [128]:
# Test run: fit the currently selected split starting from zero-initialized parameters
w_init = 0
b_init = 0
iterations = 100000
tmp_alpha = 0.001
# Run gradient descent with the default verbose=True, so the cost is printed periodically while it runs
w_final, b_final, J_hist, p_hist = gradient_descent(X_train, y_train, w_init, b_init, compute_cost, compute_gradient, tmp_alpha, iterations)
print(f"(w,b) found by gradient descent: ({w_final:8.4f},{b_final:8.4f})")

Iteration    0: Cost 2691501095.52   
Iteration 1000: Cost 57364145.40   
Iteration 2000: Cost 41207544.10   
Iteration 3000: Cost 31135261.76   
Iteration 4000: Cost 24856040.69   
Iteration 5000: Cost 20941474.37   
Iteration 6000: Cost 18501071.43   
Iteration 7000: Cost 16979685.41   
Iteration 8000: Cost 16031229.16   
Iteration 9000: Cost 15439946.44   
Iteration 10000: Cost 15071331.39   
Iteration 11000: Cost 14841530.90   
Iteration 12000: Cost 14698269.61   
Iteration 13000: Cost 14608958.25   
Iteration 14000: Cost 14553280.13   
Iteration 15000: Cost 14518569.50   
Iteration 16000: Cost 14496930.34   
Iteration 17000: Cost 14483440.14   
Iteration 18000: Cost 14475030.14   
Iteration 19000: Cost 14469787.21   
Iteration 20000: Cost 14466518.69   
Iteration 21000: Cost 14464481.03   
Iteration 22000: Cost 14463210.73   
Iteration 23000: Cost 14462418.80   
Iteration 24000: Cost 14461925.10   
Iteration 25000: Cost 14461617.32   
Iteration 26000: Cost 14461425.45   
Iteration

### 6. Find and Save Parameters for all splits

In [129]:
# Hyperparameters are the same for every split, so they are set once before the loop.
# NOTE(review): the verbose run on splits[6] with this learning rate was still lowering
# the cost well past iteration 10000, so these fits may not be fully converged —
# confirm that 10000 iterations is intended.
w_init = 0
b_init = 0
iterations = 10000
tmp_alpha = 0.001

parameters = []
for split in splits:
    X_train, X_test, y_train, y_test = split
    n_train = X_train.shape[0]
    # Fraction of all samples that ended up in the training set.
    pct = n_train / (n_train + X_test.shape[0])
    # Fit quietly (verbose=False) on this split's training data only.
    w_final, b_final, J_hist, p_hist = gradient_descent(
        X_train, y_train,
        w_init, b_init,
        compute_cost, compute_gradient,
        tmp_alpha, iterations,
        verbose=False,
    )
    print(f"(w,b) found by gradient descent for pct = {pct:.2f}: ({w_final:8.4f},{b_final:8.4f})")
    # Stored as (weight, bias), in the same order as the splits list.
    parameters.append((w_final, b_final))

(w,b) found by gradient descent for pct = 0.10: (13184.9772,11788.9591)
(w,b) found by gradient descent for pct = 0.20: (12275.8822,13995.6238)
(w,b) found by gradient descent for pct = 0.30: (10098.0898,22077.0060)
(w,b) found by gradient descent for pct = 0.40: (9985.3764,22434.8584)
(w,b) found by gradient descent for pct = 0.50: (9616.1976,25191.4029)
(w,b) found by gradient descent for pct = 0.60: (9736.4901,23840.7755)
(w,b) found by gradient descent for pct = 0.70: (9693.9244,23672.8649)
(w,b) found by gradient descent for pct = 0.80: (9769.3088,22991.8389)
(w,b) found by gradient descent for pct = 0.90: (9767.5589,23515.1411)


### 7. Evaluation Functions and Utils

In [130]:
def predict(w0, w1, x):
    """Evaluate the line ``w0 + w1 * x`` (``w0`` intercept, ``w1`` slope)."""
    slope_term = w1 * x
    return w0 + slope_term

def rss(y_true, y_pred):
    """Return the residual sum of squares between targets and predictions."""
    residuals = y_true - y_pred
    return np.sum(residuals ** 2)


def r2_score(y_true, y_pred):
    """Return the coefficient of determination, ``1 - SS_res / SS_tot``.

    ``SS_res`` is the residual sum of squares from ``rss`` and ``SS_tot`` is
    the sum of squared deviations of ``y_true`` from its own mean.
    """
    residual_ss = rss(y_true, y_pred)
    centered = y_true - y_true.mean()
    total_ss = np.sum(centered ** 2)
    return 1 - residual_ss / total_ss

### 8. Plots and saving

In [132]:
import os

feature_col = 'YearsExperience'
target_col = 'Salary'
PLOT_DIR = 'ass3_1/plots'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(PLOT_DIR, exist_ok=True)

# Records collected per split; turned into tables/CSVs in a later cell.
param_rows = []
metric_rows = []
pred_rows = []

# One colour per split for the overlay plot.
line_colors = plt.cm.viridis(np.linspace(0, 1, len(splits)))

# Every hypothesis line is drawn over the full feature range.
x_line = np.linspace(X.min(), X.max(), 100)

# For overlay plot of all hypotheses
fig_all, ax_all = plt.subplots(figsize=(7,5))
ax_all.set_title('Regression Lines Across Training Splits')
ax_all.set_xlabel(feature_col)
ax_all.set_ylabel(target_col)
ax_all.scatter(X, y, s=25, c='lightgray', label='All data')

# Enumerate so each split gets its own colour. The previous version indexed
# line_colors with a leftover global loop variable, which gave every line the
# same colour.
for split_idx, (theta, split) in enumerate(zip(parameters, splits)):
    # parameters stores (w, b): slope first, intercept second.
    w1, w0 = theta
    X_train, X_test, y_train, y_test = split
    y_train_pred = predict(w0, w1, X_train)
    y_test_pred = predict(w0, w1, X_test)

    # Mean RSS (RSS divided by sample count) makes train and test comparable
    # even though the two sets have different sizes.
    train_rss_mean = rss(y_train, y_train_pred) / len(y_train)
    test_rss_mean = rss(y_test, y_test_pred) / len(y_test)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # round() rather than int() so float error (e.g. 29.999...) cannot
    # truncate the percentage to the wrong integer.
    pct = round(100 * len(X_train) / (len(X_train) + len(X_test)))

    param_rows.append({
        'train_pct': pct,
        'w0': w0,
        'w1': w1,
        'n_train': len(X_train),
        'n_test': len(X_test)
    })

    metric_rows.append({
        'train_pct': pct,
        'train_rss_mean': train_rss_mean,
        'test_rss_mean': test_rss_mean,
        'train_r2': train_r2,
        'test_r2': test_r2
    })

    # One prediction record per sample: train rows first, then test rows.
    for set_name, xs, ys, y_hats in (
        ('train', X_train, y_train, y_train_pred),
        ('test', X_test, y_test, y_test_pred),
    ):
        pred_rows.extend(
            {
                'train_pct': pct,
                'set': set_name,
                'x': xv,
                'y': yv,
                'y_hat': yhat
            }
            for xv, yv, yhat in zip(xs, ys, y_hats)
        )

    # Individual plot for this split
    y_line = predict(w0, w1, x_line)
    fig, ax = plt.subplots(figsize=(6,4))
    ax.scatter(X_train, y_train, c='blue', alpha=0.7, label='Train')
    ax.scatter(X_test, y_test, c='orange', alpha=0.7, label='Test')
    ax.plot(x_line, y_line, color='red', label=f'Hypothesis (w0={w0:.2f}, w1={w1:.2f})')
    ax.set_title(f'Train % = {pct}')
    ax.set_xlabel(feature_col)
    ax.set_ylabel(target_col)
    ax.legend()
    fname = os.path.join(PLOT_DIR, f'hypothesis_{pct}pct.png')
    fig.tight_layout()
    fig.savefig(fname, dpi=120)
    # Close each per-split figure so figures do not pile up in memory.
    plt.close(fig)

    # Add line to overlay plot
    ax_all.plot(x_line, y_line, color=line_colors[split_idx], label=f'{pct}% (w1={w1:.2f})')

# Finish overlay plot
ax_all.legend(fontsize='small', ncol=2)
fig_all.tight_layout()
fig_all.savefig(os.path.join(PLOT_DIR, 'all_hypotheses.png'), dpi=130)
plt.close(fig_all)

### 9. Saving CSV

In [133]:
import os

import pandas as pd

SAVE_DIR = 'ass3_1'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(SAVE_DIR, exist_ok=True)

# One table per record list collected while plotting.
params_df = pd.DataFrame(param_rows)
metrics_df = pd.DataFrame(metric_rows)
predictions_df = pd.DataFrame(pred_rows)

params_csv = f'{SAVE_DIR}/parameters.csv'
metrics_csv = f'{SAVE_DIR}/metrics.csv'
predictions_csv = f'{SAVE_DIR}/results.csv'

# index=False keeps the row index out of the files.
for frame, csv_path in (
    (params_df, params_csv),
    (metrics_df, metrics_csv),
    (predictions_df, predictions_csv),
):
    frame.to_csv(csv_path, index=False)

print(f'Saved {params_csv}, {metrics_csv}, {predictions_csv}')

Saved ass3_1/parameters.csv, ass3_1/metrics.csv, ass3_1/results.csv


### 10. Training pct vs RSS and R2 Plots

In [134]:
# Shared x-axis for both figures: the training percentage of each split.
train_pct = metrics_df['train_pct']

# Figure 1: mean RSS on the train and test sets versus training percentage.
fig_rss, ax_rss = plt.subplots(figsize=(6,4))
for column, marker, label in (
    ('train_rss_mean', 'o', 'Train Mean RSS'),
    ('test_rss_mean', 's', 'Test Mean RSS'),
):
    ax_rss.plot(train_pct, metrics_df[column], marker=marker, label=label)
ax_rss.set(xlabel='Training %', ylabel='Mean RSS', title='Training % vs Mean RSS')
ax_rss.legend()
fig_rss.tight_layout()
fig_rss.savefig(os.path.join(PLOT_DIR, 'training_pct_vs_mean_rss.png'), dpi=130)
plt.close(fig_rss)

# Figure 2: R^2 on the train and test sets versus training percentage.
fig_r2, ax_r2 = plt.subplots(figsize=(6,4))
for column, marker, label in (
    ('train_r2', 'o', 'Train R2'),
    ('test_r2', 's', 'Test R2'),
):
    ax_r2.plot(train_pct, metrics_df[column], marker=marker, label=label)
ax_r2.set(xlabel='Training %', ylabel='R^2', title='Training % vs R^2')
ax_r2.legend()
fig_r2.tight_layout()
fig_r2.savefig(os.path.join(PLOT_DIR, 'training_pct_vs_r2.png'), dpi=130)
plt.close(fig_r2)

print('Saved performance plots.')

Saved performance plots.
