In [1]:
import pandas as pd
import numpy as np


In [2]:
# laod data
df = pd.read_csv("multiple_linear_regression_dataset.csv")
print(df.head())
print(f'Column names = {df.columns}')
print(f'Shape = {df.shape}')

   age  experience  income
0   25           1   30450
1   30           3   35670
2   47           2   31580
3   32           5   40130
4   43          10   47830
Column names = Index(['age', 'experience', 'income'], dtype='str')
Shape = (20, 3)


Separate Inputs & outputs

In [3]:
# inputs => age, exp
# outputs => income
# feats model need to handle => 2 (input)

X = df.drop("income", axis=1)
y = df["income"]
print(X.shape)
print(y.shape)

(20, 2)
(20,)


Initialize model param

In [None]:
n_feat = X.shape[1]

w = np.zeros(n_feat)
b = 0.0

# Why do we need one weight per feature?  
# => To show importance of every feat

# Why is bias separate? 
# => offset
# Would initializing with large values be risky? 
# => No (WRONG!!! It would be risky as it wont converge)

Forward Pass (Prediction)

In [None]:
def predict(X, w, b):
    y_pred = X.dot(w) + b
    return y_pred

# Why is there no activation function? 
# => Becz we want the linear relationship (doesn't matter)

# What kind of values can y_hat take? 
# => continuous numbers

# How is this different from logistic regression? 
# => Predict number than prob

Loss function (MSE)

In [6]:
def mean_sq_err(y, y_pred):
    loss = np.mean((y_pred - y)**2)
    return loss

Compute Gradients

In [7]:
def compute_gradients(X, y, y_pred):
    n = len(y)
    
    dw = (2/n) * X.T.dot(y_pred-y)
    db = (2/n) * (y_pred-y).sum()
    
    return dw, db

# Why does X appear in dw but not in db?
# => dw depends on X (input feat) but b is constant only depends on err 

# Why does the error term appear everywhere?
# err tells the model ki kitna galat => minimise err 

# What happens if error is zero?
# err = 0 => gradient is also 0

Update Param (Gradient Descent)

In [8]:
def update_param(w, b, dw, db, lr):
    w = w - lr*dw
    b = b - lr*db

    return w, b

Training Loop

In [None]:
lr = 0.0001
epochs = 1000

for e in range(epochs):
    y_pred = predict(X, w, b)
    loss = mean_sq_err(y, y_pred)
    dw, db = compute_gradients(X, y, y_pred)
    w, b = update_param(w, b, dw, db, lr)
    
    if e % 100 == 0:
        print(f'Epoch {e}, Loss: {loss}')
        
# Does loss decrease over time?
# Yes, as the epoch inc, loss dec

# What happens if it increases?
# we chose the lr too high

# How do learning rate and epochs interact?
# low lr => higher epochs to converge

Epoch 0, Loss: 52437225.92316201
Epoch 100, Loss: 52229371.55442862
Epoch 200, Loss: 52056736.5554484
Epoch 300, Loss: 51907205.76492838
Epoch 400, Loss: 51772858.03029303
Epoch 500, Loss: 51648514.27565856
Epoch 600, Loss: 51530788.2170944
Epoch 700, Loss: 51417465.7126904
Epoch 800, Loss: 51307098.97667964
Epoch 900, Loss: 51198741.27339558


Final Eval

In [None]:
new_candidate = np.array([4.5,68])
predicted_salary = new_candidate.dot(w) + b
print(predicted_salary)

107418.97925148901
