In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
from sklearn.datasets import load_diabetes
warnings.filterwarnings('ignore')


we'll implement linear regression from scratch using gradient descent. Starting with dataset loading, we'll cover the mathematical foundations and step-by-step code implementation.

The goal is to understand how linear regression works, how gradient descent optimizes model parameters, and how to build it without high-level machine learning libraries.

Table of Contents
Importing Libraries
Setting up the necessary libraries for data manipulation, model implementation, and visualization.

Loading and Exploring the Dataset
Understanding the structure of the dataset and initial data exploration.

Preparing the Data
Preprocessing the data by scaling features and splitting into training and testing sets.

Initializing Parameters
Defining the initial parameters for the model, including weights and bias.

Defining the Prediction Function
Implementing the model's prediction function to make estimates based on input data.

Defining the Cost Function
Formulating the cost function to measure the accuracy of predictions against actual values.

Computing the Gradients
Calculating the gradients for weights and bias to optimize the cost function.

Updating Parameters Using Gradient Descent
Applying gradient descent to adjust parameters and minimize the cost function.

Training the Model
Training the model using the data and updating parameters through iterative optimization.

Evaluating Model Performance with Test Data
Assessing the model's performance using test data and relevant metrics.

Conclusion
Summarizing the key findings and insights from the model implementation.

Comparison with Sklearn Linear Regression
Side by side comparison of the algorithm that we've written with the algorithms predefined in sklearn to check performance

1. Importing Libraries
The following code imports essential libraries for linear regression and dataset loading:

numpy: For numerical computing and array manipulation.
load_diabetes: Loads the Diabetes dataset for regression tasks.
matplotlib.pyplot: For visualizations such as loss curves and predictions.

In [46]:
data = load_diabetes()


In [47]:
x = data.data
y = data.target

print("Features names: ",data.feature_names)
print("Target names: ",data.target)

Features names:  ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
Target names:  [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  

In [48]:
data.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [49]:
print(data['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [50]:
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler()
target_scaler = StandardScaler()
# x = feature_scaler.fit_transform(x)
# y = target_scaler.fit_transform(y.reshape(-1,1))
X,x_test,y,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [51]:
X.shape

(353, 10)

In [52]:
x_test.shape

(89, 10)

In [53]:
m,n = X.shape  # 353, n = 10
w = np.zeros(n) # weight vector (shape: [10,])
b = 0 # Bias term (scaler)

**Defining the prediction function**


In [54]:
def predict(X,w,b):
  return np.dot(X,w) + b

**Defining the cost function**

In [55]:
def compute_cost(X,y,w,b):
  m = len(y)
  y_pred = predict(X,w,b)
  cost = (1/(2*m))*np.sum((y_pred-y)**2)
  return cost

**Computing the Gradients**

In [56]:
def compute_gradients(X,y,w,b):
  m = len(y)
  y_pred = predict(X,w,b)
  error = y_pred - y
  dw = (1/2)*np.dot(X.T,error)
  db = (1/m)*np.sum(error)
  return dw,db

**Updating the parameters Using Gradient Descent**

In [57]:
def update_updates(w,b,dw,db,learning_rate):
  w = w - learning_rate*dw
  b = b - learning_rate*db
  return w,b


# This code performs the training of the linear regression model using gradient descent. It iteratively updates the model's parameters (weights and bias) and tracks the cost function:

# learning_rate: The step size for updating the parameters during each iteration. Set to 0.01.
# num_iterations: The number of iterations for the gradient descent process. Set to 1000.
# cost_history: A list that stores the cost value at each iteration to track the convergence of the model.
The training loop performs the following steps for each iteration:

# Prediction: Uses the predict function to compute the predicted target values (y_pred).
# Cost Calculation: Computes the cost (error) using the compute_cost function.
# Store Cost: Appends the cost to the cost_history list for tracking.
# Gradient Calculation: Computes the gradients for the weights and bias using compute_gradients.
# Parameter Update: Updates the weights and bias using the update_parameters function with the calculated gradients and learning rate.
# Iteration Logging: Every 100th iteration, prints the current iteration number and the corresponding cost.

**Training the model**

In [58]:
w = np.zeros(n)
b= 0

learning_rate = .001
num_iterations = 10000
cost_history = []

parameters = {}

for i in range(num_iterations):
  y_pred = predict(X,w,b)
  cost = compute_cost(X,y,w,b)
  dw,db = compute_gradients(X,y,w,b)
  w,b = update_updates(w,b,dw,db,learning_rate)

  if i % 1000 == 0:
    cost_history.append(cost)
    print(f'Iteration {i}: Cost = {cost:.4f}')

    parameters = {'weights': w.tolist(),'bias': b}



Iteration 0: Cost = 14855.6615
Iteration 1000: Cost = 3271.1328
Iteration 2000: Cost = 1757.3349
Iteration 3000: Cost = 1523.6064
Iteration 4000: Cost = 1475.9455
Iteration 5000: Cost = 1461.5489
Iteration 6000: Cost = 1455.5574
Iteration 7000: Cost = 1452.6001
Iteration 8000: Cost = 1451.0035
Iteration 9000: Cost = 1450.0832


In [59]:
parameters

{'weights': [41.605847988944575,
  -231.3991645351087,
  543.679891573393,
  335.5767831793388,
  -89.87164856959676,
  -129.99323505686326,
  -218.62796623956459,
  145.38495856264245,
  401.8013505763632,
  85.38120439585823],
 'bias': np.float64(151.29409390672157)}

In [60]:
import json
with open('parameters.json', 'w') as f:
  json.dump(parameters, f)

In [61]:
for i in range(10):
  print(int(y_test[i]), int(np.dot(x_test[i],parameters['weights'])+parameters['bias']))

219 140
70 180
202 140
230 292
111 121
84 94
242 255
272 189
94 84
96 112


**Evaluating the model peformance with test data**

In [62]:
y_pred = predict(X,w,b)
final_cost = compute_cost(X,y,w,b)
mse = 2 * final_cost


print(f'Final Cost: {final_cost:.4f}')
print(f'Mean Squared Error: {mse:.4f}')

print('Final Weights:',w)
print('Final Bias:',b)

Final Cost: 1449.5180
Mean Squared Error: 2899.0360
Final Weights: [  41.20118615 -234.44868198  547.59303735  337.32489304  -93.3354987
 -129.32353238 -217.87242467  146.52225533  406.15336339   79.22448324]
Final Bias: 151.2993378503322


In [63]:
y_pred_test = predict(x_test, w, b)

# Calculate metrics

print(f"Residual Analysis : {(np.abs(y_test - y_pred_test)).mean()}")

mae = np.abs(y_test - y_pred_test).mean()
mse = ((y_test - y_pred_test)**2).mean()
rmse = np.sqrt(((y_test - y_pred_test)**2).sum()/len(y_test))


print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


SS_res = np.sum((y_test - y_pred_test)**2)        # Sum of squares of residuals(diff bet actual and predicted values)
SS_tot = np.sum((y_test - np.mean(y_test))**2)    # Total sum of squares

r2_ = 1 - (SS_res / SS_tot)

print(f"R-squared: {r2_:.4f}")

Residual Analysis : 42.9550425765526
Mean Absolute Error (MAE): 42.9550
Mean Squared Error (MSE): 2879.8412
Root Mean Squared Error (RMSE): 53.6642
R-squared: 0.4564


**Comparison with Sklearn Linear Regression**

In [64]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

model = SGDRegressor(
    loss='squared_error',
    alpha=0.0,
    learning_rate='constant',  # Set to 'constant' for a fixed learning rate
    eta0=0.01,                 # Initial learning rate (will remain fixed)
    max_iter=1000,
    random_state=42
    # tol = None                    # no early stopping
  )


model.fit(X, y)  # Use training data for fitting

# 4. Make predictions on the test set
y_pred_test = model.predict(x_test)

# 5. Evaluate the model
mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

Mean Squared Error: 2957.7661
R-squared: 0.4417
