In [1]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score
import numpy as np

In [3]:
# Load the diabetes data directly as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

In [5]:
from sklearn.model_selection import train_test_split


In [7]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [9]:
# For GD we use SGDRegressor
from sklearn.linear_model import SGDRegressor

# Parameters:
### **`2.1. penalty='l2':`**
This specifies the regularization technique being used. The 'l2' penalty is the one used in Ridge regression (also known as Tikhonov regularization): a term proportional to the sum of squared weights (the squared L2 norm of the weights) is added to the loss that the model minimizes.\
This helps in preventing overfitting by penalizing large coefficients, thereby introducing some bias but reducing variance.\
**`2.2. max_iter=500:`**
This defines the maximum number of passes over the training data (epochs) the model will make while updating the weights using stochastic gradient descent.\
In this case, the model will run for at most 500 epochs; training can stop earlier if the stopping criterion (`tol`) is met.\
**`2.3. eta0=0.1:`**
This is the initial learning rate. It controls how big of a step the algorithm takes in the direction of the negative gradient during weight updates.\
A higher learning rate means larger steps, which can speed up convergence but may risk overshooting the optimal solution. A lower learning rate means smaller steps, which might make convergence slower but more stable.\
In this case, the initial learning rate is 0.1, which is a moderately fast step size.\
**`2.4. learning_rate='constant':`**
This controls how the learning rate changes during the training process.\
'constant' means the learning rate will stay the same (i.e., eta0 = 0.1) throughout all iterations.\
There are other options like 'optimal', 'invscaling', and 'adaptive', where the learning rate changes over time (e.g., decreases as training progresses).\
**`2.5. alpha=0.001:`**
This is the regularization strength for the L2 penalty (ridge regularization).\
The parameter alpha determines the amount of regularization applied to the model. A larger alpha applies stronger regularization, shrinking the model coefficients more, whereas a smaller alpha applies weaker regularization.\
In this case, alpha=0.001 applies light regularization, so the model is only slightly penalized for large weights.

In [11]:
# Linear model trained with stochastic gradient descent and an L2 penalty.
# random_state is fixed because SGD shuffles the training data each epoch;
# without a seed the fitted coefficients and the R2 score differ on every
# run, so the notebook would not reproduce under Restart & Run All.
reg = SGDRegressor(
    penalty='l2',              # ridge-style (squared L2) regularization
    max_iter=500,              # upper bound on passes over the training data
    eta0=0.1,                  # initial learning rate
    learning_rate='constant',  # keep the step size at eta0 throughout
    alpha=0.001,               # regularization strength
    random_state=4,            # same seed as the train/test split
)

In [13]:
# Train on the training split, then predict the held-out test split.
y_pred = reg.fit(X_train, y_train).predict(X_test)

# Report the test-set R2, followed by the learned weights and intercept.
print("R2 score", r2_score(y_test, y_pred))
print(reg.coef_)
print(reg.intercept_)

R2 score 0.39052713215255985
[  49.57361281 -167.30249902  375.17973247  275.0106658   -10.32405151
  -61.12919972 -168.88278633  138.52355946  336.95575794   90.2639166 ]
[136.6804212]


In [15]:
# Using Ridge

**Traditional solvers like 'cholesky' or 'svd' may not be efficient for large sparse datasets because they require more memory to store dense matrices or perform direct matrix inversion.**

In [22]:
from sklearn.linear_model import Ridge

# A small alpha like 0.001 means only slight regularization is applied,
# which helps prevent overfitting while still allowing flexibility to fit
# the data.
#
# The sparse_cg solver in Ridge regression is used primarily when dealing
# with large, sparse datasets.
reg = Ridge(
    alpha=0.001,
    max_iter=500,
    solver='sparse_cg',
)

In [24]:
# Fit the Ridge model and score it on the same held-out test split.
y_pred = reg.fit(X_train, y_train).predict(X_test)

# Test-set R2 first, then the fitted coefficients and intercept.
print("R2 score", r2_score(y_test, y_pred))
print(reg.coef_)
print(reg.intercept_)

R2 score 0.4625010162027918
[  34.52192778 -290.84083871  482.40181675  368.06786931 -852.44872818
  501.59160694  180.11115474  270.76334443  759.73534802   37.49135796]
151.101985182554
