Q1) Generate a dataset with atleast seven highly correlated columns and a target variable. Implement  Ridge  Regression  using  Gradient  Descent  Optimization.  Take  different values of learning rate (such as 0.0001,0.001,0.01,0.1,1,10) and regularization 
parameter (10-15,10-10,10-5,10- 3,0,1,10,20). Choose the best parameters for which ridge regression cost function is minimum and R2_score is maximum.

In [40]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [41]:
n = 200
X1 = np.random.rand(n)
X2 = X1 + np.random.normal(0, 0.01, n)
X3 = X1 * 2 + np.random.normal(0, 0.01, n)
X4 = X1 + X2 + np.random.normal(0, 0.01, n)
X5 = X3 - X2 + np.random.normal(0, 0.01, n)
X6 = X4 + np.random.normal(0, 0.01, n)
X7 = X1 * 3 + np.random.normal(0, 0.01, n)
y = 5*X1 + 3*X2 + 2*X3 + np.random.normal(0, 0.05, n)

In [42]:
df = pd.DataFrame({'X1': X1,'X2': X2,'X3': X3,'X4': X4,'X5': X5,'X6': X6,'X7': X7,'y': y})
df = df.dropna()

X = df.drop(columns=['y']).values
y = df['y'].values.reshape(-1, 1)

scaler = StandardScaler()
X = scaler.fit_transform(X)

split = int(0.8 * n)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [43]:
def ridge_gradient_descent(X, y, lr, lam, iterations):
    m, n = X.shape
    X = np.c_[np.ones(m), X]
    beta = np.zeros((n + 1, 1))
    for _ in range(iterations):
        y_pred = X @ beta
        y_pred = np.nan_to_num(y_pred)
        error = y_pred - y
        gradient = (X.T @ error + lam * beta) / m
        gradient[0] = (X[:,0].T @ error) / m
        beta -= lr * gradient
    cost = (1/(2*m)) * np.sum((y_pred - y)**2) + (lam/(2*m))*np.sum(beta[1:]**2)
    return beta, cost

In [44]:
learning_rates = [0.0001, 0.001, 0.01, 0.1, 1, 10]
lambdas = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]
best_r2 = -np.inf
best_params = None
best_beta = None

for lr in learning_rates:
    for lam in lambdas:
        beta, cost = ridge_gradient_descent(X_train, y_train, lr, lam, iterations=1000)
        X_t = np.c_[np.ones(X_test.shape[0]), X_test]
        y_pred = X_t @ beta
        y_pred = np.nan_to_num(y_pred)
        r2 = r2_score(y_test, y_pred)
        if r2 > best_r2:
            best_r2 = r2
            best_params = (lr, lam, cost)
            best_beta = beta

print(f"Best Learning Rate: {best_params[0]}")
print(f"Best Lambda: {best_params[1]}")
print(f"Minimum Cost: {best_params[2]:.6f}")
print(f"Maximum R² Score: {best_r2:.6f}")

  gradient = (X.T @ error + lam * beta) / m
  y_pred = X @ beta
  gradient = (X.T @ error + lam * beta) / m
  gradient[0] = (X[:,0].T @ error) / m
  gradient[0] = (X[:,0].T @ error) / m
  beta -= lr * gradient
  gradient = (X.T @ error + lam * beta) / m
  y_pred = X @ beta
  gradient = (X.T @ error + lam * beta) / m
  gradient[0] = (X[:,0].T @ error) / m
  gradient[0] = (X[:,0].T @ error) / m
  beta -= lr * gradient
  gradient = (X.T @ error + lam * beta) / m
  y_pred = X @ beta
  gradient = (X.T @ error + lam * beta) / m
  gradient[0] = (X[:,0].T @ error) / m
  gradient[0] = (X[:,0].T @ error) / m
  beta -= lr * gradient
  gradient = (X.T @ error + lam * beta) / m
  y_pred = X @ beta
  gradient = (X.T @ error + lam * beta) / m
  gradient[0] = (X[:,0].T @ error) / m
  gradient[0] = (X[:,0].T @ error) / m
  beta -= lr * gradient
  gradient = (X.T @ error + lam * beta) / m
  y_pred = X @ beta
  gradient = (X.T @ error + lam * beta) / m
  gradient = (X.T @ error + lam * beta) / m
  gradie

Best Learning Rate: 0.1
Best Lambda: 1
Minimum Cost: 0.006932
Maximum R² Score: 0.999816


  gradient = (X.T @ error + lam * beta) / m
  gradient[0] = (X[:,0].T @ error) / m
  y_pred = X @ beta
  beta -= lr * gradient
  gradient = (X.T @ error + lam * beta) / m
  y_pred = X @ beta
  gradient = (X.T @ error + lam * beta) / m
  gradient[0] = (X[:,0].T @ error) / m
  gradient[0] = (X[:,0].T @ error) / m
  beta -= lr * gradient


Q3) Cross Validation for Ridge and Lasso Regression  
Explore  Ridge  Cross  Validation  (RidgeCV)  and  Lasso  Cross  Validation  (LassoCV) 
function of Python. Implement both on Boston House Prediction Dataset (load_boston 
dataset from sklearn.datasets).

In [45]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import r2_score

In [46]:
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge Regression with Cross Validation
alphas = [0.01, 0.1, 1, 10, 100]
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train_scaled, y_train)
ridge_pred = ridge_cv.predict(X_test_scaled)

In [47]:
print(" Ridge Regression ")
print("Best Alpha:", ridge_cv.alpha_)
print("R2 Score:", r2_score(y_test, ridge_pred))
print()

lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)
lasso_pred = lasso_cv.predict(X_test_scaled)

print(" Lasso Regression ")
print("Best Alpha:", lasso_cv.alpha_)
print("R2 Score:", r2_score(y_test, lasso_pred))

 Ridge Regression 
Best Alpha: 10.0
R2 Score: 0.5959440604913041

 Lasso Regression 
Best Alpha: 0.01
R2 Score: 0.5977137765065148


Q4) Multiclass Logistic Regression: Implement Multiclass Logistic Regression (step-by step) on Iris dataset using one vs. rest strategy? 

In [48]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [49]:
iris = load_iris()
X = iris.data
y = iris.target
classes = np.unique(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [51]:
def train_logistic_regression(X, y, lr=0.1, iterations=1000):
    m, n = X.shape
    X = np.c_[np.ones((m, 1)), X]
    weights = np.zeros(n + 1)
    
    for _ in range(iterations):
        z = np.dot(X, weights)
        h = sigmoid(z)
        gradient = (1/m) * np.dot(X.T, (h - y))
        weights -= lr * gradient
    return weights

In [52]:
all_weights = []
for c in classes:
    y_binary = np.where(y_train == c, 1, 0)
    weights = train_logistic_regression(X_train, y_binary, lr=0.1, iterations=2000)
    all_weights.append(weights)

In [53]:
def predict_one_vs_rest(X, all_weights):
    m = X.shape[0]
    X = np.c_[np.ones((m, 1)), X]
    probs = [sigmoid(np.dot(X, w)) for w in all_weights]
    probs = np.array(probs).T
    return np.argmax(probs, axis=1)

In [54]:
y_pred = predict_one_vs_rest(X_test, all_weights)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9555555555555556
Confusion Matrix:
 [[19  0  0]
 [ 0 11  2]
 [ 0  0 13]]
