Generate a dataset with atleast seven highly correlated columns and a target variable. Implement Ridge Regression using Gradient Descent Optimization. Take different values of learning rate (such as 0.0001,0.001,0.01,0.1,1,10) and regularization parameter (10-15,10-10,10-5,10- 3,0,1,10,20). Choose the best parameters for which ridge regression cost function is minimum and R2_score is maximum.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# ----- synthetic data -----
np.random.seed(0)
n = 500
z = np.random.randn(n, 1)
X = np.hstack([z + 0.01 * np.random.randn(n, 1) for _ in range(8)])
X = np.hstack([X, np.random.randn(n, 2)])
true_w = np.array([2.5, -1.2, 1.8, 0.0, 0.7, -0.5, 1.0, 0.3, 0.0, 0.0])
y = X.dot(true_w) + 0.5 * np.random.randn(n)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1)

# ----- preprocessing -----
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)

# ----- ridge regression helpers -----
def ridge_cost_grad(w, X, y, alpha):
    m = X.shape[0]
    Xb = np.c_[np.ones((m, 1)), X]
    preds = Xb.dot(w)
    err = preds - y
    cost = (1/(2*m)) * np.sum(err**2) + (alpha/(2*m)) * np.sum(w[1:]**2)
    grad = (1/m) * Xb.T.dot(err)
    grad[1:] += (alpha/m) * w[1:]
    return cost, grad

def ridge_gd_stable(X, y, alpha=1.0, lr=1e-3, n_iter=20000, clip=1e3, tol=1e-9, verbose=False):
    m, d = X.shape
    w = np.zeros(d + 1)
    costs = []
    for i in range(n_iter):
        cost, grad = ridge_cost_grad(w, X, y, alpha)
        if not np.isfinite(cost):
            if verbose: print(f"Stop: non-finite cost at iter {i}")
            return None, costs, False
        grad = np.clip(grad, -clip, clip)
        w -= lr * grad
        costs.append(cost)
        if not np.all(np.isfinite(w)):
            if verbose: print(f"Stop: non-finite weights at iter {i}")
            return None, costs, False
        if i > 1 and abs(costs[-2] - costs[-1]) < tol:
            return w, costs, True
    return w, costs, True

# ----- hyperparam sweep -----
lrs = [1e-4, 1e-3, 1e-2, 1e-1]
alphas = [1e-5, 1e-3, 0.01, 0.1, 1, 10]
summary = []

for lr in lrs:
    for a in alphas:
        w, history, ok = ridge_gd_stable(X_tr_s, y_tr, alpha=a, lr=lr, n_iter=20000, clip=1e4, tol=1e-10)
        if not ok or w is None:
            summary.append({'lr': lr, 'alpha': a, 'status': 'fail'})
            continue
        Xb_te = np.c_[np.ones((X_te_s.shape[0], 1)), X_te_s]
        preds = Xb_te.dot(w)
        if not np.all(np.isfinite(preds)):
            summary.append({'lr': lr, 'alpha': a, 'status': 'bad_preds'})
            continue
        mse = mean_squared_error(y_te, preds)
        r2 = r2_score(y_te, preds)
        summary.append({
            'lr': lr, 'alpha': a, 'status': 'ok',
            'final_cost': history[-1], 'mse_test': mse, 'r2_test': r2,
            'iters': len(history)
        })

results = pd.DataFrame(summary)
print(results.sort_values(['status', 'r2_test'], ascending=[True, False]).head(10))


       lr     alpha status  final_cost  mse_test   r2_test  iters
10  0.001   1.00000     ok    0.120167  0.219727  0.989340  20000
9   0.001   0.10000     ok    0.117177  0.219742  0.989339  20000
8   0.001   0.01000     ok    0.116878  0.219744  0.989339  20000
7   0.001   0.00100     ok    0.116848  0.219744  0.989339  20000
6   0.001   0.00001     ok    0.116844  0.219744  0.989339  20000
16  0.010   1.00000     ok    0.120081  0.219756  0.989338  20000
11  0.001  10.00000     ok    0.149976  0.219757  0.989338  20000
17  0.010  10.00000     ok    0.149970  0.219761  0.989338   8772
23  0.100  10.00000     ok    0.149970  0.219762  0.989338   1336
15  0.010   0.10000     ok    0.117040  0.219779  0.989337  20000


#### Load the Hitters dataset from the following link
https://gist.githubusercontent.com/keeganhines/59974f1ebef97bbaa44fb19143f90bad/raw/Hitters.csv

(a) Pre-process the data (null values, noise, categorical to numerical encoding)

(b) Separate input and output features and perform scaling

(c) Fit a Linear, Ridge (use regularization parameter as 0.5748), and LASSO (use regularization parameter as 0.5748) regression function on the dataset.

(d) Evaluate the performance of each trained model on test set. Which model performs the best and Why?

In [2]:
#Q2 hitters regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# --- Load dataset ---
url = "https://gist.githubusercontent.com/keeganhines/59974f1ebef97bbaa44fb19143f90bad/raw/Hitters.csv"
df = pd.read_csv(url)

print("Shape:", df.shape)
print(df.head())

# --- Clean and preprocess ---
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
df = df.dropna(subset=['Salary']).reset_index(drop=True)

# Drop name column if it exists
if df.columns[0].lower() not in [c.lower() for c in ['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI']]:
    df = df.iloc[:, 1:]

cat_cols = [c for c in df.select_dtypes(include=['object']).columns if c not in ['Player', 'Name']]
print("Categorical cols:", cat_cols)

X = df.drop(columns=['Salary'])
y = df['Salary'].values

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

preproc = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols)
])

# --- Split & train ---
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

alpha = 0.5748
pipelines = {
    'Linear': Pipeline([('prep', preproc), ('model', LinearRegression())]),
    'Ridge' : Pipeline([('prep', preproc), ('model', Ridge(alpha=alpha))]),
    'Lasso' : Pipeline([('prep', preproc), ('model', Lasso(alpha=alpha, max_iter=10000))])
}

for name, pipe in pipelines.items():
    pipe.fit(X_tr, y_tr)
    preds = pipe.predict(X_te)
    mse = mean_squared_error(y_te, preds)
    r2 = r2_score(y_te, preds)
    print(f"{name}: RMSE={np.sqrt(mse):.3f} | R²={r2:.3f}")


Shape: (322, 21)
          Unnamed: 0  AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  \
0     -Andy Allanson    293    66      1    30   29     14      1     293   
1        -Alan Ashby    315    81      7    24   38     39     14    3449   
2       -Alvin Davis    479   130     18    66   72     76      3    1624   
3      -Andre Dawson    496   141     20    65   78     37     11    5628   
4  -Andres Galarraga    321    87     10    39   42     30      2     396   

   CHits  ...  CRuns  CRBI  CWalks  League Division PutOuts  Assists  Errors  \
0     66  ...     30    29      14       A        E     446       33      20   
1    835  ...    321   414     375       N        W     632       43      10   
2    457  ...    224   266     263       A        W     880       82      14   
3   1575  ...    828   838     354       N        E     200       11       3   
4    101  ...     48    46      33       N        E     805       40       4   

   Salary  NewLeague  
0     NaN       

#### Cross Validation for Ridge and Lasso Regression

Explore Ridge Cross Validation (RidgeCV) and Lasso Cross Validation (LassoCV) function of Python. Implement both on Boston House Prediction Dataset (load_boston dataset from sklearn.datasets).

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# --- Load dataset ---
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
    X, y = boston.data, boston.target
except Exception:
    url = "http://lib.stat.cmu.edu/datasets/boston"
    raw = pd.read_csv(url, sep=r"\s+", header=None, skiprows=22)
    data = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
    X, y = data[:, :-1], data[:, -1]

# --- Scale and split ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# --- Ridge Regression with CV ---
alpha_grid = np.logspace(-6, 6, 200)
ridge = RidgeCV(alphas=alpha_grid, store_cv_results=True)
ridge.fit(X_train, y_train)

ridge_preds = ridge.predict(X_test)
print(f"RidgeCV - best α = {ridge.alpha_:.6f}")
print(f"R² = {r2_score(y_test, ridge_preds):.4f} | MSE = {mean_squared_error(y_test, ridge_preds):.4f}")

# --- Lasso Regression with CV ---
lasso = LassoCV(cv=5, max_iter=10000)
lasso.fit(X_train, y_train)

lasso_preds = lasso.predict(X_test)
print(f"\nLassoCV - best α = {lasso.alpha_:.6f}")
print(f"R² = {r2_score(y_test, lasso_preds):.4f} | MSE = {mean_squared_error(y_test, lasso_preds):.4f}")

RidgeCV - best α = 26.126752
R² = 0.6905 | MSE = 16.0824

LassoCV - best α = 0.047088
R² = 0.6948 | MSE = 15.8604


#### Multiclass Logistic Regression:
Implement Multiclass Logistic Regression (step-by step) on Iris dataset using one vs. rest strategy?

In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Load data ---
iris = load_iris()
X, y = iris.data, iris.target

# --- Scale & split ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_tr, X_te, y_tr, y_te = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# --- Sigmoid & training ---
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fit_binary_logistic(X, y_bin, lr=0.1, steps=5000, tol=1e-6):
    m, n = X.shape
    Xb = np.c_[np.ones((m, 1)), X]
    w = np.zeros(n + 1)
    for _ in range(steps):
        preds = sigmoid(Xb @ w)
        grad = (1/m) * Xb.T.dot(preds - y_bin)
        w -= lr * grad
        if np.linalg.norm(grad) < tol:
            break
    return w

# --- Train one-vs-rest classifiers ---
K = len(np.unique(y_tr))
W = []
for k in range(K):
    y_k = (y_tr == k).astype(int)
    w_k = fit_binary_logistic(X_tr, y_k, lr=0.3, steps=10000)
    W.append(w_k)
W = np.vstack(W)

# --- Predict ---
def predict_multiclass(X, W):
    Xb = np.c_[np.ones((X.shape[0], 1)), X]
    probs = sigmoid(Xb @ W.T)
    return np.argmax(probs, axis=1)

y_pred = predict_multiclass(X_te, W)

# --- Evaluation ---
print(f"Accuracy: {accuracy_score(y_te, y_pred):.4f}\n")
print("Classification Report:")
print(classification_report(y_te, y_pred, target_names=iris.target_names))
print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
