In [1]:
%load_ext autoreload
%autoreload 2

In [40]:
import numpy as np
from diveai.plotting import PlotBuilder
from diveai.models import LinearRegression
from diveai.metrics import mean_squared_error, r2_score

# Linear Regression

## One Feature

In [3]:
np.random.seed(0)
X = 2 * np.random.rand(100, 1) # One feature
y = 4 + 3 * X + np.random.randn(100, 1) # y = 4 + 3x + noise

In [4]:
pb = PlotBuilder(title="Synthetic Data for Linear Regression", x_label="Feature", y_label="Target")
pb.add_plot(X.flatten(), y.flatten(), plot_type='scatter')
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'blue', 'opacity': 1, 'size': 5},
              'mode': 'markers',
              'type': 'scatter',
              'uid': '455342d2-6100-4b94-b42c-85b45505922a',
              'x': array([1.09762701, 1.43037873, 1.20552675, 1.08976637, 0.8473096 , 1.29178823,
                          0.87517442, 1.783546  , 1.92732552, 0.76688304, 1.58345008, 1.05778984,
                          1.13608912, 1.85119328, 0.14207212, 0.1742586 , 0.04043679, 1.66523969,
                          1.5563135 , 1.7400243 , 1.95723668, 1.59831713, 0.92295872, 1.56105835,
                          0.23654885, 1.27984204, 0.28670657, 1.88933783, 1.04369664, 0.82932388,
                          0.52911122, 1.54846738, 0.91230066, 1.1368679 , 0.0375796 , 1.23527099,
                          1.22419145, 1.23386799, 1.88749616, 1.3636406 , 0.7190158 , 0.87406391,
                          1.39526239, 0.12045094, 1.33353343, 1.34127574, 0.42076512, 0.2578526 ,
      

In [5]:
lr = LinearRegression(learning_rate=0.01, iterations=200)
logs = lr.fit(X, y)

In [6]:
print(f"weights = {lr.weights.flatten()}\nbias = {lr.bias}")

weights = [3.38137813]
bias = 3.7551723958321603


In [7]:
cost_logs = logs['cost']

pb = PlotBuilder(title="MSE vs. Iterations", x_label="Iterations", y_label="Mean Squared Error")
pb.add_plot(np.arange(len(cost_logs)), cost_logs, plot_type='line', label='MSE')
pb.show()

FigureWidget({
    'data': [{'line': {'color': 'blue'},
              'marker': {'color': 'blue', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'MSE',
              'type': 'scatter',
              'uid': '5f801aec-dbc9-4d1f-a123-35abe81edc95',
              'x': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                           14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                           28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                           42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                           56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                           70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                           84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                           98,  99, 100, 101, 102,

In [8]:
y_pred = lr.predict(X)

pb = PlotBuilder(title="Linear Regression Fit", x_label="Feature", y_label="Target")
pb.add_plot(X.flatten(), y.flatten(), plot_type='scatter', label='Data')
pb.add_plot(X.flatten(), y_pred.flatten(), plot_type='line', label='Best Fit Line', color='red')
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'blue', 'opacity': 1, 'size': 5},
              'mode': 'markers',
              'name': 'Data',
              'type': 'scatter',
              'uid': '3bd5b007-429a-4228-b9f8-dc2f4c5e7a00',
              'x': array([1.09762701, 1.43037873, 1.20552675, 1.08976637, 0.8473096 , 1.29178823,
                          0.87517442, 1.783546  , 1.92732552, 0.76688304, 1.58345008, 1.05778984,
                          1.13608912, 1.85119328, 0.14207212, 0.1742586 , 0.04043679, 1.66523969,
                          1.5563135 , 1.7400243 , 1.95723668, 1.59831713, 0.92295872, 1.56105835,
                          0.23654885, 1.27984204, 0.28670657, 1.88933783, 1.04369664, 0.82932388,
                          0.52911122, 1.54846738, 0.91230066, 1.1368679 , 0.0375796 , 1.23527099,
                          1.22419145, 1.23386799, 1.88749616, 1.3636406 , 0.7190158 , 0.87406391,
                          1.39526239, 0.12045094, 1.33353343, 1.34127574, 

## Two Features

In [9]:
np.random.seed(42)
X_multi = np.random.rand(100, 2) # Two features
y_multi = 3 + 5 * X_multi[:, 0] + 2 * X_multi[:, 1] + np.random.randn(100)

In [10]:
pb = PlotBuilder(title="Synthetic Data with Two Features", x_label='Feature 1', y_label="Feature 2", z_label='Target')
pb.add_plot(X_multi[:, 0], X_multi[:, 1], y_multi, plot_type='scatter', label='Actual Data')
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'blue', 'opacity': 1, 'size': 2.5},
              'mode': 'markers',
              'name': 'Actual Data',
              'type': 'scatter3d',
              'uid': '7a3145f3-dc6a-4c54-9ec0-f3e7cb6a1972',
              'x': array([0.37454012, 0.73199394, 0.15601864, 0.05808361, 0.60111501, 0.02058449,
                          0.83244264, 0.18182497, 0.30424224, 0.43194502, 0.61185289, 0.29214465,
                          0.45606998, 0.19967378, 0.59241457, 0.60754485, 0.06505159, 0.96563203,
                          0.30461377, 0.68423303, 0.12203823, 0.03438852, 0.25877998, 0.31171108,
                          0.54671028, 0.96958463, 0.93949894, 0.59789998, 0.0884925 , 0.04522729,
                          0.38867729, 0.82873751, 0.28093451, 0.14092422, 0.07455064, 0.77224477,
                          0.00552212, 0.70685734, 0.77127035, 0.35846573, 0.86310343, 0.33089802,
                          0.31098232, 0.72960618, 0.88721274, 0

In [11]:
lr = LinearRegression(learning_rate=0.01, iterations=500)
logs = lr.fit(X_multi, y_multi)

In [12]:
print(f"weights = {lr.weights.flatten()}\nbias = {lr.bias}")

weights = [3.8915348  2.03422671]
bias = 3.6771958608046504


In [13]:
cost_logs = logs['cost']

pb = PlotBuilder(title="MSE vs. Iterations", x_label="Iterations", y_label="Mean Squared Error")
pb.add_plot(np.arange(len(cost_logs)), cost_logs, plot_type='line', label='MSE')
pb.show()

FigureWidget({
    'data': [{'line': {'color': 'blue'},
              'marker': {'color': 'blue', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'MSE',
              'type': 'scatter',
              'uid': '755f1e97-69b0-42c0-80a3-23083459b685',
              'x': array([  0,   1,   2, ..., 497, 498, 499], shape=(500,)),
              'y': [46.06350456787621, 43.45557629865883, 41.00123254885531, ...,
                    1.184628705526663, 1.1840514754844358, 1.1834759861925912]}],
    'layout': {'template': '...',
               'title': {'text': 'MSE vs. Iterations'},
               'xaxis': {'title': {'text': 'Iterations'}},
               'yaxis': {'title': {'text': 'Mean Squared Error'}}}
})

In [14]:
y_pred = lr.predict(X_multi)

x1_range = np.linspace(X_multi[:, 0].min(), X_multi[:, 0].max(), 20)
x2_range = np.linspace(X_multi[:, 1].min(), X_multi[:, 1].max(), 20)
X1, X2 = np.meshgrid(x1_range, x2_range)
X1_flat, X2_flat = X1.ravel(), X2.ravel()

# Flatten and stack without bias
X_grid = np.column_stack([X1.ravel(), X2.ravel()])
# Get predictions
y_pred_grid = lr.predict(X_grid).reshape(X1.shape)

pb = PlotBuilder(title="Multivariate Linear Regression Trend", x_label='Feature 1', y_label="Feature 2", z_label='Target')
pb.add_plot(X_multi[:, 0], X_multi[:, 1], y_multi, plot_type='scatter', label='Actual Data')
pb.add_plot(X1, X2, y_pred_grid, plot_type='surface', color='red', opacity=0.5, colorscale="Reds", label='LR Plane')
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'blue', 'opacity': 1, 'size': 2.5},
              'mode': 'markers',
              'name': 'Actual Data',
              'type': 'scatter3d',
              'uid': '6eaaa814-982c-454c-9490-f90b9e60c382',
              'x': array([0.37454012, 0.73199394, 0.15601864, 0.05808361, 0.60111501, 0.02058449,
                          0.83244264, 0.18182497, 0.30424224, 0.43194502, 0.61185289, 0.29214465,
                          0.45606998, 0.19967378, 0.59241457, 0.60754485, 0.06505159, 0.96563203,
                          0.30461377, 0.68423303, 0.12203823, 0.03438852, 0.25877998, 0.31171108,
                          0.54671028, 0.96958463, 0.93949894, 0.59789998, 0.0884925 , 0.04522729,
                          0.38867729, 0.82873751, 0.28093451, 0.14092422, 0.07455064, 0.77224477,
                          0.00552212, 0.70685734, 0.77127035, 0.35846573, 0.86310343, 0.33089802,
                          0.31098232, 0.72960618, 0.88721274, 0

## Regularization

In [81]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

# Create base dataset with only 5 informative features
X, y = make_regression(
    n_samples=100,
    n_features=200,  # Total features (will add more)
    n_informative=5,
    noise=20,
    random_state=42
)

# Manually create 50 redundant features from existing informative ones
redundant_features = X[:, :5] @ np.random.randn(5, 50) + np.random.normal(0, 0.5, (100, 50))
X = np.hstack([X, redundant_features])

# Add 45 pure noise features
X = np.hstack([X, np.random.randn(100, 45)])

# Add extreme outliers
y[::10] += 500 * np.random.randn(10)

# Scale features
X = StandardScaler().fit_transform(X)

In [82]:
models = {
    "L1 (Lasso)": LinearRegression(learning_rate=0.01, iterations=1000, lambda_=10, regularization='l1'),
    "L2 (Ridge)": LinearRegression(learning_rate=0.01, iterations=1000, lambda_=10, regularization='l2'),
    "ElasticNet": LinearRegression(learning_rate=0.01, iterations=1000, lambda_=10, l1_ratio=0.5, regularization='elastic_net')
}

cost_logs = {}

for name, model in models.items():
    logs = model.fit(X, y)
    cost_logs[name] = logs['cost']
    y_pred = model.predict(X)
    print(f"\n{name}:")
    print(f"Selected features: {(np.abs(model.weights) > 1e-3).sum()}")
    print(f"MSE: {mean_squared_error(y, y_pred):.1f}")
    print(f"R2: {r2_score(y, y_pred):.1f}")
    print(f"Informative feature detection: {np.sum(np.abs(model.weights[:5]) > 1e-3)}/5")
    print(f"Max coefficient: {np.max(model.weights):.2f}")



L1 (Lasso):
Selected features: 272
MSE: 86876.0
R2: -198.5
Informative feature detection: 3/5
Max coefficient: 35.52

L2 (Ridge):
Selected features: 295
MSE: 84804.3
R2: -193.8
Informative feature detection: 5/5
Max coefficient: 33.50

ElasticNet:
Selected features: 284
MSE: 85806.3
R2: -196.1
Informative feature detection: 4/5
Max coefficient: 34.39


In [84]:
pb = PlotBuilder(title="Regularization in Linear Regression", x_label="Iterations", y_label="Cost")

for name, cost_log in cost_logs.items():
    pb.add_plot(np.arange(len(cost_log)), cost_log, plot_type='line', label=name)
pb.show()

FigureWidget({
    'data': [{'line': {'color': 'red'},
              'marker': {'color': 'red', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'L1 (Lasso)',
              'type': 'scatter',
              'uid': '2a0608fa-b438-4bad-a537-6a25cd95b381',
              'x': array([  0,   1,   2, ..., 997, 998, 999], shape=(1000,)),
              'y': [43861.89791073023, 39021.725311047776, 34959.56999513535, ...,
                    197.16340683251116, 197.1576916915374, 197.15171623677128]},
             {'line': {'color': 'blue'},
              'marker': {'color': 'blue', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'L2 (Ridge)',
              'type': 'scatter',
              'uid': 'd097a9c1-ca9d-486c-bdfa-c1ca6f1c5d40',
              'x': array([  0,   1,   2, ..., 997, 998, 999], shape=(1000,)),
              'y': [43854.86478436656, 39012.75229633009, 34948.86433617026, ...,
                    1162.5651481285431, 1162.