In [1]:
%load_ext autoreload
%autoreload 2

In [21]:
import numpy as np
from diveai.plotting import PlotBuilder, HeatmapPlotBuilder
from diveai.models import LogisticRegression
from diveai.metrics import accuracy_score, r2_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Logistic Regression

## One Feature

In [3]:
np.random.seed(0)
X = np.random.randn(100, 1)
y = (X.flatten() + np.random.randn(100)*0.5 > 0).astype(int)

In [4]:
pb = PlotBuilder(title="Synthetic Data for Logistic Regression", x_label="Feature", y_label="Target")
class_0, class_1 = X[y == 0].flatten(), X[y == 1].flatten()
pb.add_plot(class_0, np.zeros_like(class_0), plot_type='scatter', label='Class 0', marker_symbol='circle', color='navy', size=10)
pb.add_plot(class_1, np.ones_like(class_1), plot_type='scatter', label='Class 1', marker_symbol='triangle-up', color='green', size=10)
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'navy', 'opacity': 1, 'size': 10, 'symbol': 'circle'},
              'mode': 'markers',
              'name': 'Class 0',
              'type': 'scatter',
              'uid': '4a891e12-a14b-4b40-bb83-666f1540bdf8',
              'x': array([ 0.40015721, -0.97727788, -0.15135721, -0.20515826, -0.85409574,
                          -2.55298982, -0.74216502, -1.45436567,  0.04575852,  0.15494743,
                          -0.88778575, -1.98079647, -0.34791215, -0.38732682, -0.30230275,
                          -1.04855297, -1.42001794, -1.70627019, -1.25279536, -1.61389785,
                          -0.21274028, -0.89546656, -0.51080514, -1.18063218, -0.02818223,
                           0.3024719 , -0.63432209, -0.36274117, -0.67246045, -0.81314628,
                          -1.7262826 ,  0.17742614, -1.63019835,  0.46278226, -0.90729836,
                          -1.23482582,  0.40234164, -0.68481009, -0.87079715, -0.57884966,
        

In [5]:
model = LogisticRegression(learning_rate=0.1, iterations=2000)
logs = model.fit(X, y)

y_pred = model.predict(X)

accuracy = accuracy_score(y.flatten(), y_pred.flatten())

# Print results
print("Predictions:", y_pred.flatten())
print("Accuracy:", accuracy)

Predictions: [1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0
 1 0 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1]
Accuracy: 0.86


In [6]:
cost_logs = logs['cost']

pb = PlotBuilder(title="Cross Entropy Loss vs. Iterations", x_label="Iterations", y_label="Cross Entropy Loss")
pb.add_plot(np.arange(len(cost_logs)), cost_logs, plot_type='line', label='MSE')
pb.show()

FigureWidget({
    'data': [{'line': {'color': 'red'},
              'marker': {'color': 'red', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'MSE',
              'type': 'scatter',
              'uid': '841658a8-bd19-4b2c-976e-a91a59a3b766',
              'x': array([   0,    1,    2, ..., 1997, 1998, 1999], shape=(2000,)),
              'y': [0.6931471805599453, 0.6798097995671618, 0.6671513149038113,
                    ..., 0.3097984132651487, 0.30979835465153754,
                    0.3097982962421187]}],
    'layout': {'template': '...',
               'title': {'text': 'Cross Entropy Loss vs. Iterations'},
               'xaxis': {'title': {'text': 'Iterations'}},
               'yaxis': {'title': {'text': 'Cross Entropy Loss'}}}
})

In [7]:
probabilities = model.predict_proba(X)
sorted_idx = np.argsort(X.flatten())
X_sorted = X[sorted_idx]
prob_sorted = probabilities[sorted_idx]

class_0, class_1 = X[y == 0].flatten(), X[y == 1].flatten()

pb = PlotBuilder(title="Logistic Regression Classification", x_label="Feature", y_label="Predicted Probability")

pb.add_plot(class_0, np.zeros_like(class_0), plot_type='scatter', label='Class 0', marker_symbol='circle', color='navy', size=10)
pb.add_plot(class_1, np.ones_like(class_1), plot_type='scatter', label='Class 1', marker_symbol='triangle-up', color='green', size=10)

pb.add_plot(X_sorted.flatten(), prob_sorted.flatten(), plot_type='line', label='Probabilities', color='red')

# Calculate actual decision boundary point (where probability = 0.5)
intercept = model.bias
coef = model.weights[0][0]
boundary_x = -intercept/coef if coef != 0 else 0

pb.add_plot([boundary_x, boundary_x], [-0.2, 1.2], plot_type='line', color='black', label=f'Decision Boundary (X={boundary_x:.2f})')
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'navy', 'opacity': 1, 'size': 10, 'symbol': 'circle'},
              'mode': 'markers',
              'name': 'Class 0',
              'type': 'scatter',
              'uid': '67234b90-ad53-4294-bc75-cf5ddd169b4a',
              'x': array([ 0.40015721, -0.97727788, -0.15135721, -0.20515826, -0.85409574,
                          -2.55298982, -0.74216502, -1.45436567,  0.04575852,  0.15494743,
                          -0.88778575, -1.98079647, -0.34791215, -0.38732682, -0.30230275,
                          -1.04855297, -1.42001794, -1.70627019, -1.25279536, -1.61389785,
                          -0.21274028, -0.89546656, -0.51080514, -1.18063218, -0.02818223,
                           0.3024719 , -0.63432209, -0.36274117, -0.67246045, -0.81314628,
                          -1.7262826 ,  0.17742614, -1.63019835,  0.46278226, -0.90729836,
                          -1.23482582,  0.40234164, -0.68481009, -0.87079715, -0.57884966,
        

In [8]:
cm = confusion_matrix(y.flatten(), y_pred.flatten())

hb = HeatmapPlotBuilder(title="Confusion Matrix", colorscale='Blues', show_scale=True)
hb.add_confusion_matrix(cm, class_labels=['Class 0', 'Class 1'])
hb.show()


FigureWidget({
    'data': [{'colorscale': [[0.0, 'rgb(247,251,255)'], [0.125,
                             'rgb(222,235,247)'], [0.25, 'rgb(198,219,239)'],
                             [0.375, 'rgb(158,202,225)'], [0.5,
                             'rgb(107,174,214)'], [0.625, 'rgb(66,146,198)'],
                             [0.75, 'rgb(33,113,181)'], [0.875, 'rgb(8,81,156)'],
                             [1.0, 'rgb(8,48,107)']],
              'hoverinfo': 'x+y+z',
              'showscale': True,
              'text': array([[38,  8],
                             [ 6, 48]]),
              'texttemplate': '%{z}',
              'type': 'heatmap',
              'uid': 'abb13f36-0928-4b24-a596-a3f17e72e134',
              'x': [Class 0, Class 1],
              'y': [Class 0, Class 1],
              'z': array([[38,  8],
                          [ 6, 48]])}],
    'layout': {'plot_bgcolor': 'white',
               'template': '...',
               'title': {'text': 'Confusion Matrix'},
  

## Two Features

In [9]:
# Generate two features
X1 = np.random.randn(100, 1)
X2 = np.random.randn(100, 1)
X = np.hstack((X1, X2))

# Generate binary labels
y = ((X1.flatten() + X2.flatten() + np.random.randn(100)*0.5) > 0).astype(int)

In [10]:
pb = PlotBuilder(title="Synthetic Data for Logistic Regression", x_label="Feature 1", y_label="Feature 2", z_label="Target")
class_0, class_1 = X[y == 0], X[y == 1]
pb.add_plot(class_0[:, 0], class_0[:, 1], np.zeros_like(class_0[:, 0]), plot_type='scatter', label='Class 0', marker_symbol='circle', color='navy', size=10)
pb.add_plot(class_1[:, 0], class_1[:, 1], np.ones_like(class_1[:, 0]), plot_type='scatter', label='Class 1', marker_symbol='triangle-up', color='green', size=10)
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'navy', 'opacity': 1, 'size': 5.0},
              'mode': 'markers',
              'name': 'Class 0',
              'type': 'scatter3d',
              'uid': '8592a009-ba3b-42db-9f2c-090d10c7a97f',
              'x': array([-0.36918184, -1.61695604, -0.02432612, -0.73803091, -0.09815039,
                           0.78632796, -0.4664191 , -0.94444626, -0.955945  , -0.34598178,
                          -0.46359597,  0.48148147, -1.54079701,  0.06326199, -0.59731607,
                          -1.42406091, -0.49331988,  0.41605005, -1.15618243, -2.06998503,
                          -0.39727181, -0.13288058, -0.29779088, -0.30901297, -1.67600381,
                          -0.81336426, -1.46642433,  0.52106488, -0.57578797, -0.31932842,
                          -0.72559738, -1.38336396, -1.5829384 , -1.18885926, -0.50681635,
                          -1.93627981,  0.52389102,  0.08842209, -0.31088617,  0.09740017,
                         

In [11]:
model = LogisticRegression(learning_rate=0.1, iterations=2500)
logs = model.fit(X, y)

y_pred = model.predict(X)

accuracy = accuracy_score(y.flatten(), y_pred.flatten())

# Print results
print("Predictions:", y_pred.flatten())
print("Accuracy:", accuracy)

Predictions: [0 1 1 1 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1
 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 1 0 1 1 0 0 0 1 0 1 1
 0 1 1 0 0 1 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 1 1 1 1]
Accuracy: 0.92


In [12]:
cost_logs = logs['cost']

pb = PlotBuilder(title="Cross Entropy Loss vs. Iterations", x_label="Iterations", y_label="Cross Entropy Loss")
pb.add_plot(np.arange(len(cost_logs)), cost_logs, plot_type='line', label='MSE')
pb.show()

FigureWidget({
    'data': [{'line': {'color': 'red'},
              'marker': {'color': 'red', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'MSE',
              'type': 'scatter',
              'uid': '0b99c820-fbbe-4644-af90-eb054e2361ee',
              'x': array([   0,    1,    2, ..., 2497, 2498, 2499], shape=(2500,)),
              'y': [0.6931471805599453, 0.678952865285793, 0.6654910812580367,
                    ..., 0.2419306371526576, 0.2419298412835026,
                    0.2419290465807734]}],
    'layout': {'template': '...',
               'title': {'text': 'Cross Entropy Loss vs. Iterations'},
               'xaxis': {'title': {'text': 'Iterations'}},
               'yaxis': {'title': {'text': 'Cross Entropy Loss'}}}
})

In [13]:
# Create a mesh grid for plotting
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# Predict probabilities for the mesh grid
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

In [14]:
pb = PlotBuilder(title="Logistic Regression Classification", x_label="Feature 1", y_label = "Feature 2", z_label="Probability")
class_0, class_1 = X[y == 0], X[y == 1]
pb.add_plot(class_0[:, 0], class_0[:, 1], np.zeros_like(class_0[:, 0]), plot_type='scatter', label='Class 0', marker_symbol='circle', color='navy', size=10)
pb.add_plot(class_1[:, 0], class_1[:, 1], np.ones_like(class_1[:, 0]), plot_type='scatter', label='Class 1', marker_symbol='triangle-up', color='green', size=10)

# Get model coefficients and intercept
w1, w2 = model.weights
b = model.bias

# Create a meshgrid for plotting probability surface
x_min, x_max = X[:, 0].min()-1, X[:, 0].max()+1
y_min, y_max = X[:, 1].min()-1, X[:, 1].max()+1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50), 
                     np.linspace(y_min, y_max, 50))
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Create decision boundary plane (parallel to z-axis)
xx_db = np.linspace(x_min, x_max, 50)
zz_db = np.linspace(-0.2, 1.2, 50)
XX_db, ZZ_db = np.meshgrid(xx_db, zz_db)

if w2 != 0:
    YY_db = (-w1 * XX_db - b) / w2  # Solve for x2 from decision equation
else:
    # Edge case: vertical plane in x-direction
    XX_db = np.full_like(XX_db, -b / w1)
    YY_db = np.tile(np.linspace(y_min, y_max, 50), (50, 1))

pb.add_plot(x=xx, y=yy, z=Z, plot_type='surface', label='Probability Preditions', colorscale=[[0, 'blue'], [1, 'green']], opacity=0.4, show_scale=False)
pb.add_plot(x=XX_db, y=YY_db, z=ZZ_db,plot_type='surface', label='Decision Boundary', colorscale=[[0, 'black'], [1, 'black']], opacity=0.2, show_scale=False)
pb.show()

FigureWidget({
    'data': [{'marker': {'color': 'navy', 'opacity': 1, 'size': 5.0},
              'mode': 'markers',
              'name': 'Class 0',
              'type': 'scatter3d',
              'uid': '29dcaa5b-a305-49a8-ad5e-279a69dfd1af',
              'x': array([-0.36918184, -1.61695604, -0.02432612, -0.73803091, -0.09815039,
                           0.78632796, -0.4664191 , -0.94444626, -0.955945  , -0.34598178,
                          -0.46359597,  0.48148147, -1.54079701,  0.06326199, -0.59731607,
                          -1.42406091, -0.49331988,  0.41605005, -1.15618243, -2.06998503,
                          -0.39727181, -0.13288058, -0.29779088, -0.30901297, -1.67600381,
                          -0.81336426, -1.46642433,  0.52106488, -0.57578797, -0.31932842,
                          -0.72559738, -1.38336396, -1.5829384 , -1.18885926, -0.50681635,
                          -1.93627981,  0.52389102,  0.08842209, -0.31088617,  0.09740017,
                         

In [15]:
cm = confusion_matrix(y.flatten(), y_pred.flatten())

hb = HeatmapPlotBuilder(title="Confusion Matrix", colorscale='Blues', show_scale=True)
hb.add_confusion_matrix(cm, class_labels=['Class 0', 'Class 1'])
hb.show()


FigureWidget({
    'data': [{'colorscale': [[0.0, 'rgb(247,251,255)'], [0.125,
                             'rgb(222,235,247)'], [0.25, 'rgb(198,219,239)'],
                             [0.375, 'rgb(158,202,225)'], [0.5,
                             'rgb(107,174,214)'], [0.625, 'rgb(66,146,198)'],
                             [0.75, 'rgb(33,113,181)'], [0.875, 'rgb(8,81,156)'],
                             [1.0, 'rgb(8,48,107)']],
              'hoverinfo': 'x+y+z',
              'showscale': True,
              'text': array([[49,  3],
                             [ 5, 43]]),
              'texttemplate': '%{z}',
              'type': 'heatmap',
              'uid': '26e5a476-e709-46ca-a624-a698debcc5b4',
              'x': [Class 0, Class 1],
              'y': [Class 0, Class 1],
              'z': array([[49,  3],
                          [ 5, 43]])}],
    'layout': {'plot_bgcolor': 'white',
               'template': '...',
               'title': {'text': 'Confusion Matrix'},
  

## Regularization

In [55]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
# Create classification dataset with 5 informative features
X, y = make_classification(
    n_samples=100,
    n_features=160,
    n_informative=10,
    n_redundant=150,  # We'll add our own redundant features
    n_classes=2,
    random_state=42
)

# Add redundant and noise features (similar to regression example)
# redundant_features = X[:, :5] @ np.random.randn(5, 50) + np.random.normal(0, 0.5, (100, 50))
# X = np.hstack([X, redundant_features])
# X = np.hstack([X, np.random.randn(100, 45)])

# Add label noise (for outliers)
y[::10] = 1 - y[::10]  # Flip labels for every 10th sample

# Scale features
X = StandardScaler().fit_transform(X)

In [56]:
models = {
    "L1 (Lasso)": LogisticRegression(learning_rate=0.1, iterations=2000, lambda_=1, regularization='l1'),
    "L2 (Ridge)": LogisticRegression(learning_rate=0.1, iterations=2000, lambda_=1, regularization='l2'),
    "ElasticNet": LogisticRegression(learning_rate=0.1, iterations=2000, lambda_=1, l1_ratio=0.5, regularization='elastic_net')
}

cost_logs = {}

for name, model in models.items():
    logs = model.fit(X, y)
    cost_logs[name] = logs['cost']
    y_pred = model.predict(X).flatten()
    y_proba = model.predict_proba(X).flatten()
    
    print(f"\n{name}:")
    print(f"Selected features: {(np.abs(model.weights[1:]) > 1e-3).sum()}")  # Exclude intercept
    print(f"Accuracy: {accuracy_score(y, y_pred):.2f}")
    print(f"Precision: {precision_score(y, y_pred):.2f}")
    print(f"Recall: {recall_score(y, y_pred):.2f}")
    print(f"F1: {f1_score(y, y_pred):.2f}")
    print(f"ROC AUC: {roc_auc_score(y, y_proba):.2f}")
    print(f"Informative feature detection: {np.sum(np.abs(model.weights[1:6]) > 1e-3)}/5")
    print(f"Max coefficient: {np.max(model.weights):.2f}")


L1 (Lasso):
Selected features: 45
Accuracy: 0.77
Precision: 0.72
Recall: 0.80
F1: 0.76
ROC AUC: 0.83
Informative feature detection: 1/5
Max coefficient: 0.61

L2 (Ridge):
Selected features: 156
Accuracy: 0.77
Precision: 0.72
Recall: 0.80
F1: 0.76
ROC AUC: 0.83
Informative feature detection: 5/5
Max coefficient: 0.09

ElasticNet:
Selected features: 26
Accuracy: 0.77
Precision: 0.72
Recall: 0.80
F1: 0.76
ROC AUC: 0.83
Informative feature detection: 1/5
Max coefficient: 0.45


In [57]:
pb = PlotBuilder(title="Regularization in Logistic Regression", x_label="Iterations", y_label="Cost")

for name, cost_log in cost_logs.items():
    pb.add_plot(np.arange(len(cost_log)), cost_log, plot_type='line', label=name)
pb.show()

FigureWidget({
    'data': [{'line': {'color': 'red'},
              'marker': {'color': 'red', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'L1 (Lasso)',
              'type': 'scatter',
              'uid': 'b6fc0e4c-b8eb-4797-894b-0531628af74f',
              'x': array([   0,    1,    2, ..., 1997, 1998, 1999], shape=(2000,)),
              'y': [0.7097398283396678, 0.5792296514660233, 0.5598724724117938,
                    ..., 0.5134191921857205, 0.5136138029114292, 0.513495422342557]},
             {'line': {'color': 'blue'},
              'marker': {'color': 'blue', 'opacity': 1, 'size': 5},
              'mode': 'lines',
              'name': 'L2 (Ridge)',
              'type': 'scatter',
              'uid': '59aa1223-5b51-4229-a663-161f957525d2',
              'x': array([   0,    1,    2, ..., 1997, 1998, 1999], shape=(2000,)),
              'y': [0.6932629223863418, 0.5605449484132229, 0.5365341173241907,
                    ..., 0.491874