In [1]:
# Step 1: Imports and data loading

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Load a freely available dataset from scikit-learn
data = load_breast_cancer()  # features + target in memory [web:17]
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Peek at a few rows
X.sample(10, random_state=42)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
204,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
70,18.94,21.31,123.6,1130.0,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,...,24.86,26.58,165.9,1866.0,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
131,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,...,19.26,26.0,124.9,1156.0,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
431,12.4,17.68,81.47,467.8,0.1054,0.1316,0.07741,0.02799,0.1811,0.07102,...,12.88,22.91,89.61,515.8,0.145,0.2629,0.2403,0.0737,0.2556,0.09359
540,11.54,14.44,74.65,402.9,0.09984,0.112,0.06737,0.02594,0.1818,0.06782,...,12.26,19.68,78.78,457.8,0.1345,0.2118,0.1797,0.06918,0.2329,0.08134
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
369,22.01,21.9,147.2,1482.0,0.1063,0.1954,0.2448,0.1501,0.1824,0.0614,...,27.66,25.8,195.0,2227.0,0.1294,0.3885,0.4756,0.2432,0.2741,0.08574
29,17.57,15.05,115.0,955.1,0.09847,0.1157,0.09875,0.07953,0.1739,0.06149,...,20.01,19.52,134.9,1227.0,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919
81,13.34,15.86,86.49,520.0,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,...,15.53,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016
477,13.9,16.62,88.97,599.4,0.06828,0.05319,0.02224,0.01339,0.1813,0.05536,...,15.14,21.8,101.2,718.9,0.09384,0.2006,0.1384,0.06222,0.2679,0.07698


In [2]:
# Step 2: Taking samples with replacement (bootstrapping feature means)
means = []

for i in range(100):
    # Take a bootstrap sample of 20 rows, with replacement
    sample = X.sample(20, random_state=i, replace=True)
    # Store the mean of each feature for this sample
    means.append(sample.mean())

sample_means = pd.DataFrame(means)
sample_means.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,13.59915,20.232,88.1985,599.58,0.096449,0.102682,0.081417,0.04234,0.17694,0.063471,...,15.658,27.4255,102.236,802.87,0.131319,0.289024,0.30884,0.109858,0.28049,0.089126
1,14.753,19.0265,95.8705,713.61,0.092524,0.098666,0.090682,0.047154,0.173495,0.061646,...,17.2855,25.6665,112.8755,998.325,0.126843,0.279529,0.312334,0.117965,0.282855,0.088025
2,13.2168,18.482,84.9725,558.88,0.08737,0.072363,0.056214,0.032128,0.16371,0.059608,...,14.95,24.6145,96.7615,725.345,0.120584,0.185066,0.195361,0.087182,0.256875,0.07502
3,12.9329,18.8685,84.341,544.0,0.099942,0.11997,0.095888,0.044979,0.19446,0.067557,...,15.05535,25.6555,99.596,750.755,0.13716,0.294411,0.303798,0.116383,0.303095,0.093455
4,14.813,18.731,96.7265,723.11,0.096291,0.107963,0.104841,0.055365,0.18098,0.06126,...,16.9385,24.266,111.8245,956.615,0.129719,0.249644,0.276465,0.113777,0.287935,0.077621


In [4]:
# Step 3: Fitting Logistic Regression models on bootstrap samples
coefs = []

for i in range(100):
    # Draw a bootstrap sample of 100 rows from the full df
    sample = df.sample(100, random_state=i, replace=True)
    X_sample = sample.drop('target', axis=1)
    y_sample = sample['target']

    # Create and fit Logistic Regression model
    lgr = LogisticRegression(max_iter=1000, random_state=42)
    lgr.fit(X_sample, y_sample)

    # Store the coefficients (for one binary logistic regression: shape (n_features,))
    coefs.append(lgr.coef_[0])

coef_df = pd.DataFrame(coefs, columns=X.columns)
coef_df.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.31457,0.024653,0.338789,0.006673,-0.038119,-0.053025,-0.110192,-0.08205,-0.057529,-0.008304,...,0.246187,-0.454905,-0.406274,-0.015531,-0.088192,-0.053941,-0.263198,-0.142708,-0.071508,-0.011606
1,0.038388,1.195812,0.108421,0.010197,-0.005757,-0.008982,0.011715,-0.011377,-0.024945,-0.003385,...,-0.061676,-1.084299,-0.018289,-0.027249,-0.011294,-0.069701,-0.072938,-0.011683,-0.098602,-0.012324
2,0.03276,0.355532,0.399899,0.014415,-0.005847,0.008263,-0.0166,-0.020725,-0.024818,0.002409,...,-0.089274,-0.348415,-0.32262,-0.040107,-0.030953,-0.037579,-0.106687,-0.077431,-0.072361,-0.010301
3,0.284439,0.265448,0.364647,-0.000405,-0.040825,-0.100623,-0.163947,-0.065264,-0.072415,-0.01719,...,-0.089801,-0.575091,-0.497097,-0.008391,-0.10823,-0.388628,-0.499514,-0.121351,-0.180653,-0.044524
4,-0.031722,-0.10989,-0.32856,0.221823,-0.001903,-0.004843,-0.006459,-0.002775,-0.00736,-0.000437,...,-0.043697,-0.480592,-0.457249,-0.142148,-0.001818,-0.010363,-0.012778,-0.005548,-0.011853,-0.001631


Correlation between models   
Each model is a vector of coefficients over the same set of features, trained on overlapping bootstrap samples. Intuitively, these models will be similar, not independent.

In [5]:
corr_matrix = coef_df.corr()
corr_matrix.head()

# This tells you how coefficients of different features move together across models. 
# If you want a rough idea of similarity between whole models, you could compute 
# pairwise cosine similarity or Euclidean distances between rows of coef_df.

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean radius,1.0,0.186227,0.250742,-0.436822,-0.714451,-0.796648,-0.842985,-0.787444,-0.640824,-0.631169,...,0.453554,0.090505,0.064246,0.311037,-0.705551,-0.607839,-0.6782,-0.74066,-0.632033,-0.478775
mean texture,0.186227,1.0,0.038133,-0.270722,-0.158688,-0.17664,-0.166069,-0.157806,-0.300104,-0.219189,...,0.143889,-0.59755,0.24276,0.209598,-0.203596,-0.244611,-0.259206,-0.18331,-0.313121,-0.206433
mean perimeter,0.250742,0.038133,1.0,-0.316353,-0.026792,0.12427,0.086685,0.069389,0.1845,0.238729,...,0.168028,-0.060599,-0.328376,-0.032721,-0.076034,-0.001935,-0.020152,-0.006126,0.202133,0.126952
mean area,-0.436822,-0.270722,-0.316353,1.0,0.345755,0.289655,0.428022,0.415503,0.287002,0.23275,...,-0.171795,0.077846,-0.001122,-0.861093,0.424115,0.361135,0.508209,0.458947,0.251565,0.281127
mean smoothness,-0.714451,-0.158688,-0.026792,0.345755,1.0,0.795579,0.752694,0.81698,0.538179,0.760977,...,-0.119593,-0.101927,-0.140089,-0.290963,0.94696,0.62006,0.652222,0.786645,0.501816,0.492645


Because all models:
- Use the same algorithm (Logistic Regression),
- See data drawn from the same underlying distribution,
- And bootstrap samples of reasonable size will share many rows,

Their learned decision boundaries and coefficients will tend to be similar, so the models will be correlated, not independent.