In [1]:
import numpy as np
import time
from sklearn.datasets import load_iris # Import the Iris dataset loader
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
# --- Configuration ---
N_BATCH_SAMPLES = 15     # Number of new samples arriving (simulated batch size per step)
K_FOLDS = 5              # Number of folds for K-Fold Cross-Validation
EVAL_INTERVAL = 3        # Perform K-Fold evaluation every X simulated time steps
MIN_SAMPLES_FOR_KFOLD = K_FOLDS # Need at least K samples for KFold

In [3]:
# --- Load Dataset ---
print("Loading Iris dataset...")
iris = load_iris()
X_full, y_full = iris.data, iris.target
N_TOTAL_SAMPLES = len(X_full)
N_FEATURES = X_full.shape[1]
N_CLASSES = len(np.unique(y_full))

print(f"Total samples in Iris dataset: {N_TOTAL_SAMPLES}")
print(f"Number of features: {N_FEATURES}")
print(f"Number of classes: {N_CLASSES}")

Loading Iris dataset...
Total samples in Iris dataset: 150
Number of features: 4
Number of classes: 3


In [5]:
# Optional: Shuffle the full dataset initially to simulate random arrival order
# In a real system, data might arrive chronologically or based on some other criteria.

shuffle_idx = np.random.permutation(N_TOTAL_SAMPLES)
X_full = X_full[shuffle_idx]
y_full = y_full[shuffle_idx]

In [6]:
# --- Initialize Current "Arrived" Dataset ---
# Start with empty numpy arrays that match the structure of the full dataset
X_current = np.empty((0, N_FEATURES))
y_current = np.empty((0,), dtype=y_full.dtype) # Ensure correct dtype for labels

In [7]:
X_current

array([], shape=(0, 4), dtype=float64)

In [8]:
y_current

array([], dtype=int64)

In [9]:

# --- Simulation Loop: Simulating Data Arrival and Periodic Evaluation ---
print("\nStarting real-time data simulation (slicing Iris) and periodic K-Fold evaluation...")

current_idx = 0 # Keep track of how many samples we've processed from the full dataset
step = 0 # Simulated time step counter

while current_idx < N_TOTAL_SAMPLES:
    step += 1
    print(f"\n--- Time Step {step} ---")

    # 1. Simulate new data arriving (take the next batch from the shuffled dataset)
    start_slice = current_idx
    end_slice = min(current_idx + N_BATCH_SAMPLES, N_TOTAL_SAMPLES) # Handle the last potentially smaller batch

    X_new = X_full[start_slice:end_slice]
    y_new = y_full[start_slice:end_slice]

    # Break if no new samples were added (can happen if N_BATCH_SAMPLES is large and current_idx is already N_TOTAL_SAMPLES)
    if len(X_new) == 0:
        print("No new samples to add. Ending simulation loop.")
        break

    print(f"Simulating arrival of {len(X_new)} new samples (Indices {start_slice} to {end_slice-1} from shuffled dataset).")

    # 2. Add new data to the current "arrived" dataset
    X_current = np.vstack((X_current, X_new))
    y_current = np.concatenate((y_current, y_new))
    current_idx = end_slice # Update the index for the next step

    print(f"Current total dataset size: {len(X_current)} samples")

    # 3. Perform K-Fold Cross-Validation periodically
    # Evaluate every EVAL_INTERVAL steps OR when the last batch has arrived (current_idx == N_TOTAL_SAMPLES)
    if (step % EVAL_INTERVAL == 0) or (current_idx == N_TOTAL_SAMPLES):
        print(f"Evaluation interval reached (or all data arrived).")

        if len(X_current) < MIN_SAMPLES_FOR_KFOLD:
            print(f"Current dataset ({len(X_current)} samples) too small for K-Fold with {K_FOLDS} splits. Skipping evaluation.")
            continue

        print(f"Performing {K_FOLDS}-Fold Cross-Validation on the current snapshot of {len(X_current)} samples...")

        kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42) # Fixed random_state for reproducible splits *within* this evaluation run
        fold_accuracies = []

        # Create a new model instance for each evaluation run on the current snapshot
        # This simulates retraining a model from scratch on the updated dataset
        model = LogisticRegression(solver='liblinear', max_iter=1000) # Use liblinear for speed/small datasets

        for fold, (train_index, val_index) in enumerate(kf.split(X_current, y_current)):
            # Split data for the current fold
            X_fold_train, X_fold_val = X_current[train_index], X_current[val_index]
            y_fold_train, y_fold_val = y_current[train_index], y_current[val_index]

            # Train the model on the training fold
            model.fit(X_fold_train, y_fold_train)

            # Evaluate the model on the validation fold
            y_pred = model.predict(X_fold_val)
            accuracy = accuracy_score(y_fold_val, y_pred)
            fold_accuracies.append(accuracy)

            # Optional: print per fold accuracy
            # print(f"  Fold {fold + 1} Accuracy: {accuracy:.4f}")

        # 4. Report average performance for this evaluation run
        avg_accuracy = np.mean(fold_accuracies)
        print(f"--- Average {K_FOLDS}-Fold CV Accuracy (Snapshot Size: {len(X_current)} samples): {avg_accuracy:.4f} ---")

    # Optional: Add a delay to simulate time passing
    # time.sleep(0.1) # Small delay

print("\nSimulation finished. All data processed.")


Starting real-time data simulation (slicing Iris) and periodic K-Fold evaluation...

--- Time Step 1 ---
Simulating arrival of 15 new samples (Indices 0 to 14 from shuffled dataset).
Current total dataset size: 15 samples

--- Time Step 2 ---
Simulating arrival of 15 new samples (Indices 15 to 29 from shuffled dataset).
Current total dataset size: 30 samples

--- Time Step 3 ---
Simulating arrival of 15 new samples (Indices 30 to 44 from shuffled dataset).
Current total dataset size: 45 samples
Evaluation interval reached (or all data arrived).
Performing 5-Fold Cross-Validation on the current snapshot of 45 samples...
--- Average 5-Fold CV Accuracy (Snapshot Size: 45 samples): 0.8667 ---

--- Time Step 4 ---
Simulating arrival of 15 new samples (Indices 45 to 59 from shuffled dataset).
Current total dataset size: 60 samples

--- Time Step 5 ---
Simulating arrival of 15 new samples (Indices 60 to 74 from shuffled dataset).
Current total dataset size: 75 samples

--- Time Step 6 ---
Si