In [1]:
import pandas as pd
from sklearn.datasets import make_blobs
from core.api import train_pipeline
from config.registries import MODEL_REGISTRY
from data.loaders import PandasDataLoader

In [2]:
# Synthesize a toy clustering dataset (3000 points, 3 features, 5 blob
# centers, fixed seed) and write it to disk for the training pipeline to read.
X, _, centers = make_blobs(
    n_samples=3000, n_features=3,
    centers=5, cluster_std=1.0,
    random_state=0,
    return_centers=True,
)
# Columns are named x1..x3 so later prediction inputs can use the same keys.
sample_frame = pd.DataFrame(data=X, columns=[f'x{i+1}' for i in range(3)])
sample_frame.to_parquet('sample.parquet')

In [3]:
# Settings shared by every demo run below; each case extends this mapping
# with its own algorithm/params/optimizer entries.
base_config = dict(
    data_source="pandas",
    data_path=["sample.parquet"],
    metric="silhouette",
    output_path="best_model.joblib",
)

def analyze_results(model: object) -> None:
    """Display model training results.

    Prints the fitted estimator's parameters, then its inertia and the
    stored cluster centroids when the model provides them.

    Args:
        model: Trained wrapper as returned by ``demo_pipeline``. It must
            expose ``model`` (an estimator with ``get_params()``) and
            ``model_data`` (subscriptable by string key).

    Returns:
        None. Everything is written to stdout.
    """
    estimator = model.model
    print(f"Optimal parameters: {estimator.get_params()}")

    # ``inertia_`` only exists on centroid-based estimators such as KMeans;
    # density-based ones (e.g. scikit-learn's DBSCAN) do not define it, so
    # skip the line rather than fail with AttributeError.
    if hasattr(estimator, 'inertia_'):
        print(f"Inertia: {estimator.inertia_}")

    # Only report centroids when the wrapper actually stored them.
    try:
        centroids = model.model_data['centroids']
    except KeyError:
        return

    print("Cluster centroids:")
    for i, c in enumerate(centroids):
        print(f"Cluster {i}: {c}")

def demo_pipeline(config: dict, algorithm: str = 'kmeans') -> object:
    """Describe the chosen algorithm, train it, and optionally save the result.

    Args:
        config: Run settings. Required keys: ``'data_path'`` (only its first
            element is used as the feature source) and ``'metric'``.
            Optional keys: ``'params'`` (defaults to ``{}``), ``'optimizer'``
            (defaults to ``'grid'``) and ``'output_path'`` (when present the
            trained model is saved there). Note that an ``'algorithm'`` key
            in ``config`` is not consulted; the ``algorithm`` argument
            decides what is trained.
        algorithm: Key into ``MODEL_REGISTRY`` selecting the model to train.

    Returns:
        The object returned by ``train_pipeline``.
    """
    # Show what is available, then the tunable parameters of the selection.
    print(f"Available algorithms: {list(MODEL_REGISTRY.keys())}")
    print(f"\n{algorithm} parameters:")
    params_help = MODEL_REGISTRY[algorithm]['params_help']
    for param, desc in params_help.items():
        print(f"  {param}: {desc}")

    # Collect the training arguments first so the call itself stays short.
    train_kwargs = {
        'features_src': config['data_path'][0],
        'algorithm': algorithm,
        'param_grid': config.get('params', {}),
        'metric': config['metric'],
        'optimizer': config.get('optimizer', 'grid'),
    }
    model = train_pipeline(**train_kwargs)

    # Persisting is opt-in: skip it when no destination was configured.
    if 'output_path' not in config:
        return model

    model.save(config['output_path'])
    return model

In [4]:
# Case 1: hand-written search space on top of the shared base settings.
manual_config = dict(
    base_config,
    algorithm="kmeans",
    params=dict(
        n_clusters=[3, 5, 7, 10],
        init=["k-means++", "random"],
        max_iter=[100, 200],
    ),
)

# Train with the manual settings and report the outcome. No algorithm argument
# is passed, so demo_pipeline falls back to its 'kmeans' default.
manual_model = demo_pipeline(manual_config)
analyze_results(manual_model)

Available algorithms: ['kmeans', 'dbscan']

kmeans parameters:
  n_clusters: Number of clusters (positive integer)
  init: Initialization method [k-means++, random]
  max_iter: Maximum iterations (positive integer)
Optimal parameters: {'algorithm': 'lloyd', 'copy_x': True, 'init': 'k-means++', 'max_iter': 100, 'n_clusters': 5, 'n_init': 'auto', 'random_state': None, 'tol': 0.0001, 'verbose': 0}
Inertia: 8882.80363494166
Cluster centroids:
Cluster 0: [0.91868199 4.2509417  2.09264055]
Cluster 1: [-1.24767707  7.87567211  9.29361259]
Cluster 2: [ 1.29176795  8.48369582 -8.57501229]
Cluster 3: [ 0.82568651 -1.53359757  2.93251345]
Cluster 4: [-2.36789535  5.84695449  0.542761  ]


In [5]:
# Case 2: same base settings, but with the 'random' optimizer selected
# instead of demo_pipeline's default 'grid'.
auto_config = dict(
    base_config,
    algorithm="kmeans",
    params=dict(
        n_clusters=[3, 4, 5, 6, 7],
        init=["k-means++", "random"],
        max_iter=[100],
    ),
    optimizer="random",
)

# Train with the automated settings and report the outcome. As in Case 1, the
# algorithm comes from demo_pipeline's 'kmeans' default.
auto_model = demo_pipeline(auto_config)
analyze_results(auto_model)

Available algorithms: ['kmeans', 'dbscan']

kmeans parameters:
  n_clusters: Number of clusters (positive integer)
  init: Initialization method [k-means++, random]
  max_iter: Maximum iterations (positive integer)
Optimal parameters: {'algorithm': 'lloyd', 'copy_x': True, 'init': 'k-means++', 'max_iter': 100, 'n_clusters': 4, 'n_init': 'auto', 'random_state': None, 'tol': 0.0001, 'verbose': 0}
Inertia: 13555.712867017419
Cluster centroids:
Cluster 0: [-0.71651772  5.06165894  1.31982786]
Cluster 1: [ 1.29176795  8.48369582 -8.57501229]
Cluster 2: [ 0.84387723 -1.46994827  2.92848356]
Cluster 3: [-1.24767707  7.87567211  9.29361259]


In [6]:
# Helper used by the prediction and persistence demos below.
def predict_clusters(model: object, features: dict) -> None:
    """Predict cluster labels for new samples and print them.

    Args:
        model: Object exposing ``predict``; it is handed a
            ``PandasDataLoader`` wrapping the new samples.
        features: Mapping passed straight to ``pd.DataFrame`` to build the
            samples to score.

    Returns:
        None. The predictions' ``.values`` are written to stdout.
    """
    loader = PandasDataLoader(pd.DataFrame(features))
    predictions = model.predict(loader)
    print(f"Predicted clusters: {predictions.values}")

In [7]:
# Score three new points with the model trained in Case 2.
predict_clusters(
    auto_model,
    dict(
        x1=[0.5, 1.5, 2.5],
        x2=[0.3, 1.2, 2.1],
        x3=[0.7, 3.5, 6.1],
    ),
)

Predicted clusters: [2 2 2]


In [8]:
# Model persistence demo: reload the saved model from disk and predict again.
# "best_model.joblib" is the output_path shared by both configs above, so the
# file holds whichever demo run saved last.
kmeans_cls = MODEL_REGISTRY['kmeans']['class']
loaded_model = kmeans_cls.load("best_model.joblib")
predict_clusters(
    loaded_model,
    dict(
        x1=[1.2, 2.3, 3.4],
        x2=[0.8, 1.9, 2.7],
        x3=[1.5, 2.6, 4.1],
    ),
)

Predicted clusters: [2 2 2]
