In [1]:
import json

import pandas as pd
from sklearn.datasets import make_blobs

from core.api import train_pipeline
from config.registries import MODEL_REGISTRY
from data.loaders import PandasDataLoader

In [2]:
# Build a synthetic clustering dataset (3000 points, 3 features, 5 centers)
# and write it to the parquet file used by the rest of the notebook.
# The second value returned by make_blobs is discarded.
X, _, centers = make_blobs(
    n_samples=3000,
    n_features=3,
    centers=5,
    cluster_std=1.0,
    random_state=0,  # fixed seed keeps the generated points reproducible
    return_centers=True,
)
feature_columns = ['x1', 'x2', 'x3']
pd.DataFrame(X, columns=feature_columns).to_parquet('data.parquet')

In [3]:
# Read the saved file back; as the cell's last expression the frame is displayed.
pd.read_parquet('data.parquet')

Unnamed: 0,x1,x2,x3
0,-2.803323,6.091965,0.560062
1,1.005863,2.250111,2.111326
2,-1.664214,8.968215,9.462399
3,0.215323,4.218356,1.265304
4,-1.287203,8.312072,8.308078
...,...,...,...
2995,1.211350,-0.541304,2.489197
2996,-2.149935,7.231925,0.423440
2997,1.123496,-0.304882,2.893082
2998,0.102689,10.071523,-9.534722


In [4]:
# Example configuration
base_config = {
    "data_source": "pandas",
    "preprocessing": {
        # One normalization method per column.
        # NOTE(review): method names are interpreted by the project's
        # normalizer - confirm how "range" differs from "minmax".
        "normalizer": {
            "methods": {
                "x1": "zscore",
                "x2": "range",
                "x3": "minmax",
            },
            "columns": ["x1", "x2", "x3"],
        },
        "sampler": {
            "data_src": ["data.parquet", None],
        },
    },
    # demo_pipeline reads element 0 as the features source and element 1 as
    # the similarity source.
    "data_path": ["data.parquet", None],
    "algorithm": "kmeans",
    # Candidate values per hyperparameter; forwarded as the param_grid.
    "params": {
        "n_clusters": [3, 5, 7, 10],
        "init": ["k-means++", "random"],
        "max_iter": [100, 200],
    },
    "metric": "silhouette",
    # demo_pipeline saves the trained model here.
    "output_path": "best_kmeans.joblib",
}

# Persist the configuration as JSON; the encoding is explicit so the file does
# not depend on the platform default.
with open('config.json', 'w', encoding='utf-8') as f:
    json.dump(base_config, f, indent=2)

In [13]:
def analyze_results(model: object) -> None:
    """Display model training results.

    Prints the fitted estimator's parameters, followed by its inertia and the
    stored cluster centroids when the model provides them.

    Args:
        model: Trained pipeline model exposing ``model`` (an estimator with
            ``get_params()``) and ``model_data`` (subscriptable by key).

    Returns:
        None. Everything is written to stdout.
    """
    estimator = model.model
    print(f"Optimal parameters: {estimator.get_params()}")

    # Not every estimator exposes an inertia (k-means does, density-based
    # clusterers do not), so skip the line instead of raising AttributeError.
    missing = object()
    inertia = getattr(estimator, 'inertia_', missing)
    if inertia is not missing:
        print(f"Inertia: {inertia}")

    # Likewise, only list centroids when the model stored them.
    try:
        centroids = model.model_data['centroids']
    except KeyError:
        return
    print("Cluster centroids:")
    for i, c in enumerate(centroids):
        print(f"Cluster {i}: {c}")

def demo_pipeline(config: dict, algorithm: str = 'kmeans', spark=None) -> object:
    """Execute full training pipeline.

    Prints the registered algorithms and the parameter help for
    ``algorithm``, runs ``train_pipeline`` with settings taken from
    ``config``, and saves the result when an output path is configured.

    Args:
        config: Pipeline settings. Required keys: ``data_path`` (indexable;
            element 0 is the features source, element 1 the similarity
            source) and ``metric``. Optional keys: ``params``,
            ``preprocessing`` (with optional ``normalizer`` / ``sampler``
            entries), ``optimizer`` (defaults to ``'grid'``) and
            ``output_path``.
        algorithm: Key into ``MODEL_REGISTRY``. ``config['algorithm']`` is
            not consulted by this function.
        spark: Forwarded unchanged to ``train_pipeline``.

    Returns:
        The model object returned by ``train_pipeline``.

    Raises:
        KeyError: If ``config`` lacks ``data_path`` or ``metric``.
    """
    print(f"Available algorithms: {list(MODEL_REGISTRY.keys())}")
    print(f"\n{algorithm} parameters:")
    for param, desc in MODEL_REGISTRY[algorithm]['params_help'].items():
        print(f"  {param}: {desc}")

    # 'preprocessing' is optional: fall back to an empty mapping so the
    # normalizer/sampler lookups yield None instead of failing on None.get().
    preprocessing = config.get('preprocessing') or {}

    model = train_pipeline(
        features_src=config['data_path'][0],
        similarity_src=config['data_path'][1],
        algorithm=algorithm,
        param_grid=config.get('params', {}),
        normalizer=preprocessing.get('normalizer'),
        sampler=preprocessing.get('sampler'),
        metric=config['metric'],
        optimizer=config.get('optimizer', 'grid'),
        spark=spark,
    )
    if 'output_path' in config:
        model.save(config['output_path'])
    return model

In [14]:
# Execute manual pipeline
# Forward the algorithm named in the config so that key is not silently
# ignored; demo_pipeline otherwise uses its own 'kmeans' default.
manual_model = demo_pipeline(
    base_config,
    algorithm=base_config.get('algorithm', 'kmeans'),
)
analyze_results(manual_model)

Available algorithms: ['kmeans', 'dbscan']

kmeans parameters:
  n_clusters: Number of clusters (positive integer)
  init: Initialization method [k-means++, random]
  max_iter: Maximum iterations (positive integer)
Optimal parameters: {'algorithm': 'lloyd', 'copy_x': True, 'init': 'random', 'max_iter': 200, 'n_clusters': 7, 'n_init': 'auto', 'random_state': None, 'tol': 0.0001, 'verbose': 0}
Inertia: 35.91633146914405
Cluster centroids:
Cluster 0: [1.38592653 0.22576181 0.13133726]
Cluster 1: [ 0.21693072 -0.47134928  0.64526643]
Cluster 2: [-1.52012816  0.07338093  0.59617673]
Cluster 3: [0.60663863 0.23614239 0.12777483]
Cluster 4: [ 1.03586133 -0.25185631  0.63406237]
Cluster 5: [0.28532377 0.08074441 0.73458234]
Cluster 6: [-0.58196802  0.09715243  0.70486649]


In [15]:
# Prediction demo
def predict_clusters(model: object, features: dict) -> None:
    """Predict clusters for ``features`` and print them.

    Args:
        model: Object whose ``predict`` accepts a ``PandasDataLoader`` and
            returns a result exposing ``.values``.
        features: Mapping handed to ``pd.DataFrame`` to build the input frame.

    Returns:
        None. The predictions are written to stdout.
    """
    loader = PandasDataLoader(pd.DataFrame(features))
    labels = model.predict(loader)
    print(f"Predicted clusters: {labels.values}")

In [16]:
# Three example points, given column by column
new_points = {
    'x1': [0.5, 1.5, 2.5],
    'x2': [0.3, 1.2, 2.1],
    'x3': [0.7, 3.5, 6.1],
}
predict_clusters(manual_model, new_points)

Predicted clusters: [5 5 4]
