In [None]:
%time
import pandas as pd
from sklearn.datasets import make_blobs
from core.api import optimizing_pipeline
from config.registries import MODEL_REGISTRY
from data.loaders import PandasDataLoader

# Attribute

In [None]:
# Synthetic attribute dataset: 1500 samples with 3 features around 3 centers.
# The second return value (the sample labels) is discarded.
X, _, centers = make_blobs(
    n_samples=1500,
    n_features=3,
    centers=3,
    cluster_std=1.3,
    random_state=0,
    return_centers=True
)
# Derive the column names from the generated array so they cannot drift out of
# sync with `n_features`; this yields x1..x3 for the settings above.
feature_columns = [f'x{i+1}' for i in range(X.shape[1])]
pd.DataFrame(X, columns=feature_columns).to_parquet('data.parquet')

In [None]:
# Reload the feature table written above; as the last expression it is shown as the cell output.
pd.read_parquet('data.parquet')

In [None]:
import json

# Configuration for the KMeans run. demo_pipeline reads "features",
# "similarity", "algorithm", "params", "preprocessing", "metric", "optimizer",
# "plots_path", "stat_path" and "output_path"; "data_source" is not read by
# demo_pipeline in this notebook.
base_config = {
    "data_source": "pandas",
    "optimizer": "tpe",
    "plots_path": "results/datavis/kmeans",
    "stat_path": "results/stat/kmeans",
    "preprocessing": {
        "normalizer": {
            "methods": {"x1": "zscore", "x2": "range", "x3": "minmax"},
            "columns": ["x1", "x2", "x3"],
        },
        "sampler": {"features": "data.parquet", "similarity": None},
    },
    "features": "data.parquet",
    "similarity": None,
    "algorithm": "kmeans",
    # Candidate values explored by the optimizer.
    "params": {
        "n_clusters": [3, 5, 7, 10],
        "init": ["k-means++", "random"],
        "max_iter": [100, 200],
    },
    "metric": "attribute",
    "output_path": "best_kmeans.joblib",
}

# Persist the configuration as indented JSON in the working directory.
with open('config.json', 'w') as config_file:
    json.dump(base_config, config_file, indent=2)

## Case Kmeans (pipeline)

In [None]:
def analyze_results(model: object) -> None:
    """Print a summary of a fitted model.

    Reports the wrapped estimator's parameters (``model.model.get_params()``),
    its ``inertia_`` attribute and every entry of
    ``model.model_data['centroids']``, one line per centroid.
    """
    estimator = model.model
    print(f"Optimal parameters: {estimator.get_params()}")
    print(f"Inertia: {estimator.inertia_}")
    print("Cluster centroids:")
    centroids = model.model_data['centroids']
    for cluster_id, centroid in enumerate(centroids):
        print(f"Cluster {cluster_id}: {centroid}")

def demo_pipeline(config: dict, spark=None) -> object:
    """Execute full training pipeline.

    Prints the registered algorithms and the parameter help of the selected
    one, runs ``optimizing_pipeline`` with the settings taken from ``config``
    and, when ``config`` contains ``output_path``, saves the returned model
    there.

    Args:
        config: Pipeline configuration. ``features``, ``similarity`` and
            ``metric`` are required keys; ``algorithm`` defaults to
            ``'kmeans'``, ``params`` to ``{}`` and ``optimizer`` to
            ``'grid'``. ``preprocessing`` may be missing, ``None`` or empty,
            in which case no normalizer and no sampler are passed on.
        spark: Forwarded unchanged to ``optimizing_pipeline``.

    Returns:
        The model object returned by ``optimizing_pipeline``.

    Raises:
        KeyError: If ``features``, ``similarity`` or ``metric`` is missing
            from ``config``.
    """
    algorithm = config.get('algorithm', 'kmeans')
    # Fall back to an empty mapping so a config without a "preprocessing"
    # section (or with it set to None) does not fail with AttributeError.
    preprocessing = config.get('preprocessing') or {}

    print(f"Available algorithms: {list(MODEL_REGISTRY.keys())}")
    print(f"\n{algorithm} parameters:")
    for param, desc in MODEL_REGISTRY[algorithm]['params_help'].items():
        print(f"  {param}: {desc}")

    model = optimizing_pipeline(
        features_src=config['features'],
        similarity_src=config['similarity'],
        algorithm=algorithm,
        param_grid=config.get('params', {}),
        normalizer=preprocessing.get('normalizer'),
        sampler=preprocessing.get('sampler'),
        metric=config['metric'],
        optimizer=config.get('optimizer', 'grid'),
        plots_path=config.get('plots_path'),
        stat_path=config.get('stat_path'),
        spark=spark
    )
    if 'output_path' in config:
        model.save(config['output_path'])
    return model

In [None]:
# Run the pipeline with the current base_config, then print the returned model's summary.
manual_model = demo_pipeline(base_config)
analyze_results(manual_model)

In [None]:
def predict_clusters(model: object, features: dict) -> None:
    """Predict clusters for new observations and print the result.

    ``features`` is passed to ``pd.DataFrame``; the resulting frame is wrapped
    in a ``PandasDataLoader`` and handed to ``model.predict``. Nothing is
    returned: the predictions are only printed.
    """
    loader = PandasDataLoader(pd.DataFrame(features))
    print(f"Predicted clusters: {model.predict(loader)}")

In [None]:
# Predict clusters for three new points; each key is a feature column and
# position i across the three lists forms the i-th point.
predict_clusters(manual_model, {
    'x1': [0.5, 1.5, 2.5],
    'x2': [0.3, 1.2, 2.1],
    'x3': [0.7, 3.5, 6.1],
})

In [None]:
# NOTE(review): 'data.sample.parquet' is not written by any cell of this notebook —
# presumably the pipeline's sampler creates it from 'data.parquet'; confirm,
# otherwise this cell fails on a fresh run.
pd.read_parquet('data.sample.parquet')

## Case Kmeans (extended)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from models.attribute import SklearnKMeans
from preprocessing.normalizers import PandasNormalizer
from data.loaders import PandasDataLoader
from metrics.quality import AttributeMetric
import itertools

In [None]:
# 1. Loading data
# Loader over the raw parquet file with neither a normalizer nor a sampler attached.
loader = PandasDataLoader(features="data.parquet", normalizer=None, sampler=None)
# full_data() is unpacked as a pair; only its first element is used from here on.
features, _ = loader.full_data()

In [None]:
# 2. Preprocessing
# One normalization method per column, the same mapping as in the pipeline config.
normalizer = PandasNormalizer(methods={"x1": "zscore", "x2": "range", "x3": "minmax"})
normalizer.fit(features)
# Transform a copy so the `features` object itself is not handed to transform().
normalized_features = normalizer.transform(features.copy())

In [None]:
# 3. Preparing parameters for search
# Candidate values per hyper-parameter; the key order determines the order of
# the values in each combination generated by the grid search.
param_grid = dict(
    n_clusters=[3, 5, 7, 10],
    init=['k-means++', 'random'],
    max_iter=[100, 200],
)

In [None]:
# 4. Manual Grid Search
# Start from -inf rather than -1 so the first successfully scored model always
# becomes the incumbent, whatever value range AttributeMetric produces.
# NOTE(review): the selection assumes that a higher score is better — confirm
# this for AttributeMetric.
best_score = float('-inf')
best_model = None
results = []

# Every combination of the grid values, ordered like param_grid's keys.
all_params = list(itertools.product(*param_grid.values()))

for params_values in all_params:
    params = dict(zip(param_grid.keys(), params_values))

    try:
        model = SklearnKMeans(params)

        # A fresh loader is built for each step, as in the pipeline-free original flow.
        model.fit(PandasDataLoader(features=normalized_features))

        labels = model.predict(PandasDataLoader(features=normalized_features))

        metric = AttributeMetric()
        score = metric.calculate(
            data_loader=PandasDataLoader(features=normalized_features),
            labels=labels,
            model_data=model.model_data
        )

        results.append({
            'params': params,
            'score': score
        })

        if score > best_score:
            best_score = score
            best_model = model

    except Exception as e:
        # Best-effort search: report the failing combination and keep going.
        print(f"Skipped {params}: {str(e)}")

In [None]:
# 5. Analyzing results
# One row per evaluated combination, with the columns 'params' and 'score'.
df_results = pd.DataFrame(results)
print("Top 5 configurations:")
ranked_results = df_results.sort_values(by='score', ascending=False)
print(ranked_results.head(5))

In [None]:
# 6. Visualization
# Expand the per-run parameter dicts into columns so the runs can be grouped.
plot_df = pd.DataFrame(df_results['params'].tolist(), index=df_results.index)
plot_df['score'] = df_results['score']

plt.figure(figsize=(10, 6))
# One line per (init, max_iter) combination. Grouping by `init` alone puts two
# runs (one per max_iter) on every n_clusters value, which makes the line
# zig-zag between them instead of showing a trend.
for (init_method, max_iter), runs in plot_df.groupby(['init', 'max_iter'], sort=False):
    runs = runs.sort_values('n_clusters')
    plt.plot(
        runs['n_clusters'],
        runs['score'],
        marker='o',
        label=f"{init_method} (max_iter={max_iter})"
    )

plt.xlabel('Number of Clusters')
# NOTE(review): the axis label says silhouette, but the values come from
# AttributeMetric — confirm that this is what the metric computes.
plt.ylabel('Silhouette Score')
plt.title('KMeans Performance Comparison')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# 7. Saving the best model
# Compare against None explicitly: the truthiness of a model object depends on
# whatever __bool__/__len__ its class defines, so `if best_model:` could skip
# saving a valid model.
if best_model is not None:
    best_model.save("best_kmeans_custom.joblib")
    print(f"Best model saved with score: {best_score}")

# Graph

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
np.random.seed(42)

# Three 2-D normal clouds of 500 points each (scale 1.0), stacked into one
# (1500, 2) array in the order of `cluster_means`.
cluster_means = [(0.5, -0.5), (3, 3), (-3, 3)]
X = np.concatenate([
    np.random.normal(loc=mean, scale=1.0, size=(500, 2))
    for mean in cluster_means
])

# k-nearest-neighbour adjacency: row i gets a 1 in the columns of the k points
# closest to point i. Only adj_matrix[i, neighbours] is set, so the matrix is
# not symmetrized here.
dist_matrix = pairwise_distances(X)
adj_matrix = np.zeros_like(dist_matrix)
k = 5
for row_idx, row in enumerate(dist_matrix):
    # Position 0 of the sorted order is skipped — presumably the point itself at distance 0.
    adj_matrix[row_idx, np.argsort(row)[1:k + 1]] = 1


# Scatter of the raw points (the graph edges themselves are not drawn).
plt.figure(figsize=(12, 5))
plt.scatter(*X.T)
plt.title('Graph Clustering')
plt.axis('equal')
plt.tight_layout()
plt.show()

In [None]:
# Name the columns x1..xN from the matrix width instead of a hard-coded 1500,
# so this cell keeps working if the number of generated points changes.
graph_columns = [f'x{i+1}' for i in range(adj_matrix.shape[1])]
pd.DataFrame(adj_matrix, columns=graph_columns).to_parquet('data_graph.parquet')

In [None]:
# Reload the saved adjacency matrix and show its shape as the cell output.
pd.read_parquet('data_graph.parquet').values.shape

## Louvain

In [None]:
import json

# Configuration for the Louvain run: no feature table, only the similarity
# (adjacency) matrix saved in data_graph.parquet.
base_config = {
    "data_source": "pandas",
    "plots_path": "results/datavis/louvain",
    "stat_path": "results/stat/louvain",
    "optimizer": "tpe",
    "preprocessing": {
        "sampler": {"features": None, "similarity": "data_graph.parquet"},
    },
    "features": None,
    "similarity": "data_graph.parquet",
    "algorithm": "louvain",
    # Candidate values explored by the optimizer.
    "params": {
        "resolution": [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0],
        "threshold": [1e-07, 1e-06, 1e-05, 1e-04],
        "max_level": [5, 10, 15, 30, 45, 55],
    },
    "metric": "graph",
    "output_path": "best_louvain.joblib",
}

# Overwrites config.json in the working directory with this configuration.
with open('config.json', 'w') as config_file:
    json.dump(base_config, config_file, indent=2)

In [None]:
# Run the pipeline with the current base_config (the Louvain one when cells run in order); rebinds manual_model.
manual_model = demo_pipeline(base_config)

In [None]:
# Show the model returned by the pipeline as the cell output.
manual_model

## Spectral

In [None]:
import json

# Configuration for the spectral run: no feature table, only the similarity
# (adjacency) matrix saved in data_graph.parquet.
base_config = {
    "data_source": "pandas",
    "plots_path": "results/datavis/spectral",
    "stat_path": "results/stat/spectral",
    "optimizer": "tpe",
    "preprocessing": {
        "sampler": {"features": None, "similarity": "data_graph.parquet"},
    },
    "features": None,
    "similarity": "data_graph.parquet",
    "algorithm": "spectral",
    # Candidate values explored by the optimizer.
    "params": {
        "n_clusters": [2, 3, 4, 5, 6, 7],
        "n_neighbors": [3, 10, 15],
        "assign_labels": ['kmeans', 'discretize'],
        "degree": [1],
    },
    "metric": "graph",
    "output_path": "best_spectral.joblib",
}

# Overwrites config.json in the working directory with this configuration.
with open('config.json', 'w') as config_file:
    json.dump(base_config, config_file, indent=2)

In [None]:
# Run the pipeline with the current base_config (the spectral one when cells run in order); rebinds manual_model.
manual_model = demo_pipeline(base_config)

In [None]:
# Show the model returned by the pipeline as the cell output.
manual_model

# Attribute-Graph

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

## Cora

In [None]:
# Open the Cora archive; later cells read the CSR components of the adjacency
# and attribute matrices and the 'labels' array from it.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if 'cora.npz' comes from an untrusted source — confirm
# the file's origin, or drop the flag if none of the arrays needs pickling.
data = np.load('cora.npz', allow_pickle=True)

In [None]:
# Rebuild both sparse matrices from their stored (data, indices, indptr)
# components and the saved shapes.
adj = csr_matrix(
    (data['adj_data'], data['adj_indices'], data['adj_indptr']),
    shape=data['adj_shape'],
)
attr = csr_matrix(
    (data['attr_data'], data['attr_indices'], data['attr_indptr']),
    shape=data['attr_shape'],
)

In [None]:
# Labels stored in the archive; used further down to score the clustering with NMI/ARI/AMI.
labels = data['labels']

In [None]:
# Name the columns x1..xN from the adjacency matrix width instead of a
# hard-coded 2708, so the cell does not break on an archive of another size.
pd.DataFrame(adj.todense(), columns=[f'x{i+1}' for i in range(adj.shape[1])]).to_parquet('cora_graph.parquet')

In [None]:
# Name the columns x1..xN from the attribute matrix width instead of a
# hard-coded 1433, so the cell does not break on an archive of another size.
pd.DataFrame(attr.todense(), columns=[f'x{i+1}' for i in range(attr.shape[1])]).to_parquet('cora.parquet')

In [None]:
import json

# Configuration for DMoN on Cora: both the feature table and the graph are
# supplied. No "plots_path"/"stat_path" keys are set, so demo_pipeline passes
# None for both.
base_config = {
    "data_source": "pandas",
    "optimizer": "grid",
    "preprocessing": {},
    "features": "cora.parquet",
    "similarity": "cora_graph.parquet",
    "algorithm": "dmon",
    # Every list holds a single value, so the grid contains one combination.
    "params": {
        "num_clusters": [7],
        "hidden_dim": [256],
        "lambda_": [
            {'modularity': 1.0, 'collapse': 1.0, 'distance': 0.0, 'variance': 0.0, 'entropy': 0.0},
        ],
        "epochs": [500],
        "lr": [1e-4],
        "dropout": [0.5],
    },
    "metric": "attribute-graph",
    "output_path": "best_dmon_cora.pt",
}

# Overwrites config.json in the working directory with this configuration.
with open('config.json', 'w') as config_file:
    json.dump(base_config, config_file, indent=2)

In [None]:
def demo_pipeline(config: dict, spark=None) -> object:
    """Execute full training pipeline.

    Note: this re-declares the ``demo_pipeline`` already defined earlier in
    the notebook; once this cell runs, this definition is the one in effect.

    Prints the registered algorithms and the parameter help of the selected
    one, runs ``optimizing_pipeline`` with the settings taken from ``config``
    and, when ``config`` contains ``output_path``, saves the returned model
    there.

    Args:
        config: Pipeline configuration. ``features``, ``similarity`` and
            ``metric`` are required keys; ``algorithm`` defaults to
            ``'kmeans'``, ``params`` to ``{}`` and ``optimizer`` to
            ``'grid'``. ``preprocessing`` may be missing, ``None`` or empty,
            in which case no normalizer and no sampler are passed on.
        spark: Forwarded unchanged to ``optimizing_pipeline``.

    Returns:
        The model object returned by ``optimizing_pipeline``.

    Raises:
        KeyError: If ``features``, ``similarity`` or ``metric`` is missing
            from ``config``.
    """
    algorithm = config.get('algorithm', 'kmeans')
    # Fall back to an empty mapping so a config without a "preprocessing"
    # section (or with it set to None) does not fail with AttributeError.
    preprocessing = config.get('preprocessing') or {}

    print(f"Available algorithms: {list(MODEL_REGISTRY.keys())}")
    print(f"\n{algorithm} parameters:")
    for param, desc in MODEL_REGISTRY[algorithm]['params_help'].items():
        print(f"  {param}: {desc}")

    model = optimizing_pipeline(
        features_src=config['features'],
        similarity_src=config['similarity'],
        algorithm=algorithm,
        param_grid=config.get('params', {}),
        normalizer=preprocessing.get('normalizer'),
        sampler=preprocessing.get('sampler'),
        metric=config['metric'],
        optimizer=config.get('optimizer', 'grid'),
        plots_path=config.get('plots_path'),
        stat_path=config.get('stat_path'),
        spark=spark
    )
    if 'output_path' in config:
        model.save(config['output_path'])
    return model

In [None]:
# Run the pipeline with the current base_config (DMoN on Cora when cells run in order); rebinds manual_model.
manual_model = demo_pipeline(base_config)

In [None]:
# Show the `params` attribute of the returned model as the cell output.
manual_model.params

In [None]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, adjusted_mutual_info_score

In [None]:
# Score the model's cluster assignment against the labels loaded from cora.npz.
predicted_labels = manual_model.labels_
nmi = normalized_mutual_info_score(labels, predicted_labels)
ari = adjusted_rand_score(labels, predicted_labels)
amis = adjusted_mutual_info_score(labels, predicted_labels)
print(f"Params model: {manual_model.params}| NMI: {nmi:.4f} | ARI: {ari:.4f} | AMIS: {amis:.4f}")

## Synthetic

In [None]:
import json

# Configuration for DMoN on the synthetic data.
# NOTE(review): 'data.parquet' holds the make_blobs points (3 features), while
# 'data_graph.parquet' is the k-NN graph of a different, independently drawn
# 2-D point set. Both have 1500 rows, so the run does not fail, but the graph
# does not describe these features — confirm that this pairing is intended.
base_config = {
    "data_source": "pandas",
    "plots_path": "results/datavis/DMON",
    "stat_path": "results/stat/DMON",
    "optimizer": "grid",
    "preprocessing": {},
    "features": "data.parquet",
    "similarity": "data_graph.parquet",
    "algorithm": "dmon",
    # Only num_clusters has several candidates; the other lists hold one value each.
    "params": {
        "num_clusters": [3, 4, 5],
        "hidden_dim": [256],
        "lambda_": [
            {'modularity': 1.0, 'collapse': 1.0, 'distance': 0.0, 'variance': 0.0, 'entropy': 0.0},
        ],
        "epochs": [500],
        "lr": [1e-4],
        "dropout": [0.5],
    },
    "metric": "attribute-graph",
    "output_path": "best_dmon.pt",
}

# Overwrites config.json in the working directory with this configuration.
with open('config.json', 'w') as config_file:
    json.dump(base_config, config_file, indent=2)

In [None]:
# Run the pipeline with the current base_config (DMoN on the synthetic data when cells run in order); rebinds manual_model.
manual_model = demo_pipeline(base_config)