In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import ( silhouette_score, davies_bouldin_score, calinski_harabasz_score,adjusted_rand_score
)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from collections import defaultdict

In [2]:
# Output locations for everything this notebook writes to disk.
ARTIFACTS_DIR = Path("artifacts")
FIGURES_DIR = ARTIFACTS_DIR / "figures"
LABELS_DIR = ARTIFACTS_DIR / "labels"
# parents=True makes each mkdir self-sufficient instead of relying on the
# parent directory having been created by an earlier statement.
for out_dir in (ARTIFACTS_DIR, FIGURES_DIR, LABELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Short dataset key -> CSV file, resolved relative to the working directory.
datasets = {
    "ds1": "S07-hw-dataset-01.csv",
    "ds2": "S07-hw-dataset-02.csv",
    "ds3": "S07-hw-dataset-03.csv",
    "ds4": "S07-hw-dataset-04.csv"
}

# Per-dataset bundle: the full frame ("df"), the feature columns ("X") and
# the identifier column ("sample_id"), which is kept out of the features.
data_dict = {}
for name, file in datasets.items():
    df = pd.read_csv(file)
    print(f"\n=== {name.upper()} ===")
    print(df.head())
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would emit an extra "None" line.
    df.info()
    print(df.describe())
    print("\nПропуски:")
    print(df.isna().sum())
    sample_id = df["sample_id"]
    X = df.drop(columns=["sample_id"])
    data_dict[name] = {"df": df, "X": X, "sample_id": sample_id}


=== DS1 ===
   sample_id        f01        f02       f03         f04        f05  \
0          0  -0.536647 -69.812900 -0.002657   71.743147 -11.396498   
1          1  15.230731  52.727216 -1.273634 -104.123302  11.589643   
2          2  18.542693  77.317150 -1.321686 -111.946636  10.254346   
3          3 -12.538905 -41.709458  0.146474   16.322124   1.391137   
4          4  -6.903056  61.833444 -0.022466  -42.631335   3.107154   

         f06        f07       f08  
0 -12.291287  -6.836847 -0.504094  
1  34.316967 -49.468873  0.390356  
2  25.892951  44.595250  0.325893  
3   2.014316 -39.930582  0.139297  
4  -5.471054   7.001149  0.131213  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  12000 non-null  int64  
 1   f01        12000 non-null  float64
 2   f02        12000 non-null  float64
 3   f03        12000 non-null  floa

In [3]:
# One (not yet fitted) preprocessing object per dataset, keyed by dataset name.
preprocessors = {}

for name, data in data_dict.items():
    X = data["X"].copy()

    # Columns are routed by dtype: numeric ones vs. everything else.
    num_features = list(X.select_dtypes(include=[np.number]).columns)
    cat_features = list(X.select_dtypes(exclude=[np.number]).columns)

    print(f"\n{name}: числовые={num_features}, категориальные={cat_features}")

    # Numeric branch: median imputation followed by standardisation.
    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    # With no categorical columns the numeric pipeline is applied to the
    # whole frame; otherwise both branches are combined column-wise.
    preprocessor = num_pipe
    if cat_features:
        # Categorical branch: fill gaps with the literal "missing", then
        # one-hot encode into a dense array.
        cat_pipe = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        preprocessor = ColumnTransformer(transformers=[
            ("num", num_pipe, num_features),
            ("cat", cat_pipe, cat_features),
        ])

    preprocessors[name] = preprocessor


ds1: числовые=['f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'f07', 'f08'], категориальные=[]

ds2: числовые=['x1', 'x2', 'z_noise'], категориальные=[]

ds3: числовые=['x1', 'x2', 'f_corr', 'f_noise'], категориальные=[]

ds4: числовые=['n01', 'n02', 'n03', 'n04', 'n05', 'n06', 'n07', 'n08', 'n09', 'n10', 'n11', 'n12', 'n13', 'n14', 'n15', 'n16', 'n17', 'n18', 'n19', 'n20', 'n21', 'n22', 'n23', 'n24', 'n25', 'n26', 'n27', 'n28', 'n29', 'n30'], категориальные=['cat_a', 'cat_b']


In [4]:
# Seed passed to every seeded estimator in this cell (KMeans, PCA, t-SNE).
RANDOM_STATE = 777
# Perplexity used for the t-SNE projection in plot_pca_tsne.
TSNE_PERPLEXITY = 30

# Result containers keyed by dataset name; filled once the best
# configuration per dataset has been chosen.
metrics_summary = dict()
best_configs = dict()
all_labels = dict()

def compute_metrics(X, labels, method_name):
    """Score a clustering with three internal validity indices.

    Points labelled ``-1`` are treated as noise: they are removed before
    scoring and their share is reported separately as ``noise_ratio``.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per entry of ``labels``; must support
        boolean-mask row selection (e.g. a NumPy array).
    labels : array-like
        Cluster label of every row of ``X``. Any sequence is accepted; it is
        converted to a NumPy array internally.
    method_name : str
        Not used by the computation; kept so existing call sites keep working.

    Returns
    -------
    dict
        ``silhouette``, ``davies_bouldin``, ``calinski_harabasz`` and
        ``noise_ratio`` as plain Python floats. When the indices are undefined
        (fewer than two clusters remain, or there are as many clusters as
        points) the sentinels ``-1``, ``inf`` and ``0`` are returned instead.
    """
    # A plain list would make `labels != -1` a scalar comparison, so normalise
    # to an array before building masks.
    labels = np.asarray(labels)
    noise_mask = labels == -1
    noise_ratio = float(noise_mask.mean()) if labels.size else 0.0

    if noise_mask.any():
        keep = ~noise_mask
        X_clean = X[keep]
        labels_clean = labels[keep]
    else:
        X_clean = X
        labels_clean = labels

    n_points = len(labels_clean)
    n_clusters = len(np.unique(labels_clean))

    # scikit-learn only accepts 2 <= n_clusters <= n_points - 1 and raises
    # ValueError outside that range, so both ends are handled here.
    if n_clusters < 2 or n_clusters >= n_points:
        sil = -1.0
        db = float('inf')
        ch = 0.0
    else:
        sil = float(silhouette_score(X_clean, labels_clean))
        db = float(davies_bouldin_score(X_clean, labels_clean))
        ch = float(calinski_harabasz_score(X_clean, labels_clean))

    return {
        "silhouette": sil,
        "davies_bouldin": db,
        "calinski_harabasz": ch,
        "noise_ratio": noise_ratio
    }

def plot_pca_tsne(X, labels, title, filename):
    """Save a two-panel scatter of ``X`` projected to 2-D, coloured by ``labels``.

    The left panel is a PCA projection, the right panel a t-SNE embedding
    (perplexity ``TSNE_PERPLEXITY``); both are seeded with ``RANDOM_STATE``.
    The figure is written to ``FIGURES_DIR / filename`` at 150 dpi and then
    closed. Nothing is returned.
    """
    coords_pca = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(X)
    coords_tsne = TSNE(
        n_components=2, perplexity=TSNE_PERPLEXITY, random_state=RANDOM_STATE
    ).fit_transform(X)

    # (2-D coordinates, panel title) for each subplot, left to right.
    panels = [
        (coords_pca, f"{title} – PCA"),
        (coords_tsne, f"{title} – t-SNE\n(random_state={RANDOM_STATE}, локальная структура)"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (coords, panel_title) in zip(axes, panels):
        sns.scatterplot(
            x=coords[:, 0], y=coords[:, 1],
            hue=labels, palette="tab10", s=10, ax=ax,
        )
        ax.set_title(panel_title)
        # Legend is anchored just outside the right edge of its axes.
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    fig.savefig(FIGURES_DIR / filename, dpi=150)
    plt.close(fig)

# Imported once here rather than inside the loop body, where it was
# re-executed for every dataset.
from sklearn.neighbors import NearestNeighbors

# dataset name -> list of records, one per evaluated configuration:
# {"method", "params", "labels", plus the keys returned by compute_metrics}
results_by_dataset = defaultdict(list)

# Search grids shared by all datasets.
k_range = range(2, 21)
# NOTE(review): the eps grid is fixed and is not derived from the k-distance
# plot drawn below; for data with many features every candidate may be too
# small and yield only noise. Confirm the range per dataset.
eps_candidates = np.linspace(0.1, 2.0, 20)

for name, data in data_dict.items():
    print(f"\nОбработка {name}...")
    X_raw = data["X"]

    # The preprocessor is fitted here; later code reuses it via .transform().
    preprocessor = preprocessors[name]
    X = preprocessor.fit_transform(X_raw)

    # --- KMeans: sweep the number of clusters -----------------------------
    sil_scores = []
    for k in k_range:
        km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
        labels = km.fit_predict(X)
        metrics = compute_metrics(X, labels, "KMeans")
        sil_scores.append(metrics["silhouette"])
        results_by_dataset[name].append({
            "method": "KMeans",
            "params": {"n_clusters": k},
            "labels": labels.copy(),
            **metrics
        })

    plt.figure()
    plt.plot(k_range, sil_scores, marker='o')
    plt.title(f"{name}: Silhouette vs k (KMeans)")
    plt.xlabel("k")
    plt.ylabel("Silhouette")
    plt.savefig(FIGURES_DIR / f"{name}_kmeans_sil_vs_k.png")
    plt.close()

    # --- DBSCAN: k-distance diagnostic, then sweep eps --------------------
    # min_samples is 1% of the dataset size, but never below 2.
    min_samples = max(2, int(0.01 * X.shape[0]))
    nbrs = NearestNeighbors(n_neighbors=min_samples).fit(X)
    distances, _ = nbrs.kneighbors(X)
    # Distance to the farthest of the min_samples neighbours, sorted descending.
    k_distances = np.sort(distances[:, -1])[::-1]

    plt.figure()
    plt.plot(k_distances)
    plt.title(f"{name}: k-distance plot (k={min_samples})")
    plt.xlabel("Point")
    plt.ylabel("k-distance")
    plt.savefig(FIGURES_DIR / f"{name}_dbscan_kdist.png")
    plt.close()

    for eps in eps_candidates:
        dbs = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbs.fit_predict(X)
        metrics = compute_metrics(X, labels, "DBSCAN")
        results_by_dataset[name].append({
            "method": "DBSCAN",
            # Stored as a plain float so the record stays JSON-friendly.
            "params": {"eps": float(eps), "min_samples": min_samples},
            "labels": labels.copy(),
            **metrics
        })

    # --- Agglomerative: sweep linkage and number of clusters --------------
    linkages = ["ward", "average", "complete"]
    # More columns after preprocessing than before means one-hot encoding was
    # applied; ward linkage is skipped for such datasets.
    if X.shape[1] > X_raw.shape[1]:
        linkages = ["average", "complete"]

    for linkage in linkages:
        sil_scores_agg = []
        for k in k_range:
            try:
                agg = AgglomerativeClustering(n_clusters=k, linkage=linkage)
                labels = agg.fit_predict(X)
                metrics = compute_metrics(X, labels, "Agg")
                sil_scores_agg.append(metrics["silhouette"])
                results_by_dataset[name].append({
                    "method": f"Agglomerative_{linkage}",
                    "params": {"n_clusters": k, "linkage": linkage},
                    "labels": labels.copy(),
                    **metrics
                })
            except Exception as e:
                print(f"Ошибка в Agg ({linkage}, k={k}): {e}")
                # Keep the curve the same length as k_range: without this
                # placeholder plt.plot below fails with a shape mismatch as
                # soon as a single k errors out. NaN is drawn as a gap.
                sil_scores_agg.append(np.nan)
                continue

        plt.figure()
        plt.plot(k_range, sil_scores_agg, marker='o', label=linkage)
        plt.title(f"{name}: Silhouette vs k (Agglomerative, {linkage})")
        plt.xlabel("k")
        plt.ylabel("Silhouette")
        plt.legend()
        plt.savefig(FIGURES_DIR / f"{name}_agg_{linkage}_sil_vs_k.png")
        plt.close()


# Silhouette is computed on non-noise points only, so a DBSCAN run that
# labels most of the data as noise can score very well on the few points it
# keeps. Configurations above this noise share are not eligible as "best".
MAX_NOISE_RATIO = 0.5


def _to_json_safe(value):
    """Return ``value`` in a form that strict JSON can represent.

    Dicts are processed recursively, NumPy scalars become native Python
    scalars, and non-finite floats (inf / NaN) become ``None`` because JSON
    has no literal for them.
    """
    if isinstance(value, dict):
        return {key: _to_json_safe(item) for key, item in value.items()}
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


for name, results in results_by_dataset.items():
    eligible = [r for r in results if r.get("noise_ratio", 0.0) <= MAX_NOISE_RATIO]
    # Fall back to every candidate if none passes the noise filter, so a
    # result is still produced for the dataset.
    best = max(eligible or results, key=lambda r: r["silhouette"])
    best_configs[name] = {
        "method": best["method"],
        "params": best["params"],
        "silhouette": best["silhouette"]
    }
    all_labels[name] = best["labels"]
    metrics_summary[name] = {
        "silhouette": best["silhouette"],
        "davies_bouldin": best["davies_bouldin"],
        "calinski_harabasz": best["calinski_harabasz"],
        "noise_ratio": best.get("noise_ratio", 0.0)
    }

    # Re-apply the already fitted preprocessor to get the matrix the labels
    # were computed on.
    X_raw = data_dict[name]["X"]
    X_prep = preprocessors[name].transform(X_raw)
    plot_pca_tsne(
        X_prep,
        best["labels"],
        f"{name} – Best: {best['method']}",
        f"{name}_best_pca_tsne.png"
    )

    df_labels = pd.DataFrame({
        "sample_id": data_dict[name]["sample_id"],
        "cluster_label": best["labels"]
    })
    df_labels.to_csv(LABELS_DIR / f"labels_hw07_{name}.csv", index=False)

# allow_nan=False makes json.dump fail loudly instead of writing the
# non-standard tokens Infinity / NaN, should a value slip past sanitising.
with open(ARTIFACTS_DIR / "metrics_summary.json", "w", encoding="utf-8") as f:
    json.dump(_to_json_safe(metrics_summary), f, indent=2, allow_nan=False)

with open(ARTIFACTS_DIR / "best_configs.json", "w", encoding="utf-8") as f:
    json.dump(_to_json_safe(best_configs), f, indent=2, allow_nan=False)


Обработка ds1...

Обработка ds2...

Обработка ds3...

Обработка ds4...


In [5]:
# Stability check: rerun KMeans (k=3) on ds2 with different seeds and compare
# every pair of labelings with the adjusted Rand index.
print("\n=== Проверка устойчивости на ds2 ===")
name = "ds2"
X_raw = data_dict[name]["X"]
# Uses the preprocessor as already fitted earlier in the notebook.
X = preprocessors[name].transform(X_raw)

# One labeling per seed 0..4.
labels_list = [
    KMeans(n_clusters=3, random_state=rs, n_init=10).fit_predict(X)
    for rs in range(5)
]

# ARI for every unordered pair of runs (i < j).
n_runs = len(labels_list)
ari_scores = [
    adjusted_rand_score(labels_list[i], labels_list[j])
    for i in range(n_runs)
    for j in range(i + 1, n_runs)
]

print(f"Средний ARI между запусками: {np.mean(ari_scores):.3f} ± {np.std(ari_scores):.3f}")


=== Проверка устойчивости на ds2 ===
Средний ARI между запусками: 0.993 ± 0.006
