In [33]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.decomposition import PCA
import matplotlib.cm as cm

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

In [None]:
from google.colab import files
uploaded = files.upload()

import io
data_raw = pd.read_csv(io.BytesIO(uploaded['ObesityDataSet_raw_and_data_sinthetic_.csv']))

Saving ObesityDataSet_raw_and_data_sinthetic_.csv to ObesityDataSet_raw_and_data_sinthetic_.csv


In [4]:
data_raw['Gender'] = data_raw['Gender'].astype('category')

data_raw['CALC'] = data_raw['CALC'].astype('category')

data_raw['FAVC'] = data_raw['FAVC'].astype('category')

data_raw['SCC'] = data_raw['SCC'].astype('category')

data_raw['SMOKE'] = data_raw['SMOKE'].astype('category')

data_raw['family_history_with_overweight'] = data_raw['family_history_with_overweight'].astype('category')

data_raw['CAEC'] = data_raw['CAEC'].astype('category')

data_raw['MTRANS'] = data_raw['MTRANS'].astype('category')

data_raw['NObeyesdad'] = data_raw['NObeyesdad'].astype('category')

In [5]:
label_encoder = LabelEncoder()
data_raw['NObeyesdad'] = label_encoder.fit_transform(data_raw['NObeyesdad'])

In [6]:
categorical_columns = data_raw.select_dtypes(include=['category']).columns.tolist()

for col in categorical_columns:
    data_raw[col]=label_encoder.fit_transform(data_raw[col])

In [7]:
scaler=StandardScaler()
data_raw["Weight"] = scaler.fit_transform(data_raw["Weight"].values.reshape(-1, 1))

In [54]:
X = data_raw.drop(columns=['NObeyesdad'])
y = data_raw['NObeyesdad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [66]:
# PCA
pca = PCA(n_components=2)
train_pca = pca.fit_transform(X_train)
test_pca = pca.transform(X_test)

# KMeans
print("KMeans Results:")
for i in range(2, 12):
    kmeans = KMeans(n_clusters=i, n_init='auto', random_state=0)
    kmeans.fit(train_pca)
    kmeans_predictions = kmeans.predict(test_pca)

    kmeans_ari = adjusted_rand_score(y_test, kmeans_predictions)

    print(f"KMeans with {i} clusters - ARI: {kmeans_ari:.3f}")


KMeans Results:
KMeans with 2 clusters - ARI: 0.017
KMeans with 3 clusters - ARI: 0.058
KMeans with 4 clusters - ARI: 0.062
KMeans with 5 clusters - ARI: 0.081
KMeans with 6 clusters - ARI: 0.082
KMeans with 7 clusters - ARI: 0.081
KMeans with 8 clusters - ARI: 0.087
KMeans with 9 clusters - ARI: 0.088
KMeans with 10 clusters - ARI: 0.123
KMeans with 11 clusters - ARI: 0.115


In [67]:
# DBSCAN
print("\nDBSCAN Results:")
for eps in [0.5, 1, 1.5, 2, 2.5]:
    for min_samples in range(6, 15):
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_predictions = dbscan.fit_predict(test_pca)

        n_clusters_dbscan = len(set(dbscan_predictions)) - (1 if -1 in dbscan_predictions else 0)

        if n_clusters_dbscan > 1:
            dbscan_ari = adjusted_rand_score(y_test, dbscan_predictions)
            print(f"DBSCAN eps={eps}, min_samples={min_samples} - ARI: {dbscan_ari:.3f}")

        else:
            dbscan_ari = -1
            dbscan_silhouette = -1



DBSCAN Results:
DBSCAN eps=0.5, min_samples=6 - ARI: 0.018
DBSCAN eps=0.5, min_samples=7 - ARI: 0.019
DBSCAN eps=0.5, min_samples=8 - ARI: 0.015
DBSCAN eps=0.5, min_samples=9 - ARI: 0.014
DBSCAN eps=0.5, min_samples=10 - ARI: 0.044
DBSCAN eps=0.5, min_samples=11 - ARI: 0.067
DBSCAN eps=0.5, min_samples=12 - ARI: 0.068
DBSCAN eps=0.5, min_samples=13 - ARI: 0.071
DBSCAN eps=0.5, min_samples=14 - ARI: 0.071
DBSCAN eps=1, min_samples=6 - ARI: 0.001
DBSCAN eps=1, min_samples=7 - ARI: 0.001
DBSCAN eps=1, min_samples=8 - ARI: 0.016
DBSCAN eps=1, min_samples=9 - ARI: 0.016
DBSCAN eps=1, min_samples=10 - ARI: 0.015
DBSCAN eps=1, min_samples=11 - ARI: 0.015
DBSCAN eps=1, min_samples=12 - ARI: 0.015
DBSCAN eps=1, min_samples=13 - ARI: 0.016
DBSCAN eps=1, min_samples=14 - ARI: 0.016
DBSCAN eps=1.5, min_samples=13 - ARI: 0.001
DBSCAN eps=1.5, min_samples=14 - ARI: 0.001


In [68]:
linkage_methods = ['ward', 'complete', 'average', 'single']

for linkage in linkage_methods:
    for n_clusters in range(6, 11):
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
        agg_labels = model.fit_predict(test_pca)

        # Оценка качества кластеризации
        agg_ari = adjusted_rand_score(y_test, agg_labels)
        if n_clusters > 1:
            print(f"Agglomerative Clustering (Linkage: {linkage}, Clusters: {n_clusters}) - "
              f"ARI: {agg_ari:.3f}")
        else:
            agg_silhouette = -1


Agglomerative Clustering (Linkage: ward, Clusters: 6) - ARI: 0.084
Agglomerative Clustering (Linkage: ward, Clusters: 7) - ARI: 0.084
Agglomerative Clustering (Linkage: ward, Clusters: 8) - ARI: 0.114
Agglomerative Clustering (Linkage: ward, Clusters: 9) - ARI: 0.137
Agglomerative Clustering (Linkage: ward, Clusters: 10) - ARI: 0.123
Agglomerative Clustering (Linkage: complete, Clusters: 6) - ARI: 0.058
Agglomerative Clustering (Linkage: complete, Clusters: 7) - ARI: 0.058
Agglomerative Clustering (Linkage: complete, Clusters: 8) - ARI: 0.051
Agglomerative Clustering (Linkage: complete, Clusters: 9) - ARI: 0.050
Agglomerative Clustering (Linkage: complete, Clusters: 10) - ARI: 0.048
Agglomerative Clustering (Linkage: average, Clusters: 6) - ARI: 0.047
Agglomerative Clustering (Linkage: average, Clusters: 7) - ARI: 0.047
Agglomerative Clustering (Linkage: average, Clusters: 8) - ARI: 0.066
Agglomerative Clustering (Linkage: average, Clusters: 9) - ARI: 0.065
Agglomerative Clustering (Li

Низкие значения ARI. Результат кластеризации слабо согласован с истинными метками классов.