In [20]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import altair as alt
from sklearn.cluster import KMeans

brush = alt.selection_interval(resolve='global')

In [21]:
# Load the iris.data file using pandas
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv('iris.data', names=column_names)

In [22]:
print(df.info)
df.describe()

<bound method DataFrame.info of      sepal_length  sepal_width  petal_length  petal_width           class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]>


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [23]:
total_info = alt.Chart(df).mark_bar(size=50).encode(
    x=alt.X('class:N', title='Class'),
    y=alt.Y('count():Q', title='Count'),
    color=alt.condition(brush, 'class', alt.ColorValue('gray')),
).add_params(
    brush
).properties(
    width=400,  # 设置宽度为400像素
    height=300  # 设置高度为300像素
).interactive()

total_info

In [24]:
sepal_info = alt.Chart(df).mark_circle(size=100, opacity=0.5).encode(
    x='sepal_length:Q',
    y='sepal_width:Q',
    # color=alt.Color('class:N', scale=alt.Scale(scheme='category10')),
    color=alt.condition(brush, 'class', alt.ColorValue('gray')),
    tooltip=['petal_length','petal_width','class'],
).add_params(
    brush
)

sepal_info

In [25]:
petal_info = alt.Chart(df).mark_circle(size=100, opacity=0.5).encode(
    x='petal_length:Q',
    y='petal_width:Q',
    color=alt.condition(brush, 'class', alt.ColorValue('gray')),
    tooltip=['petal_length','petal_width','class'],
).add_params(
    brush
)

petal_info

In [26]:
# Remove the 'class' column
X = df.iloc[:, :-1]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)

# Add the cluster labels to the data
df['cluster'] = kmeans.labels_

# Generate the scatter plot using Altair
sepal_cluster = alt.Chart(df).mark_point(size=100, opacity=0.5).encode(
    x='sepal_length:Q',
    y='sepal_width:Q',
    color=alt.condition(brush, 'cluster', alt.ColorValue('gray'), scale=alt.Scale(scheme='category10')),
    tooltip=['petal_length','petal_width','class', 'cluster'],
).add_params(
    brush
).interactive(False).properties(
        title='5 clusters of sepal length and width'
    )

sepal_cluster

In [27]:
petal_cluster = alt.Chart(df).mark_point(size=100, opacity=0.5).encode(
    x='petal_length:Q',
    y='petal_width:Q',
    # color=alt.Color('cluster:N', scale=alt.Scale(scheme='category10'))，
    color=alt.condition(brush, 'cluster', alt.ColorValue('gray'), scale=alt.Scale(scheme='category10')),
    tooltip=['petal_length','petal_width','class', 'cluster'],
).add_params(
    brush
).interactive(False).properties(
        title='5 clusters of pedal length and width'
    )

petal_cluster

In [28]:
# total_info | sepal_info | petal_info | sepal_cluster | petal_cluster
alt.vconcat(total_info , sepal_info | petal_info, sepal_cluster | petal_cluster).save('B-system.html')