In [32]:
import altair as alt
import pandas as pd

In [33]:
# Load the iris.data file using pandas
column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
df = pd.read_csv("iris.data", names=column_names)

In [34]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [35]:
# Project the data into 2D space using PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(df[["sepal_length", "sepal_width", "petal_length", "petal_width"]])
df["pca1"] = pca.transform(
    df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
)[:, 0]
df["pca2"] = pca.transform(
    df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
)[:, 1]
# df['pca3'] = pca.transform(df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])[:,2]

In [36]:
# Plot the data in 3D space
alt.Chart(df).mark_circle().encode(
    x="pca1",
    y="pca2",
    # z='pca3',
    color="class",
    tooltip=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"],
    # The title of the plot
).interactive().properties(title="PCA可视化")

In [37]:
# Plot the scatter plot using sepal_length and sepal_width as x and y axis
alt.Chart(df).mark_circle().encode(
    x="sepal_length",
    y="sepal_width",
    color="class",
).interactive().properties(title="Sepal长宽可视化")

In [38]:
# Plot the scatter plot using petal_length and petal_width as x and y axis
alt.Chart(df).mark_circle().encode(
    x="petal_length",
    y="petal_width",
    color="class",
).interactive().properties(title="Petal长宽可视化")

In [39]:
# Combine the three plots into one
alt.vconcat(
    alt.Chart(df)
    .mark_circle()
    .encode(
        x="pca1",
        y="pca2",
        color="class",
    )
    .interactive()
    .properties(title="PCA可视化"),
    alt.Chart(df)
    .mark_circle()
    .encode(
        x="sepal_length",
        y="sepal_width",
        color="class",
    )
    .interactive()
    .properties(title="Sepal长宽可视化"),
    alt.Chart(df)
    .mark_circle()
    .encode(
        x="petal_length",
        y="petal_width",
        color="class",
    )
    .interactive()
    .properties(title="Petal长宽可视化"),
)

In [40]:
# Create a new dataframe, combining the sepal and petal data in one column
# The new dataframe will have three columns: length, width, type
# The type column's value is "sepal_" + class or "petal_" + class
df2 = pd.DataFrame()
df2["length"] = df["sepal_length"].append(df["petal_length"])
df2["width"] = df["sepal_width"].append(df["petal_width"])
df2["type"] = ["sepal-" + i for i in df["class"]] + ["petal-" + i for i in df["class"]]
# Keep the original sepal or petal type in a new column
df2["shape"] = ["sepal"] * len(df) + ["petal"] * len(df)
df2.head()

# df2.describe()

# For each kind of flower, plot its sepal and its petal in different shape, but in same color
# The color of the flower is determined by its class
# For type starts with "petal" the shape is circle, otherwise it is square
# The X is length, Y is width, both for sepal and petal
# For each class and type, plot its data in different shape
# The point should be solid, and its size should be 10
scatter_plot = (
    alt.Chart(df2)
    .mark_point(
        filled=True,
        size=80,
    )
    .encode(
        x="length",
        y="width",
        color="type",
        # Shape should be determined by the type, only options are circle and triangle
        shape=alt.Shape("shape", scale=alt.Scale(range=["circle", "triangle"])),
        tooltip=["length", "width", "type"],
    )
)

scatter_plot.interactive().properties(title="Sepal和Petal长宽可视化")

  df2["length"] = df["sepal_length"].append(df["petal_length"])
  df2["width"] = df["sepal_width"].append(df["petal_width"])


In [41]:
# Create a line chart and each class is a line
# The X is length, Y is width, both for sepal and petal
# The color of the line is determined by its class
# The line should be smooth
line_chart = (
    alt.Chart(df2)
    .mark_line()
    .encode(
        x="length",
        y="width",
        color="type",
        strokeDash="shape",
        tooltip=["length", "width", "type"],
    )
    .properties(title="Sepal和Petal长宽可视化", width=600, height=600)
)

line_chart.interactive()

In [42]:
# Combine the previous two chart into one, and put them side by side
# Also they need to be linked brushing
# If user click on a point in the left chart, the corresponding point in the right chart should be highlighted
# If user click on a point in the right chart, the corresponding point in the left chart should be highlighted
# The color of the flower is determined by its class

brush = alt.selection_single(
    fields=["type"],
    bind="legend",
    clear="click",
    nearest=True,
    on="mouseover",
    empty="none",
)
# alt.hconcat(
#     scatter_plot,
#     line_chart
# ).resolve_scale(
#     color='independent'
# ).add_params(
#     brush
# )

alt.Chart(df2).mark_point(filled=True, size=80,).encode(
    x="length",
    y="width",
    color="type",
    # Shape should be determined by the type, only options are circle and triangle
    shape=alt.Shape("shape", scale=alt.Scale(range=["circle", "triangle"])),
    tooltip=["length", "width", "type"],
).add_selection(brush).interactive().properties(title="Sepal和Petal长宽可视化") | alt.Chart(
    df2
).mark_line().encode(
    x="length",
    y="width",
    color=alt.condition(brush, "type", alt.value("lightgray")),
    strokeDash="shape",
    tooltip=["length", "width", "type"],
).add_selection(
    brush
).interactive().properties(
    title="Sepal和Petal长宽可视化", width=600, height=600
)