In [1]:
import pandas as pd

In [2]:
file_path = "shopping_data.csv"
df_shopping = pd.read_csv(file_path, encoding="ISO-8859-1")
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
# Columns
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [5]:
# Count the missing values of every column once, then report them one by one
null_counts = df_shopping.isnull().sum()
for col_name, n_missing in null_counts.items():
    print(f"Column {col_name} has {n_missing} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [None]:
# Drop null rows
df_shopping = df_shopping.dropna()

In [None]:
print(f"Duplicate entries: {df_shopping.duplicated().sum()}")

In [None]:
# Remove the identifier column; reassigning the result avoids mutating the
# frame in place.
df_shopping = df_shopping.drop(columns=["CustomerID"])
df_shopping.head()

In [None]:
def change_string(member):
    """Encode a card-membership label as an integer flag.

    Returns 1 when ``member`` equals the string "Yes"; every other value
    yields 0.
    """
    return 1 if member == "Yes" else 0
df_shopping["Card Member"] = df_shopping["Card Member"].apply(change_string)
df_shopping.head()

In [None]:
# Scale the income column down by a factor of 100
df_shopping["Annual Income"] = df_shopping["Annual Income"] / 100
df_shopping.head()

In [None]:
# Switch the column labels to short snake_case names
df_shopping = df_shopping.rename(
    columns={
        "Age": "age",
        "Card Member": "card_member",
        "Annual Income": "annual_income",
        "Spending Score (1-100)": "spending_score",
    }
)
df_shopping.head()

In [None]:
# Saving cleaned data (index=False leaves the row index out of the CSV)
file_path = "shopping_data_cleaned.csv"
df_shopping.to_csv(file_path, index=False)

# KMeans

In [None]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [None]:
file_path = "new_iris_data.csv"
df_iris = pd.read_csv(file_path)
df_iris.head(10)

In [None]:
# Initializing model with K = 3 (since we already know there are three classes of iris plants)
model = KMeans(n_clusters=3, random_state=5)
model

In [None]:
# Fitting model
model.fit(df_iris)

In [None]:
# Get the predictions
predictions = model.predict(df_iris)
print(predictions)

In [None]:
# Add a new class column to the df_iris
df_iris["class"] = model.labels_
df_iris.head()

In [None]:
import plotly.express as px
import hvplot.pandas

In [None]:
# Create a scatterplot of df_iris
df_iris.hvplot.scatter(x="sepal_length", y="sepal_width", by="class")

In [None]:
# Plotting the clusters with three features
fig = px.scatter_3d(df_iris, x="petal_width", y="sepal_length", z="petal_length", color="class", symbol="class", size="sepal_width",width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [None]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [None]:
# Load data
file_path = "shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

In [None]:
df_shopping.hvplot.scatter(x="annual_income", y="spending_score")

In [None]:
# Function to cluster a dataset and label its rows
def test_cluster_amount(df, clusters):
    """Fit K-Means with ``clusters`` clusters and label ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster. Every column except an existing "class" column is
        used as a feature.
    clusters : int
        Number of clusters to fit.

    Returns
    -------
    None
        ``df`` is modified in place: a "class" column holding each row's
        cluster label is added, or overwritten if already present.
    """
    # Leave out labels written by an earlier call so that re-running the
    # function does not cluster on its own previous output.
    features = df.drop(columns=["class"], errors="ignore")

    # Use the requested number of clusters
    model = KMeans(n_clusters=clusters, random_state=5)

    # Fitting model
    model.fit(features)

    # Add (or refresh) the class column on the DataFrame that was passed in
    df["class"] = model.labels_

In [None]:
test_cluster_amount(df_shopping, 2)
df_shopping.hvplot.scatter(x="annual_income", y="spending_score", by="class")

In [None]:
# 3D scatter of annual_income (x), spending_score (y) and age (z); both the
# color and the marker symbol are taken from the "class" column
fig = px.scatter_3d(
    df_shopping,
    x="annual_income",
    y="spending_score",
    z="age",
    color="class",
    symbol="class",
    width=800,
)
# Place the legend at layout position x=0, y=1
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Elbow Curve

In [None]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [None]:
# Loading data
file_path = "new_iris_data.csv"
df_iris = pd.read_csv(file_path)

df_iris.head(10)

In [None]:
inertia = []
k = list(range(1, 11))

In [None]:
# Looking for the best K
# Start from an empty list so that re-running this cell does not append a
# second set of values and leave `inertia` longer than `k`.
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris)
    inertia.append(km.inertia_)

In [None]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [None]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [None]:
# Load data
file_path = "shopping_data_cleaned.csv"
df_shopping = pd.read_csv(file_path)
df_shopping.head(10)

In [None]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_shopping)
    inertia.append(km.inertia_)

In [None]:
elbow_data = {"k":k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
def get_clusters(k, data):
    """Return a copy of ``data`` with a K-Means cluster label for each row.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    data : pandas.DataFrame
        Data to cluster; all of its columns are passed to ``KMeans.fit``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an added "class" column containing the
        cluster label of each row. The input frame is not modified.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    data = data.copy()

    # Initialize and fit the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(data)

    # labels_ already holds the cluster assigned to every fitted row, so a
    # separate predict() pass over the same data is not needed
    data["class"] = model.labels_

    return data

In [None]:
five_clusters = get_clusters(5, df_shopping)
five_clusters.head()

In [None]:
six_clusters = get_clusters(6, df_shopping)
six_clusters.head()

In [None]:
five_clusters.hvplot.scatter(x="annual_income", y="spending_score", by="class")

In [None]:
# Plot the 3D-scatter with x="age", y="spending_score" and z="annual_income"
fig = px.scatter_3d(
    five_clusters,
    x="age",
    y="spending_score",
    z="annual_income",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
six_clusters.hvplot.scatter(x="annual_income", y="spending_score", by="class")

In [None]:
# Plot the six-cluster result as a 3D-scatter with x="age",
# y="spending_score" and z="annual_income", colored by the "class" column
fig = px.scatter_3d(
    six_clusters,
    x="age",
    y="spending_score",
    z="annual_income",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()