## In-Class Assignment 4
#### Data Mining and Analysis
#### 30th July 2024

### Prajwal Luitel (C0927658)

## 1. Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

## 2. Fetching the data

In [2]:
# Load the iris dataset
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## 3. Data Preprocessing

In [3]:
# Properly labeling the target values for setosa, versicolor and virginica in place of 0, 1 and 2
iris_df['target'].replace({0:'setosa', 1:'versicolor', 2:'virginica'}, inplace=True)
iris_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris_df['target'].replace({0:'setosa', 1:'versicolor', 2:'virginica'}, inplace=True)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Verifying the target values
iris_df['target'].value_counts()

target
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

#### Checking for null values

In [5]:
iris_df.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

#### Standardizing the data using scaling

In [6]:
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df.iloc[:, :-1])

# Convert the scaled features back to a DataFrame for easier manipulation
iris_scaled_df = pd.DataFrame(iris_scaled, columns=iris.feature_names)
iris_scaled_df['target'] = iris_df['target']

# Display the first few rows of the scaled dataframe
print(iris_scaled_df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.900681          1.019004          -1.340227         -1.315444   
1          -1.143017         -0.131979          -1.340227         -1.315444   
2          -1.385353          0.328414          -1.397064         -1.315444   
3          -1.506521          0.098217          -1.283389         -1.315444   
4          -1.021849          1.249201          -1.340227         -1.315444   

   target  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


## 4. K-Means Clustering

In [7]:
# Apply k-means clustering for k=2, and visualize the results
k=2
kmeans = KMeans(n_clusters=k, random_state=42)
iris_scaled_df['cluster'] = kmeans.fit_predict(iris_scaled_df.iloc[:, :-1])

# 2D scatter plot
scatter_2d_fig = px.scatter(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1], color='cluster',
                            title=f'K-Means Clustering with k={k} (2D)')
scatter_2d_fig.show()

# 3D scatter plot
scatter_3d_fig = px.scatter_3d(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1],
                                z=iris.feature_names[2], color='cluster',
                                title=f'K-Means Clustering with k={k} (3D)')
scatter_3d_fig.show()

In [8]:
# Apply k-means clustering for k=3 and visualize the results

k=3
kmeans = KMeans(n_clusters=k, random_state=42)
clustering_df = iris_scaled_df[[col for col in iris_scaled_df.columns if col != 'target']]
iris_scaled_df['cluster'] = kmeans.fit_predict(clustering_df)

# 2D scatter plot
scatter_2d_fig = px.scatter(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1], color='cluster',
                            title=f'K-Means Clustering with k={k} (2D)')
scatter_2d_fig.show()

# 3D scatter plot
scatter_3d_fig = px.scatter_3d(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1],
                                z=iris.feature_names[2], color='cluster',
                                title=f'K-Means Clustering with k={k} (3D)')
scatter_3d_fig.show()

In [9]:
# Apply k-means clustering for k=2, 3, and 4 and visualize the results

k=4
kmeans = KMeans(n_clusters=k, random_state=42)
clustering_df = iris_scaled_df[[col for col in iris_scaled_df.columns if col != 'target']]
iris_scaled_df['cluster'] = kmeans.fit_predict(clustering_df)

# 2D scatter plot
scatter_2d_fig = px.scatter(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1], color='cluster',
                            title=f'K-Means Clustering with k={k} (2D)')
scatter_2d_fig.show()

# 3D scatter plot
scatter_3d_fig = px.scatter_3d(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1],
                                z=iris.feature_names[2], color='cluster',
                                title=f'K-Means Clustering with k={k} (3D)')
scatter_3d_fig.show()

In [10]:
# Apply k-means clustering for k=5 and visualize the results

k=5
kmeans = KMeans(n_clusters=k, random_state=42)
clustering_df = iris_scaled_df[[col for col in iris_scaled_df.columns if col != 'target']]
iris_scaled_df['cluster'] = kmeans.fit_predict(clustering_df)

# 2D scatter plot
scatter_2d_fig = px.scatter(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1], color='cluster',
                            title=f'K-Means Clustering with k={k} (2D)')
scatter_2d_fig.show()

# 3D scatter plot
scatter_3d_fig = px.scatter_3d(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1],
                                z=iris.feature_names[2], color='cluster',
                                title=f'K-Means Clustering with k={k} (3D)')
scatter_3d_fig.show()

In [11]:
# Apply k-means clustering for k=6 and visualize the results

k=6
kmeans = KMeans(n_clusters=k, random_state=42)
clustering_df = iris_scaled_df[[col for col in iris_scaled_df.columns if col != 'target']]
iris_scaled_df['cluster'] = kmeans.fit_predict(clustering_df)

# 2D scatter plot
scatter_2d_fig = px.scatter(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1], color='cluster',
                            title=f'K-Means Clustering with k={k} (2D)')
scatter_2d_fig.show()

# 3D scatter plot
scatter_3d_fig = px.scatter_3d(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1],
                                z=iris.feature_names[2], color='cluster',
                                title=f'K-Means Clustering with k={k} (3D)')
scatter_3d_fig.show()

In [12]:
# Apply k-means clustering for k=7 and visualize the results

k=7
kmeans = KMeans(n_clusters=k, random_state=42)
clustering_df = iris_scaled_df[[col for col in iris_scaled_df.columns if col != 'target']]
iris_scaled_df['cluster'] = kmeans.fit_predict(clustering_df)

# 2D scatter plot
scatter_2d_fig = px.scatter(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1], color='cluster',
                            title=f'K-Means Clustering with k={k} (2D)')
scatter_2d_fig.show()

# 3D scatter plot
scatter_3d_fig = px.scatter_3d(iris_scaled_df, x=iris.feature_names[0], y=iris.feature_names[1],
                                z=iris.feature_names[2], color='cluster',
                                title=f'K-Means Clustering with k={k} (3D)')
scatter_3d_fig.show()

## Validation using the Elbow method

In [14]:
# Apply k-means clustering and calculate the within-cluster sum of squares (inertia)
inertia = []
k_values = range(1, 11)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(iris_scaled_df[[col for col in iris_scaled_df if col != 'target']])
    inertia.append(kmeans.inertia_)

# Plot the elbow curve using plotly.express
elbow_fig = px.line(x=k_values, y=inertia, markers=True, labels={'x': 'Number of clusters (k)', 'y': 'Inertia'})
elbow_fig.update_layout(title='Elbow Method For Optimal k')
elbow_fig.show()

In [18]:
from sklearn.metrics import silhouette_score
import plotly.graph_objs as go
import plotly.io as pio


# Range of clusters to try
range_n_clusters = list(range(2, 11))

# Store silhouette scores
silhouette_scores = []

# Calculate silhouette scores for each number of clusters
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(iris_scaled_df[[col for col in iris_scaled_df if col != 'target']])
    
    # Calculate the silhouette score
    silhouette_avg = silhouette_score(iris_scaled_df[[col for col in iris_scaled_df if col != 'target']], cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=range_n_clusters, 
    y=silhouette_scores, 
    mode='lines+markers',
    marker=dict(size=10),
    line=dict(width=2),
    name='Silhouette Score'
))

# Add titles and labels
fig.update_layout(
    title='Silhouette Score vs Number of Clusters',
    xaxis=dict(title='Number of clusters'),
    yaxis=dict(title='Silhouette Score'),
    template='plotly'
)

# Show the plot
pio.show(fig)