
# Unsupervised Learning Lab: Customer Segmentation - Solutions

## Exercise 1: Data Preprocessing and Exploration
```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# The three numeric columns that are rescaled here and clustered on later.
feature_cols = ['Age', 'AnnualIncome', 'SpendingScore']

# Read the raw customer records from disk.
df = pd.read_csv('customer_data.csv')

# Quick look at the data: the first rows, then per-column summary statistics.
for preview in (df.head(), df.describe()):
    print(preview)

# Rescale each feature with StandardScaler (zero mean, unit variance by
# default). The scaled values overwrite the raw columns in `df`, so every
# later step works on standardized inputs rather than original units.
scaler = StandardScaler()
standardized = scaler.fit_transform(df[feature_cols])
df[feature_cols] = standardized
```

## Exercise 2: K-Means Clustering
```python
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Feature frame used by every K-Means fit below (a copy taken before the
# cluster-label column is added to `df`).
features = df[['Age', 'AnnualIncome', 'SpendingScore']]

# Apply K-Means clustering.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the result (and the
# FutureWarning emitted by 1.2/1.3) depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['KMeans_Cluster'] = kmeans.fit_predict(features)

# Determine the optimal number of clusters using the Elbow method.
# The candidate models are built inline instead of being assigned to
# `kmeans`, so `kmeans` keeps referring to the 3-cluster model that produced
# `KMeans_Cluster` (the original loop left it bound to the k=10 model).
k_values = range(1, 11)
sse = [
    # inertia_ is the sum of squared distances of samples to their closest
    # cluster centre.
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(features).inertia_
    for k in k_values
]

# Plot SSE against k; the "elbow" is where adding clusters stops paying off.
plt.figure(figsize=(10, 6))
plt.plot(k_values, sse, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.show()

# Visualize the clusters (axes show the standardized feature values).
plt.figure(figsize=(10, 6))
plt.scatter(df['AnnualIncome'], df['SpendingScore'], c=df['KMeans_Cluster'], cmap='viridis')
plt.title('K-Means Clustering')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.show()
```

## Exercise 3: Hierarchical Clustering
```python
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Feature frame shared by the linkage computation and the clustering model.
hier_features = df[['Age', 'AnnualIncome', 'SpendingScore']]

# Build the linkage matrix with Ward's method; it records the full sequence
# of merges that the dendrogram draws.
Z = linkage(hier_features, method='ward')

# Create the figure first; dendrogram() draws on the current Axes, which is
# the one just created.
fig, ax = plt.subplots(figsize=(10, 6))
dendrogram(Z)
ax.set_title('Dendrogram')
ax.set_xlabel('Customers')
ax.set_ylabel('Euclidean distances')
plt.show()

# Cut the hierarchy into three clusters (the count is hard-coded here; it is
# meant to be read off the dendrogram above).
hc = AgglomerativeClustering(n_clusters=3)
hier_labels = hc.fit_predict(hier_features)
df['Hierarchical_Cluster'] = hier_labels

# Scatter of income vs. spending score, coloured by hierarchical cluster.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df['AnnualIncome'],
    df['SpendingScore'],
    c=df['Hierarchical_Cluster'],
    cmap='viridis',
)
ax.set_title('Hierarchical Clustering')
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
plt.show()
```

## Exercise 4: DBSCAN Clustering
```python
from sklearn.cluster import DBSCAN

# Density-based clustering: `eps` is the neighbourhood radius and
# `min_samples` the number of neighbours needed for a core point.
dbscan = DBSCAN(eps=0.5, min_samples=5)
density_labels = dbscan.fit_predict(df[['Age', 'AnnualIncome', 'SpendingScore']])
df['DBSCAN_Cluster'] = density_labels

# Scatter of income vs. spending score, coloured by DBSCAN label.
# Points DBSCAN treats as noise carry the label -1 and get their own colour.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df['AnnualIncome'],
    df['SpendingScore'],
    c=df['DBSCAN_Cluster'],
    cmap='viridis',
)
ax.set_title('DBSCAN Clustering')
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
plt.show()
```


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardize.
numeric_cols = ['Age', 'AnnualIncome', 'SpendingScore']

# Pull the customer table into a DataFrame.
df = pd.read_csv('customer_data.csv')

# Inspect it: leading rows first, then descriptive statistics.
head_rows = df.head()
summary_stats = df.describe()
print(head_rows)
print(summary_stats)

# Fit the scaler on the numeric columns and write the scaled values back
# over the originals, so `df` holds standardized features from here on.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

   CustomerID  Age  AnnualIncome  SpendingScore
0           1   56         81228             59
1           2   69         68984             32
2           3   46         60774             96
3           4   32         22568             88
4           5   60         82592             52
       CustomerID         Age   AnnualIncome  SpendingScore
count  100.000000  100.000000     100.000000     100.000000
mean    50.500000   43.350000   69474.690000      48.760000
std     29.011492   14.904663   29863.619229      31.064976
min      1.000000   19.000000   20206.000000       1.000000
25%     25.750000   31.750000   43229.750000      20.000000
50%     50.500000   42.000000   70325.500000      52.000000
75%     75.250000   57.000000   95529.000000      73.500000
max    100.000000   69.000000  118806.000000      99.000000


In [2]:
# Show the DataFrame after scaling: Age, AnnualIncome and SpendingScore now
# hold standardized values, while CustomerID is unchanged.
df

Unnamed: 0,CustomerID,Age,AnnualIncome,SpendingScore
0,1,0.853003,0.395549,0.331292
1,2,1.729608,-0.016514,-0.542232
2,3,0.178692,-0.292815,1.528345
3,4,-0.765343,-1.578610,1.269522
4,5,1.122728,0.441453,0.104823
...,...,...,...,...
95,96,-0.091032,1.616391,-0.801054
96,97,1.257590,1.636381,0.169528
97,98,0.987866,-1.209187,-0.509880
98,99,0.178692,0.562844,-0.801054


In [3]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Apply K-Means clustering with three clusters on the standardized features.
# n_init is set explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so an implicit value makes the labels (and the
# FutureWarning emitted by 1.2/1.3) depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['KMeans_Cluster'] = kmeans.fit_predict(df[['Age', 'AnnualIncome', 'SpendingScore']])

# Determine the optimal number of clusters using the Elbow method (computed in the next cell)




In [4]:
# Elbow method: collect the inertia (sum of squared distances of samples to
# their closest cluster centre) for k = 1..10.
# The candidate models are built inline rather than assigned to `kmeans`, so
# the 3-cluster model fitted in the previous cell is not overwritten.
# n_init is pinned because scikit-learn's default changed in version 1.4.
elbow_features = df[['Age', 'AnnualIncome', 'SpendingScore']]
sse = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(elbow_features).inertia_
    for k in range(1, 11)
]


