# Unsupervised Learning: Country Data Analysis

This notebook performs clustering on country data to identify socio-economic patterns. We will use:
1. **K-Means Clustering**
2. **Hierarchical Clustering**
3. **DBSCAN**

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Select the 'seaborn-v0_8' matplotlib style sheet once, up front, so every
# figure drawn later in the notebook shares the same look.
plt.style.use('seaborn-v0_8')

## 2. Load and Explore Data

In [None]:
# Read the country-level dataset and take a first look at it.
df = pd.read_csv('Country-data.csv')
print("Shape:", df.shape)
display(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
df.info()

## 3. Data Preprocessing
Clustering algorithms are sensitive to the scale of data. We must scale the features.

In [None]:
# Keep the country identifiers aside; they are labels, not clustering inputs.
country_names = df.loc[:, 'country']

# Everything except the identifier column is used as a clustering feature.
features = df.drop(columns='country')

# Standardize the features: learn the scaling parameters first, then apply
# them to the same frame.
scaler = StandardScaler().fit(features)
X_scaled = scaler.transform(features)

print("Data Scaled.")

## 4. K-Means Clustering

In [None]:
# Elbow Method to find optimal K
# Candidate cluster counts, defined once so the fitting loop and the plot's
# x-axis cannot drift apart.
k_values = range(1, 11)

wcss = []
for k in k_values:
    # n_init is pinned explicitly: the library default differs between
    # scikit-learn releases, which would make the curve depend on the
    # installed version and trigger a FutureWarning on some of them.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    # inertia_ is the within-cluster sum of squares for this k.
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 5))
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fitting K-Means with K=3 (Visual estimation from Elbow method, usually 3 or 4 for this dataset)
# n_init is set explicitly so the number of centroid restarts -- and therefore
# the reported clustering -- does not depend on the installed scikit-learn
# version's default.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

kmeans_silhouette = silhouette_score(X_scaled, y_kmeans)
print(f"Silhouette Score for K-Means: {kmeans_silhouette:.4f}")

# Visualize using PCA (reducing to 2D for plotting).
# X_pca is reused by the later clustering cells for their scatter plots.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_kmeans, palette='viridis', s=100)
plt.title('K-Means Clusters (PCA Reduced)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

## 5. Hierarchical Clustering

In [None]:
# Plot Dendrogram
plt.figure(figsize=(12, 6))
# Build the Ward linkage matrix from the scaled features.
ward_linkage = linkage(X_scaled, method='ward')
# Store the result under a different name than the function: assigning it to
# `dendrogram` would rebind the imported scipy function to a dict, and
# re-running this cell would then fail with "'dict' object is not callable".
dendrogram_data = dendrogram(ward_linkage)
plt.title('Dendrogram')
plt.xlabel('Countries')
plt.ylabel('Euclidean Distances')
plt.show()

In [None]:
# Apply Agglomerative Clustering
hc = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(X_scaled)

hc_silhouette = silhouette_score(X_scaled, y_hc)
print(f"Silhouette Score for Hierarchical: {hc_silhouette:.4f}")

# Plot on the 2D PCA projection (X_pca) computed in the K-Means section.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_hc, palette='coolwarm', s=100)
plt.title('Hierarchical Clusters (PCA Reduced)')
# Axis labels match the K-Means scatter so the figures can be compared directly.
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

## 6. DBSCAN Clustering
DBSCAN is density-based. It groups points that are closely packed together and marks points in low-density regions as outliers (-1).

In [None]:
# Apply DBSCAN
# eps and min_samples need tuning. For this dataset, eps around 1-2 often works.
dbscan = DBSCAN(eps=1.5, min_samples=3)
y_dbscan = dbscan.fit_predict(X_scaled)

# DBSCAN labels noise points as -1; they do not form a cluster.
noise_mask = y_dbscan == -1

# Check number of clusters (ignoring noise -1)
n_clusters_ = len(set(y_dbscan[~noise_mask]))
print(f'Estimated number of clusters: {n_clusters_}')
print(f'Number of noise points: {int(noise_mask.sum())}')

# The silhouette score is only defined for at least two clusters, and the
# noise label is not a real cluster. Score the clustered points only, and
# skip the metric instead of raising ValueError when DBSCAN finds fewer than
# two clusters.
if n_clusters_ >= 2:
    dbscan_silhouette = silhouette_score(X_scaled[~noise_mask], y_dbscan[~noise_mask])
    print(f"Silhouette Score for DBSCAN: {dbscan_silhouette:.4f}")
else:
    print("Silhouette Score for DBSCAN: undefined (fewer than 2 clusters found)")

# Plot on the 2D PCA projection (X_pca) computed in the K-Means section.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_dbscan, palette='deep', s=100)
plt.title('DBSCAN Clusters (PCA Reduced)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()