# Multi-Pollutant Profile Grouping of Air Quality Data Using K-Means Clustering

## Comprehensive Analysis Covering All Presenter Sections

This notebook demonstrates a manual K-means clustering implementation on air quality data, without using sklearn. We'll analyze pollution patterns at different scales - from city level to country level - using various pollutant combinations.

### Presentation Structure:
1. **Presenter 1**: Simple 2D Clustering (PM2.5 and PM10)
2. **Presenter 2**: Country-Level AQI Clustering  
3. **Presenter 3**: City-Level AQI Clustering
4. **Presenter 4**: Two-Pollutant City Clustering (PM2.5 and NO2)
5. **Presenter 5**: Multi-Feature Three-Pollutant Clustering
6. **Results Comparison and Analysis**

## Section 1: Data Preparation and Imports

First, let's import the necessary libraries and prepare our dataset. We'll implement everything manually without using sklearn.

In [None]:
import csv
import math
import random
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

# Set up plotting style
plt.style.use('default')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12

# Manual K-Means Implementation
class ManualKMeans:
    """K-means clustering implemented with plain Python lists.

    The input may be 1-D (a list of numbers) or multi-dimensional (a list of
    equal-length numeric sequences).

    After ``fit`` the instance exposes:
        centroids:           one centroid per cluster, indexed by cluster id
        history:             the centroid list after initialisation and
                             after every accepted update
        final_clusters:      the data points grouped per cluster id
        cluster_assignments: the cluster id of each point, in input order
    """

    # Largest per-coordinate centroid movement still treated as "no change".
    CONVERGENCE_TOLERANCE = 1e-6

    def __init__(self, k=2, max_iters=100, random_state=42):
        self.k = k
        self.max_iters = max_iters
        self.random_state = random_state
        # NOTE: this seeds the module-level ``random`` generator, so it also
        # affects every other user of ``random`` in the process.
        random.seed(random_state)

    @staticmethod
    def _is_scalar(value):
        """Return True when *value* is a plain number (1-D data point)."""
        return isinstance(value, (int, float))

    def euclidean_distance(self, point1, point2):
        """Calculate Euclidean distance between two points.

        Two plain numbers are compared with an absolute difference;
        sequences are compared coordinate by coordinate.
        """
        if self._is_scalar(point1) and self._is_scalar(point2):
            return abs(point1 - point2)

        squared = 0
        for a, b in zip(point1, point2):
            squared += (a - b) ** 2
        return math.sqrt(squared)

    def initialize_centroids(self, data):
        """Draw ``k`` random centroids inside the bounding range of *data*.

        Returns an empty list when *data* is empty.
        """
        if not data:
            return []

        if self._is_scalar(data[0]):
            # 1D data
            low, high = min(data), max(data)
            return [random.uniform(low, high) for _ in range(self.k)]

        # Multi-dimensional data: the per-feature bounds do not depend on the
        # centroid being drawn, so compute them once up front.
        n_features = len(data[0])
        bounds = [
            (min(point[j] for point in data), max(point[j] for point in data))
            for j in range(n_features)
        ]
        return [
            [random.uniform(low, high) for low, high in bounds]
            for _ in range(self.k)
        ]

    def assign_clusters(self, data, centroids):
        """Assign each point to the nearest centroid.

        Returns ``(clusters, cluster_assignments)``: ``clusters`` holds
        ``k`` lists of points, ``cluster_assignments`` holds the chosen
        cluster id for every point in input order. Ties go to the lowest
        cluster id.
        """
        clusters = [[] for _ in range(self.k)]
        cluster_assignments = []

        for point in data:
            distances = [self.euclidean_distance(point, c) for c in centroids]
            closest_cluster = distances.index(min(distances))
            clusters[closest_cluster].append(point)
            cluster_assignments.append(closest_cluster)

        return clusters, cluster_assignments

    def update_centroids(self, clusters, previous_centroids=None):
        """Compute the mean of every cluster.

        Args:
            clusters: list of point lists, one per cluster id.
            previous_centroids: optional centroids from the previous
                iteration. When given, an empty cluster keeps its previous
                centroid so that list positions keep matching cluster ids.
                When omitted, empty clusters are skipped (legacy behaviour).

        Returns:
            The list of updated centroids.
        """
        new_centroids = []

        for index, cluster in enumerate(clusters):
            if not cluster:
                if previous_centroids is not None and index < len(previous_centroids):
                    new_centroids.append(previous_centroids[index])
                continue

            if self._is_scalar(cluster[0]):
                # 1D data
                centroid = sum(cluster) / len(cluster)
            else:
                # Multi-dimensional data
                n_features = len(cluster[0])
                centroid = [
                    sum(point[j] for point in cluster) / len(cluster)
                    for j in range(n_features)
                ]

            new_centroids.append(centroid)

        return new_centroids

    def _has_converged(self, old_centroids, new_centroids):
        """Return True when no centroid coordinate moved more than the tolerance."""
        if len(old_centroids) != len(new_centroids):
            return False

        tolerance = self.CONVERGENCE_TOLERANCE
        for old, new in zip(old_centroids, new_centroids):
            if self._is_scalar(old):
                if abs(old - new) > tolerance:
                    return False
            elif any(abs(o - n) > tolerance for o, n in zip(old, new)):
                return False
        return True

    def fit(self, data):
        """Fit K-means to *data* and return ``self``.

        Iterates assignment and centroid update until the centroids stop
        moving or ``max_iters`` is reached. Prints a message on convergence.
        """
        self.centroids = self.initialize_centroids(data)
        self.history = [list(self.centroids)]

        for iteration in range(self.max_iters):
            clusters, _ = self.assign_clusters(data, self.centroids)
            # Passing the current centroids keeps one centroid per cluster id
            # even when a cluster received no points in this iteration.
            new_centroids = self.update_centroids(
                clusters, previous_centroids=self.centroids
            )

            if self._has_converged(self.centroids, new_centroids):
                print(f"Converged after {iteration + 1} iterations")
                break

            self.centroids = new_centroids
            self.history.append(list(self.centroids))

        self.final_clusters, self.cluster_assignments = self.assign_clusters(data, self.centroids)
        return self

# Helper functions
def load_air_quality_data(path='s:\\AIML\\global_air_quality_data_10000.csv'):
    """Load the air quality dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the location this
            notebook was originally written against.

    Returns:
        A list with one dict per CSV row, keyed by the header names. Values
        are the strings produced by ``csv.DictReader``; no numeric
        conversion happens here.
    """
    # newline='' hands newline handling to the csv module, which is what it
    # expects in order to treat newlines inside quoted fields correctly.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.DictReader(file))

def get_city_averages(data, cities, pollutants):
    """Average each pollutant per city, using only fully populated records.

    A record contributes to its city only when every requested pollutant is
    present and parses as a float. Cities without any such record are left
    out of the result.

    Returns:
        dict mapping city name -> list of averages, ordered like *pollutants*.
    """
    complete_rows = defaultdict(list)

    for row in data:
        city_name = row['City']
        if city_name not in cities:
            continue

        parsed = {}
        for name in pollutants:
            try:
                parsed[name] = float(row[name])
            except (ValueError, KeyError):
                continue

        # Keep the row only if nothing was missing or unparsable.
        if len(parsed) == len(pollutants):
            complete_rows[city_name].append(parsed)

    averages = {}
    for city_name in cities:
        rows = complete_rows.get(city_name)
        if not rows:
            continue
        averages[city_name] = [
            sum(entry[name] for entry in rows) / len(rows)
            for name in pollutants
        ]

    return averages

def calculate_aqi_simple(pm25, pm10, no2):
    """Return a simplified 0-100 pollution score from three pollutants.

    Each concentration is divided by a fixed reference value (35 for PM2.5,
    50 for PM10, 40 for NO2), capped at 1.0 and scaled to 100; the largest
    of the three scores is returned.
    """
    scores = []
    for concentration, reference in ((pm25, 35.0), (pm10, 50.0), (no2, 40.0)):
        ratio = concentration / reference
        scores.append(min(ratio, 1.0) * 100)
    return max(scores)

# Read every CSV row into memory and report how many were found
print("Loading air quality dataset...")
air_quality_data = load_air_quality_data()
print(f"Loaded {len(air_quality_data)} records")

# Show the leading rows so the available columns are visible
print("\nFirst 3 records:")
for position, record in enumerate(air_quality_data[:3], start=1):
    print(f"Record {position}: {record}")

: 

## Section 2: Presenter 1 - Simple 2D Clustering (PM2.5 and PM10)

**Topic**: Clustering cities using two pollutants: PM2.5 and PM10  
**Dataset Focus**: Sample of 10 cities with average PM2.5 and PM10 values  
**Method**: Manual K-Means clustering with K=2 clusters

In [None]:
# Presenter 1: Simple 2D Clustering with PM2.5 and PM10

def get_city_averages(data, cities, pollutants):
    """Average each pollutant per city, pollutant by pollutant.

    Every value that parses as a float is counted for its own pollutant,
    even if other pollutants in the same record are missing or invalid.
    A city appears in the result once it has at least one usable value; a
    pollutant with no usable value for that city is reported as 0.

    Returns:
        dict mapping city name -> list of averages, ordered like *pollutants*.
    """
    # city -> pollutant -> list of parsed readings
    readings = defaultdict(lambda: defaultdict(list))

    for row in data:
        city_name = row['City']
        if city_name not in cities:
            continue
        for name in pollutants:
            try:
                parsed = float(row[name])
            except (ValueError, KeyError):
                continue
            readings[city_name][name].append(parsed)

    averages = {}
    for city_name in cities:
        if city_name not in readings:
            continue
        per_pollutant = readings[city_name]
        means = []
        for name in pollutants:
            samples = per_pollutant.get(name)
            means.append(sum(samples) / len(samples) if samples else 0)
        averages[city_name] = means

    return averages

# Ten cities used for the two-pollutant demonstration
selected_cities = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris', 'Tokyo',
    'New York', 'London', 'Cairo', 'Mexico City', 'Seoul',
]

# Per-city mean of PM2.5 and PM10
pollutants = ['PM2.5', 'PM10']
city_averages = get_city_averages(air_quality_data, selected_cities, pollutants)

print("Presenter 1: City averages for PM2.5 and PM10")
print("=" * 50)
for city, values in city_averages.items():
    pm25_avg, pm10_avg = values[0], values[1]
    print(f"{city:15}: PM2.5={pm25_avg:6.2f}, PM10={pm10_avg:6.2f}")

# Parallel lists: city_names[i] is the label of clustering_data[i]
city_names = list(city_averages)
clustering_data = [city_averages[name] for name in city_names]

print(f"\nClustering data shape: {len(clustering_data)} cities × {len(pollutants)} features")

In [None]:
# Perform K-Means clustering with K=2
kmeans_p1 = ManualKMeans(k=2, random_state=42)
kmeans_p1.fit(clustering_data)

print("\nPresenter 1: K-Means Clustering Results (K=2)")
print("=" * 50)

# Display final centroids
print("Final Centroids:")
for i, centroid in enumerate(kmeans_p1.centroids):
    print(f"Cluster {i+1}: PM2.5={centroid[0]:6.2f}, PM10={centroid[1]:6.2f}")

print("\nCluster Assignments:")
for city, cluster_id in zip(city_names, kmeans_p1.cluster_assignments):
    values = city_averages[city]
    print(f"{city:15}: Cluster {cluster_id+1} (PM2.5={values[0]:6.2f}, PM10={values[1]:6.2f})")

# Group the cities by cluster once; both charts below reuse this grouping.
cities_by_cluster = {i: [] for i in range(kmeans_p1.k)}
for city, cluster_id in zip(city_names, kmeans_p1.cluster_assignments):
    cities_by_cluster[cluster_id].append(city)

# K-means numbers its clusters arbitrarily, so the descriptive name must be
# derived from the centroid itself: the cluster whose centroid has the larger
# combined PM2.5 + PM10 concentration is the "higher pollution" group.
centroid_totals = [sum(centroid) for centroid in kmeans_p1.centroids]
ranked_clusters = sorted(range(len(centroid_totals)),
                         key=lambda idx: centroid_totals[idx], reverse=True)
group_labels = ['Higher Pollution Group', 'Lower Pollution Group']
cluster_names = {
    cluster_id: group_labels[min(rank, len(group_labels) - 1)]
    for rank, cluster_id in enumerate(ranked_clusters)
}

# Create detailed visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Scatter plot with clusters
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

for i in range(kmeans_p1.k):
    cluster_cities = cities_by_cluster[i]
    cluster_data = [city_averages[city] for city in cluster_cities]

    if cluster_data:
        pm25_values = [point[0] for point in cluster_data]
        pm10_values = [point[1] for point in cluster_data]
        ax1.scatter(pm25_values, pm10_values, c=colors[i], label=f'Cluster {i+1}: {cluster_names[i]}',
                   s=120, alpha=0.8, edgecolors='black', linewidth=1)

        # Add city labels
        for city, pm25, pm10 in zip(cluster_cities, pm25_values, pm10_values):
            ax1.annotate(city, (pm25, pm10), xytext=(5, 5), textcoords='offset points',
                        fontsize=9, fontweight='bold')

# Plot centroids
for i, centroid in enumerate(kmeans_p1.centroids):
    ax1.scatter(centroid[0], centroid[1], c='black', marker='X', s=300,
                edgecolors=colors[i], linewidth=3, label=f'Centroid {i+1}')

ax1.set_xlabel('PM2.5 (μg/m³)', fontsize=12, fontweight='bold')
ax1.set_ylabel('PM10 (μg/m³)', fontsize=12, fontweight='bold')
ax1.set_title('Presenter 1: City Clustering by PM2.5 and PM10 (K=2)', fontsize=14, fontweight='bold')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Plot 2: Bar chart showing cluster statistics (empty clusters are skipped)
cluster_stats = {}
for i in range(kmeans_p1.k):
    cluster_data = [city_averages[city] for city in cities_by_cluster[i]]

    if cluster_data:
        avg_pm25 = sum(point[0] for point in cluster_data) / len(cluster_data)
        avg_pm10 = sum(point[1] for point in cluster_data) / len(cluster_data)
        cluster_stats[f'Cluster {i+1}'] = {'PM2.5': avg_pm25, 'PM10': avg_pm10, 'Cities': len(cluster_data)}

x_pos = np.arange(len(cluster_stats))
pm25_values = [stats['PM2.5'] for stats in cluster_stats.values()]
pm10_values = [stats['PM10'] for stats in cluster_stats.values()]

width = 0.35
bars1 = ax2.bar(x_pos - width/2, pm25_values, width, label='PM2.5', color=colors[0], alpha=0.8)
bars2 = ax2.bar(x_pos + width/2, pm10_values, width, label='PM10', color=colors[1], alpha=0.8)

ax2.set_xlabel('Clusters', fontsize=12, fontweight='bold')
ax2.set_ylabel('Average Concentration (μg/m³)', fontsize=12, fontweight='bold')
ax2.set_title('Average Pollutant Levels by Cluster', fontsize=14, fontweight='bold')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(cluster_stats.keys())
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax2.annotate(f'{height:.1f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n📊 Conclusion from Clustered Graph:")
print("The visualization reveals two distinct groups—one with higher pollution and one cleaner cluster—")
print("demonstrating a clear distinction even with just two features. This sets the foundation")
print("for more complex multi-feature clustering analyses.")

print(f"\nCluster Statistics:")
for cluster_name, stats in cluster_stats.items():
    print(f"  {cluster_name}: {stats['Cities']} cities, Avg PM2.5={stats['PM2.5']:.2f}, Avg PM10={stats['PM10']:.2f}")

## Section 3: Presenter 2 - Country-Level AQI Clustering

**Topic**: Grouping countries by average AQI into Low, Medium, and High pollution clusters  
**Dataset Focus**: Average simplified AQI for 6 countries, computed from each record's PM2.5, PM10 and NO2 values  
**Method**: 1D K-Means clustering with K=3 on AQI values

In [None]:
# Presenter 2: Country-Level AQI Clustering

def calculate_aqi_simple(pm25, pm10, no2):
    """
    Simplified pollution score (0-100) for demonstration purposes.

    Each pollutant is expressed as a fraction of a fixed reference
    concentration, capped at 1.0 and scaled to 100. The worst (largest) of
    the three scores is returned, i.e. the limiting pollutant decides.
    """
    # Reference concentrations in the same units as the inputs.
    # NOTE(review): the original comments attributed these to WHO guideline
    # values - confirm the source before citing them.
    reference_levels = (
        (pm25, 35.0),
        (pm10, 50.0),
        (no2, 40.0),
    )

    aqi = max(
        min(concentration / reference, 1.0) * 100
        for concentration, reference in reference_levels
    )
    return aqi

def get_country_aqi_averages(data, countries):
    """Return the mean simplified AQI per country.

    Only records whose 'Country' is in *countries* are considered. A record
    is skipped when PM2.5, PM10 or NO2 is missing or not a valid float.
    Countries without any usable record are omitted from the result.
    """
    scores_by_country = defaultdict(list)

    for row in data:
        name = row['Country']
        if name not in countries:
            continue
        try:
            fine_particles = float(row['PM2.5'])
            coarse_particles = float(row['PM10'])
            nitrogen_dioxide = float(row['NO2'])
            score = calculate_aqi_simple(fine_particles, coarse_particles, nitrogen_dioxide)
            scores_by_country[name].append(score)
        except (ValueError, KeyError):
            continue

    averages = {}
    for name in countries:
        scores = scores_by_country.get(name)
        if scores:
            averages[name] = sum(scores) / len(scores)

    return averages

# Six countries compared at national level
selected_countries = ['Thailand', 'Turkey', 'Brazil', 'India', 'France', 'USA']

# Mean simplified AQI per country
country_aqi = get_country_aqi_averages(air_quality_data, selected_countries)

print("Presenter 2: Country Average AQI Values")
print("=" * 40)
ranked_countries = sorted(country_aqi.items(), key=lambda pair: pair[1])
for country, aqi in ranked_countries:
    print(f"{country:12}: AQI = {aqi:6.2f}")

# 1D clustering input: country_names_p2[i] labels aqi_values[i]
country_names_p2 = list(country_aqi)
aqi_values = [country_aqi[name] for name in country_names_p2]

print(f"\nClustering data: {len(aqi_values)} countries with AQI values")

In [None]:
# Perform K-Means clustering with K=3 for country AQI
kmeans_p2 = ManualKMeans(k=3, random_state=42)
kmeans_p2.fit(aqi_values)

print("\nPresenter 2: K-Means Clustering Results (K=3)")
print("=" * 50)

# K-means numbers its clusters arbitrarily. Everything below labels and
# colours clusters as Low / Medium / High by position, so first re-rank the
# clusters by ascending centroid AQI: rank 0 is the cleanest cluster.
sorted_centroids = sorted(enumerate(kmeans_p2.centroids), key=lambda x: x[1])
rank_of_cluster = {orig_idx: rank for rank, (orig_idx, _) in enumerate(sorted_centroids)}
ranked_centroids = [centroid for _, centroid in sorted_centroids]
ranked_assignments = [rank_of_cluster[cluster_id] for cluster_id in kmeans_p2.cluster_assignments]
# Use the number of centroids actually available rather than assuming k.
n_clusters = len(ranked_centroids)

# Display final centroids
print("Final Centroids (AQI levels):")
centroid_labels = ['Low Pollution', 'Medium Pollution', 'High Pollution']

for rank, centroid in enumerate(ranked_centroids):
    print(f"Cluster {rank+1} ({centroid_labels[rank]}): AQI = {centroid:6.2f}")

print("\nCountry Cluster Assignments:")
for country, cluster_id, aqi in zip(country_names_p2, ranked_assignments, aqi_values):
    print(f"{country:12}: Cluster {cluster_id+1} - AQI = {aqi:6.2f}")

# Create comprehensive visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: 1D scatter plot along AQI axis
colors = ['#2ECC71', '#F39C12', '#E74C3C']  # Green, Orange, Red
pollution_levels = ['Low', 'Medium', 'High']

# Group countries by (ranked) cluster
cluster_groups = {i: [] for i in range(n_clusters)}
for country, cluster_id, aqi in zip(country_names_p2, ranked_assignments, aqi_values):
    cluster_groups[cluster_id].append((country, aqi))

y_offset = 0.1
for i in range(n_clusters):
    if cluster_groups[i]:
        countries, aqis = zip(*cluster_groups[i])
        # Small vertical jitter so points with similar AQI do not overlap
        y_positions = [i + random.uniform(-y_offset, y_offset) for _ in aqis]
        ax1.scatter(aqis, y_positions, c=colors[i], label=f'Cluster {i+1}', s=150, alpha=0.8, edgecolors='black')

        # Add country labels
        for country, aqi, y in zip(countries, aqis, y_positions):
            ax1.annotate(country, (aqi, y), xytext=(5, 0), textcoords='offset points',
                        fontsize=10, fontweight='bold', va='center')

# Plot centroids
for i, centroid in enumerate(ranked_centroids):
    ax1.axvline(x=centroid, color=colors[i], linestyle='--', linewidth=3, alpha=0.7)
    ax1.scatter(centroid, i, c='black', marker='X', s=300,
                edgecolors=colors[i], linewidth=3, zorder=5)

ax1.set_xlabel('Average AQI', fontsize=12, fontweight='bold')
ax1.set_ylabel('Cluster', fontsize=12, fontweight='bold')
ax1.set_title('Presenter 2: Country Clustering by Average AQI (K=3)', fontsize=14, fontweight='bold')
ax1.set_yticks(range(n_clusters))
ax1.set_yticklabels([f'Cluster {i+1}' for i in range(n_clusters)])
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Bar chart of AQI values by country
countries_sorted = sorted(zip(country_names_p2, aqi_values, ranked_assignments), key=lambda x: x[1])
countries, aqis_sorted, clusters = zip(*countries_sorted)

bars = ax2.bar(range(len(countries)), aqis_sorted,
               color=[colors[cluster] for cluster in clusters], alpha=0.8, edgecolor='black')
ax2.set_xlabel('Countries', fontsize=12, fontweight='bold')
ax2.set_ylabel('Average AQI', fontsize=12, fontweight='bold')
ax2.set_title('AQI Values by Country (Colored by Cluster)', fontsize=14, fontweight='bold')
ax2.set_xticks(range(len(countries)))
ax2.set_xticklabels(countries, rotation=45, ha='right')
ax2.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, aqi in zip(bars, aqis_sorted):
    ax2.annotate(f'{aqi:.1f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontweight='bold')

# Plot 3: Pie chart showing cluster distribution
cluster_counts = [len(cluster_groups[i]) for i in range(n_clusters)]
cluster_labels_pie = [f'Cluster {i+1}\n({pollution_levels[i]} Pollution)\n{count} countries'
                     for i, count in enumerate(cluster_counts)]

wedges, texts, autotexts = ax3.pie(cluster_counts, labels=cluster_labels_pie, colors=colors[:n_clusters],
                                  autopct='%1.1f%%', startangle=90, textprops={'fontweight': 'bold'})
ax3.set_title('Distribution of Countries by Pollution Level', fontsize=14, fontweight='bold')

# Plot 4: Centroid comparison
centroid_values = list(ranked_centroids)
bars4 = ax4.bar(range(n_clusters), centroid_values, color=colors[:n_clusters], alpha=0.8, edgecolor='black')
ax4.set_xlabel('Pollution Level', fontsize=12, fontweight='bold')
ax4.set_ylabel('Centroid AQI Value', fontsize=12, fontweight='bold')
ax4.set_title('Cluster Centroids Comparison', fontsize=14, fontweight='bold')
ax4.set_xticks(range(n_clusters))
ax4.set_xticklabels([f'{pollution_levels[i]}\nPollution' for i in range(n_clusters)])
ax4.grid(True, alpha=0.3, axis='y')

# Add value labels
for bar, value in zip(bars4, centroid_values):
    ax4.annotate(f'{value:.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n📊 Conclusion from Clustered Graph:")
print("Distinct partitions indicate national-level air quality differences. This clustering")
print("enables policymakers to focus resources on high-risk nations by identifying")
print("the pollution severity category.")

print(f"\nDetailed Cluster Analysis:")
for i in range(n_clusters):
    if cluster_groups[i]:
        print(f"  🌍 {pollution_levels[i]} Pollution Cluster ({len(cluster_groups[i])} countries):")
        print(f"     Centroid AQI: {ranked_centroids[i]:.2f}")
        print(f"     Countries: {', '.join([country for country, _ in cluster_groups[i]])}")
        avg_aqi = sum(aqi for _, aqi in cluster_groups[i]) / len(cluster_groups[i])
        print(f"     Average AQI: {avg_aqi:.2f}")
        print()

## Section 4: Presenter 3 - City-Level AQI Clustering

**Topic**: Clustering 8 cities based on AQI alone  
**Dataset Focus**: Average simplified AQI for 8 cities, computed from each record's PM2.5, PM10 and NO2 values  
**Method**: K=3 clusters created using manual assignment and centroid updates

In [None]:
# Presenter 3: City-Level AQI Clustering

def get_city_aqi_averages(data, cities):
    """Return the mean simplified AQI per city.

    Only records whose 'City' is in *cities* are considered. A record is
    skipped when PM2.5, PM10 or NO2 is missing or not a valid float. Cities
    without any usable record are omitted from the result.
    """
    scores_by_city = defaultdict(list)

    for row in data:
        name = row['City']
        if name not in cities:
            continue
        try:
            fine_particles = float(row['PM2.5'])
            coarse_particles = float(row['PM10'])
            nitrogen_dioxide = float(row['NO2'])
            score = calculate_aqi_simple(fine_particles, coarse_particles, nitrogen_dioxide)
            scores_by_city[name].append(score)
        except (ValueError, KeyError):
            continue

    averages = {}
    for name in cities:
        scores = scores_by_city.get(name)
        if scores:
            averages[name] = sum(scores) / len(scores)

    return averages

# Eight cities compared on AQI alone
selected_cities_p3 = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris',
    'Tokyo', 'New York', 'London', 'Cairo',
]

# Mean simplified AQI per city
city_aqi = get_city_aqi_averages(air_quality_data, selected_cities_p3)

print("Presenter 3: City Average AQI Values")
print("=" * 40)
ranked_cities = sorted(city_aqi.items(), key=lambda pair: pair[1])
for city, aqi in ranked_cities:
    print(f"{city:15}: AQI = {aqi:6.2f}")

# 1D clustering input: city_names_p3[i] labels city_aqi_values[i]
city_names_p3 = list(city_aqi)
city_aqi_values = [city_aqi[name] for name in city_names_p3]

print(f"\nClustering data: {len(city_aqi_values)} cities with AQI values")

In [None]:
# Perform K-Means clustering with K=3 for city AQI
# (1-D input: each data point is a single average AQI value)
kmeans_p3 = ManualKMeans(k=3, random_state=42)
kmeans_p3.fit(city_aqi_values)

print("\nPresenter 3: K-Means Clustering Results (K=3)")
print("=" * 50)

# Group cities by cluster: cluster id -> list of (city, aqi) pairs.
# Only clusters that received at least one city get a key, which is why the
# code below checks `cluster_id in clusters_dict` before reading it.
clusters_dict = defaultdict(list)
for city, cluster_id, aqi in zip(city_names_p3, kmeans_p3.cluster_assignments, city_aqi_values):
    clusters_dict[cluster_id].append((city, aqi))

# Create comprehensive visualization (2x2 grid; the axes are filled in by
# the plotting statements that follow this block)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Define pollution levels based on centroid values
# NOTE: pollution_levels is reset to an empty list here and is not read
# again in the visible part of this cell.
pollution_levels = []
# (cluster id, centroid AQI) pairs, ordered from lowest to highest centroid
centroid_aqi_pairs = [(i, kmeans_p3.centroids[i]) for i in range(len(kmeans_p3.centroids))]
centroid_aqi_pairs.sort(key=lambda x: x[1])  # Sort by AQI value

level_names = ['Low Pollution', 'Medium Pollution', 'High Pollution']
colors = ['#2ECC71', '#F39C12', '#E74C3C']  # Green, Orange, Red

# Map clusters to pollution levels: the rank of a cluster's centroid (not the
# cluster id) selects its level name and colour.
cluster_to_level = {}
for rank, (cluster_id, aqi) in enumerate(centroid_aqi_pairs):
    if rank < len(level_names):
        cluster_to_level[cluster_id] = {'level': level_names[rank], 'color': colors[rank]}
    else:
        # More centroids than level names: fall back to the highest level
        cluster_to_level[cluster_id] = {'level': 'High Pollution', 'color': colors[-1]}

# Display each cluster
print("Final Centroids (AQI levels):")
for cluster_id in range(len(kmeans_p3.centroids)):
    centroid_aqi = kmeans_p3.centroids[cluster_id]
    level = cluster_to_level[cluster_id]['level']
    print(f"Cluster {cluster_id+1} ({level}): AQI = {centroid_aqi:6.2f}")

print("\nCity Cluster Assignments (Textual Output):")
print("-" * 55)

for cluster_id in range(len(kmeans_p3.centroids)):
    # Skip clusters that received no city
    if cluster_id in clusters_dict:
        centroid_aqi = kmeans_p3.centroids[cluster_id]
        level = cluster_to_level[cluster_id]['level']

        print(f"\n🏙️  CLUSTER {cluster_id+1} ({level}):")
        print(f"   Centroid AQI: {centroid_aqi:.2f}")
        print("   Cities in this cluster:")

        # Cities listed from lowest to highest AQI
        for city, aqi in sorted(clusters_dict[cluster_id], key=lambda x: x[1]):
            print(f"   • {city:15}: AQI = {aqi:6.2f}")

# Plot 1: 1D scatter plot along AQI axis
# x = city AQI, y = cluster id plus a small random jitter so that cities with
# similar AQI inside one cluster do not overlap.
y_offset = 0.15
for cluster_id in range(len(kmeans_p3.centroids)):
    if cluster_id in clusters_dict:
        cities, aqis = zip(*clusters_dict[cluster_id])
        y_positions = [cluster_id + random.uniform(-y_offset, y_offset) for _ in aqis]
        color = cluster_to_level[cluster_id]['color']

        ax1.scatter(aqis, y_positions, c=color, s=150, alpha=0.8, edgecolors='black',
                   label=f'Cluster {cluster_id+1}: {cluster_to_level[cluster_id]["level"]}')

        # Add city labels
        for city, aqi, y in zip(cities, aqis, y_positions):
            ax1.annotate(city, (aqi, y), xytext=(5, 0), textcoords='offset points',
                        fontsize=9, fontweight='bold', va='center')

# Plot centroids: a dashed vertical line at the centroid AQI plus an 'X'
# marker on the cluster's own row
for cluster_id in range(len(kmeans_p3.centroids)):
    centroid = kmeans_p3.centroids[cluster_id]
    color = cluster_to_level[cluster_id]['color']
    ax1.axvline(x=centroid, color=color, linestyle='--', linewidth=3, alpha=0.7)
    ax1.scatter(centroid, cluster_id, c='black', marker='X', s=300,
                edgecolors=color, linewidth=3, zorder=5)

ax1.set_xlabel('Average AQI', fontsize=12, fontweight='bold')
ax1.set_ylabel('Cluster', fontsize=12, fontweight='bold')
ax1.set_title('Presenter 3: City AQI Clustering (K=3)', fontsize=14, fontweight='bold')
ax1.set_yticks(range(len(kmeans_p3.centroids)))
ax1.set_yticklabels([f'Cluster {i+1}' for i in range(len(kmeans_p3.centroids))])
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Plot 2: Bar chart of AQI values by city
# Cities are ordered by ascending AQI; each bar takes the colour of the
# pollution level of the city's cluster.
cities_sorted = sorted(zip(city_names_p3, city_aqi_values, kmeans_p3.cluster_assignments), key=lambda x: x[1])
cities, aqis_sorted, clusters = zip(*cities_sorted)

bar_colors = [cluster_to_level[cluster]['color'] for cluster in clusters]
bars = ax2.bar(range(len(cities)), aqis_sorted, color=bar_colors, alpha=0.8, edgecolor='black')
ax2.set_xlabel('Cities', fontsize=12, fontweight='bold')
ax2.set_ylabel('Average AQI', fontsize=12, fontweight='bold')
ax2.set_title('AQI Values by City (Colored by Cluster)', fontsize=14, fontweight='bold')
ax2.set_xticks(range(len(cities)))
ax2.set_xticklabels(cities, rotation=45, ha='right')
ax2.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, aqi in zip(bars, aqis_sorted):
    ax2.annotate(f'{aqi:.1f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontweight='bold', fontsize=10)

# Plot 3: Pie chart showing cluster distribution
# Clusters without any city are left out of the pie.
cluster_counts = [len(clusters_dict[i]) if i in clusters_dict else 0 for i in range(len(kmeans_p3.centroids))]
active_clusters = [(i, count) for i, count in enumerate(cluster_counts) if count > 0]

if active_clusters:
    cluster_indices, counts = zip(*active_clusters)
    pie_colors = [cluster_to_level[i]['color'] for i in cluster_indices]
    pie_labels = [f'Cluster {i+1}\n({cluster_to_level[i]["level"]})\n{count} cities'
                 for i, count in active_clusters]

    wedges, texts, autotexts = ax3.pie(counts, labels=pie_labels, colors=pie_colors,
                                      autopct='%1.1f%%', startangle=90, textprops={'fontweight': 'bold'})
ax3.set_title('Distribution of Cities by Pollution Level', fontsize=14, fontweight='bold')

# Plot 4: Box plot style visualization
# One box per non-empty cluster; the three lists below are kept parallel.
cluster_data_for_box = []
cluster_labels_for_box = []
cluster_colors_for_box = []

for cluster_id in range(len(kmeans_p3.centroids)):
    if cluster_id in clusters_dict:
        aqis = [aqi for _, aqi in clusters_dict[cluster_id]]
        cluster_data_for_box.append(aqis)
        cluster_labels_for_box.append(f'Cluster {cluster_id+1}\n({cluster_to_level[cluster_id]["level"]})')
        cluster_colors_for_box.append(cluster_to_level[cluster_id]['color'])

if cluster_data_for_box:
    # NOTE(review): newer matplotlib releases rename boxplot's `labels`
    # argument - confirm against the installed version.
    bp = ax4.boxplot(cluster_data_for_box, labels=cluster_labels_for_box, patch_artist=True)
    # patch_artist=True makes the boxes fillable so they can take the cluster colour
    for patch, color in zip(bp['boxes'], cluster_colors_for_box):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

ax4.set_ylabel('AQI Values', fontsize=12, fontweight='bold')
ax4.set_title('AQI Distribution by Cluster', fontsize=14, fontweight='bold')
ax4.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\n📊 Conclusion from Clustering:")
print("Clusters effectively group cities by similar pollution levels, enabling identification")
print("of local pollution discrepancies and facilitating targeted urban pollution management.")

# Per-cluster size, AQI range and mean; clusters without cities are skipped
print("\nStatistical Summary:")
for cluster_id, _ in enumerate(kmeans_p3.centroids):
    if cluster_id not in clusters_dict:
        continue
    cities_in_cluster = clusters_dict[cluster_id]
    aqis = [aqi for _, aqi in cities_in_cluster]
    level = cluster_to_level[cluster_id]['level']
    member_count = len(cities_in_cluster)
    lowest, highest = min(aqis), max(aqis)
    mean_aqi = sum(aqis) / len(aqis)
    print(f"  {level} Cluster: {member_count} cities, "
          f"AQI range: {lowest:.1f}-{highest:.1f}, "
          f"Average: {mean_aqi:.1f}")

## Section 5: Presenter 4 - Two-Pollutant City Clustering (PM2.5 and NO2)

**Topic**: Multi-feature clustering for 8 cities using PM2.5 and NO2  
**Dataset Focus**: Average PM2.5 and NO2 for 8 cities in the dataset  
**Method**: Manual K-Means with K=3 clusters, showing centroid movement iterations

In [None]:
# Presenter 4: Two-Pollutant City Clustering (PM2.5 and NO2)

# Eight cities used for the PM2.5 / NO2 comparison
selected_cities_p4 = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris',
    'Tokyo', 'New York', 'London', 'Cairo',
]

# Per-city mean of PM2.5 and NO2
pollutants_p4 = ['PM2.5', 'NO2']
city_averages_p4 = get_city_averages(air_quality_data, selected_cities_p4, pollutants_p4)

print("Presenter 4: City averages for PM2.5 and NO2")
print("=" * 50)
for city, values in city_averages_p4.items():
    pm25_avg, no2_avg = values[0], values[1]
    print(f"{city:15}: PM2.5={pm25_avg:6.2f}, NO2={no2_avg:6.2f}")

# Parallel lists: city_names_p4[i] is the label of clustering_data_p4[i]
city_names_p4 = list(city_averages_p4)
clustering_data_p4 = [city_averages_p4[name] for name in city_names_p4]

print(f"\nClustering data shape: {len(clustering_data_p4)} cities × {len(pollutants_p4)} features")

In [None]:
# Enhanced K-Means with iteration tracking
class IterativeKMeans(ManualKMeans):
    """ManualKMeans that records centroids and assignments at every iteration.

    Attributes set by fit_with_iterations:
        centroids: centroids in use when the loop stopped.
        iteration_history: list of centroid lists; entry 0 holds the initial
            centroids and each later entry the centroids adopted after a pass.
        assignment_history: the assignments returned by assign_clusters on
            each pass, in order.
        final_clusters, cluster_assignments: result of one last
            assign_clusters call against the final centroids.
    """

    @staticmethod
    def _centroid_moved(old, new, tolerance=1e-6):
        """Return True if any coordinate of a centroid changed by more than tolerance.

        Scalar (1D) centroids are compared directly; sequences are compared
        coordinate by coordinate.
        """
        if isinstance(old, (int, float)) and isinstance(new, (int, float)):
            return abs(old - new) > tolerance
        return any(abs(o - n) > tolerance for o, n in zip(old, new))

    def fit_with_iterations(self, data):
        """Run K-means on data, recording every iteration for visualization.

        Prints the initial centroids, the centroids after each pass, and a
        message when convergence is reached. Stops after self.max_iters passes
        at the latest.

        Returns:
            self, so calls can be chained.
        """
        self.centroids = self.initialize_centroids(data)
        self.iteration_history = [self.centroids.copy()]
        self.assignment_history = []

        print(f"Initial centroids: {self.centroids}")

        for iteration in range(self.max_iters):
            clusters, assignments = self.assign_clusters(data, self.centroids)
            self.assignment_history.append(assignments.copy())
            candidates = list(self.update_centroids(clusters))

            if len(candidates) == len(self.centroids):
                # NOTE(review): assumes update_centroids yields None for a cluster
                # that received no points -- confirm against ManualKMeans.
                # Keeping the previous position for such a cluster preserves k and
                # keeps centroid index i meaning the same cluster in every
                # history entry, which the plots below rely on.
                new_centroids = [
                    previous if candidate is None else candidate
                    for previous, candidate in zip(self.centroids, candidates)
                ]
            else:
                # Lengths disagree, so positions cannot be matched up; fall back
                # to simply discarding the empty-cluster placeholders.
                new_centroids = [c for c in candidates if c is not None]

            print(f"Iteration {iteration + 1} centroids: {new_centroids}")

            # Converged when the centroid count is unchanged and none of them moved
            converged = len(new_centroids) == len(self.centroids) and not any(
                self._centroid_moved(old, new)
                for old, new in zip(self.centroids, new_centroids)
            )
            if converged:
                print(f"Converged after {iteration + 1} iterations")
                break

            self.centroids = new_centroids
            self.iteration_history.append(self.centroids.copy())

        self.final_clusters, self.cluster_assignments = self.assign_clusters(data, self.centroids)
        return self

# Fit K=3 clusters on the two-pollutant data, recording every iteration
kmeans_p4 = IterativeKMeans(k=3, random_state=42)
kmeans_p4.fit_with_iterations(clustering_data_p4)

print("\nPresenter 4: K-Means Clustering Results (K=3)")
print("=" * 50)

# Final centroid of each cluster (clusters are numbered from 1 in the output)
print("Final Centroids:")
for cluster_number, center in enumerate(kmeans_p4.centroids, start=1):
    print(f"Cluster {cluster_number}: PM2.5={center[0]:6.2f}, NO2={center[1]:6.2f}")

# Which cluster each city ended up in, alongside its own pollutant values
print("\nCluster Assignments:")
for city_name, assigned in zip(city_names_p4, kmeans_p4.cluster_assignments):
    pm25_value = city_averages_p4[city_name][0]
    no2_value = city_averages_p4[city_name][1]
    print(f"{city_name:15}: Cluster {assigned+1} (PM2.5={pm25_value:6.2f}, NO2={no2_value:6.2f})")

# Create comprehensive iteration visualization
# At most the first four recorded iterations are drawn, one per panel of a 2x2 grid.
iterations_to_show = min(4, len(kmeans_p4.iteration_history))
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

for iter_idx in range(iterations_to_show):
    ax = axes[iter_idx]

    # Centroids in effect during this iteration
    current_centroids = kmeans_p4.iteration_history[iter_idx]

    # Assignments recorded for this iteration; if none were recorded,
    # fall back to placing every city in cluster 0
    if iter_idx < len(kmeans_p4.assignment_history):
        current_assignments = kmeans_p4.assignment_history[iter_idx]
    else:
        current_assignments = [0] * len(clustering_data_p4)

    # Plot data points colored by cluster
    for i in range(len(current_centroids)):
        cluster_cities = [city for city, cluster in zip(city_names_p4, current_assignments) if cluster == i]
        cluster_data = [city_averages_p4[city] for city in cluster_cities]

        if cluster_data:
            pm25_values = [point[0] for point in cluster_data]
            no2_values = [point[1] for point in cluster_data]
            ax.scatter(pm25_values, no2_values, c=colors[i], label=f'Cluster {i+1}',
                       s=120, alpha=0.8, edgecolors='black', linewidth=1)

            # Add city labels
            for city, pm25, no2 in zip(cluster_cities, pm25_values, no2_values):
                ax.annotate(city, (pm25, no2), xytext=(3, 3), textcoords='offset points',
                            fontsize=8, fontweight='bold')

    # Plot centroids as black X markers outlined in their cluster color
    for i, centroid in enumerate(current_centroids):
        ax.scatter(centroid[0], centroid[1], c='black', marker='X', s=250,
                   edgecolors=colors[i], linewidth=3, zorder=5)

        # Label each centroid C1, C2, ... just below its marker
        ax.annotate(f'C{i+1}', (centroid[0], centroid[1]), xytext=(0, -15),
                    textcoords='offset points', ha='center', fontweight='bold',
                    color='black', fontsize=10)

    ax.set_xlabel('PM2.5 (μg/m³)', fontweight='bold')
    ax.set_ylabel('NO2 (μg/m³)', fontweight='bold')
    ax.set_title(f'Iteration {iter_idx + 1}', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend(fontsize=9)

# When fewer than four iterations were recorded, hide the leftover panels
# instead of showing empty framed axes
for unused_ax in axes[iterations_to_show:]:
    unused_ax.set_visible(False)

plt.suptitle('Presenter 4: K-Means Iterations - Centroid Movement (PM2.5 vs NO2)',
             fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Create a separate plot showing centroid movement paths
fig, ax = plt.subplots(1, 1, figsize=(12, 8))

# Plot all data points as a neutral gray backdrop, labelled by city
for city, values in city_averages_p4.items():
    ax.scatter(values[0], values[1], c='lightgray', s=100, alpha=0.6, edgecolors='black')
    ax.annotate(city, (values[0], values[1]), xytext=(5, 5), textcoords='offset points',
                fontsize=10, fontweight='bold')

# Plot centroid paths: one line per initial centroid, following it through the history
for centroid_idx in range(len(kmeans_p4.iteration_history[0])):
    # A snapshot can hold fewer centroids than the first one (fitting discards
    # centroids of empty clusters), so skip snapshots that lack this index
    # rather than raising IndexError.
    # NOTE(review): after such a drop the remaining indices may no longer refer
    # to the same centroid, so the drawn path is then only approximate.
    centroid_path = [
        snapshot[centroid_idx]
        for snapshot in kmeans_p4.iteration_history
        if centroid_idx < len(snapshot)
    ]
    centroid_path_x = [position[0] for position in centroid_path]
    centroid_path_y = [position[1] for position in centroid_path]

    # Cycle through the palette so more centroids than colors cannot fail
    path_color = colors[centroid_idx % len(colors)]

    # Plot path
    ax.plot(centroid_path_x, centroid_path_y, color=path_color, linewidth=3,
            alpha=0.8, marker='o', markersize=8, label=f'Centroid {centroid_idx+1} Path')

    # Mark start (square) and end (X)
    ax.scatter(centroid_path_x[0], centroid_path_y[0], c=path_color,
               marker='s', s=150, edgecolors='black', linewidth=2, label=f'Start C{centroid_idx+1}')
    ax.scatter(centroid_path_x[-1], centroid_path_y[-1], c=path_color,
               marker='X', s=200, edgecolors='black', linewidth=3, label=f'Final C{centroid_idx+1}')

ax.set_xlabel('PM2.5 (μg/m³)', fontsize=12, fontweight='bold')
ax.set_ylabel('NO2 (μg/m³)', fontsize=12, fontweight='bold')
ax.set_title('Centroid Movement During K-Means Convergence', fontsize=14, fontweight='bold')
# Legend sits outside the axes so it does not cover the data
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Written conclusion for the Presenter 4 plots
for conclusion_line in (
    "\n📊 Conclusion from Clustered Graph:",
    "Clusters capture pollutant-compositional diversity among cities. Iterative centroid",
    "updates provide insight into how clustering converges on meaningful groupings",
    "in multi-dimensional space.",
):
    print(conclusion_line)

# Summary of the recorded centroid history (entry 0 is the initial centroids)
history_p4 = kmeans_p4.iteration_history
print("\nIteration Analysis:")
print(f"  Total iterations until convergence: {len(history_p4)}")
print(f"  Initial centroids: {history_p4[0]}")
print(f"  Final centroids: {history_p4[-1]}")

# Per-cluster member list and mean pollutant values; clusters with no cities are omitted
cluster_stats_p4 = {}
for cluster_index, _center in enumerate(kmeans_p4.centroids):
    member_names = [
        name
        for name, assigned in zip(city_names_p4, kmeans_p4.cluster_assignments)
        if assigned == cluster_index
    ]
    if not member_names:
        continue

    member_count = len(member_names)
    pm25_sum = sum(city_averages_p4[name][0] for name in member_names)
    no2_sum = sum(city_averages_p4[name][1] for name in member_names)
    cluster_stats_p4[f'Cluster {cluster_index+1}'] = {
        'cities': member_names,
        'avg_pm25': pm25_sum / member_count,
        'avg_no2': no2_sum / member_count,
        'count': member_count,
    }

print("\nFinal Cluster Statistics:")
for cluster_name in cluster_stats_p4:
    summary = cluster_stats_p4[cluster_name]
    print(f"  {cluster_name}: {summary['count']} cities")
    print(f"    Average PM2.5: {summary['avg_pm25']:.2f} μg/m³")
    print(f"    Average NO2: {summary['avg_no2']:.2f} μg/m³")
    print(f"    Cities: {', '.join(summary['cities'])}")
    print()

## Section 6: Presenter 5 - Multi-Feature Three-Pollutant Clustering

**Topic**: Clustering of 7 cities using PM2.5, PM10, and NO2  
**Dataset Focus**: Multi-pollutant averages from 7 cities in the dataset  
**Method**: Manual K-Means with K=2 clusters over three features using multi-dimensional Euclidean distance

In [None]:
# Presenter 5: Multi-Feature Three-Pollutant Clustering

# The seven cities compared in this section
selected_cities_p5 = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris',
    'Tokyo', 'New York', 'London',
]

# Per-city values for the three pollutants, in the order listed in pollutants_p5
pollutants_p5 = ['PM2.5', 'PM10', 'NO2']
city_averages_p5 = get_city_averages(air_quality_data, selected_cities_p5, pollutants_p5)

print("Presenter 5: City averages for PM2.5, PM10, and NO2")
print("=" * 60)
for city_name in city_averages_p5:
    profile = city_averages_p5[city_name]
    print(f"{city_name:15}: PM2.5={profile[0]:6.2f}, PM10={profile[1]:6.2f}, NO2={profile[2]:6.2f}")

# Parallel lists: city_names_p5[i] is the city whose features are clustering_data_p5[i]
city_names_p5 = list(city_averages_p5)
clustering_data_p5 = [city_averages_p5[city_name] for city_name in city_names_p5]

print(f"\nClustering data shape: {len(clustering_data_p5)} cities × {len(pollutants_p5)} features")

# Fit K=2 clusters over the three features
kmeans_p5 = ManualKMeans(k=2, random_state=42)
kmeans_p5.fit(clustering_data_p5)

print("\nPresenter 5: K-Means Clustering Results (K=2)")
print("=" * 50)

# Final centroid of each cluster (clusters are numbered from 1 in the output)
print("Final Centroids (Multi-Pollutant Profiles):")
for cluster_number, center in enumerate(kmeans_p5.centroids, start=1):
    print(f"Cluster {cluster_number}: PM2.5={center[0]:6.2f}, PM10={center[1]:6.2f}, NO2={center[2]:6.2f}")

# cluster id -> list of (city, values) pairs for the cities assigned to it
clusters_dict_p5 = defaultdict(list)
for city_name, assigned in zip(city_names_p5, kmeans_p5.cluster_assignments):
    clusters_dict_p5[assigned].append((city_name, city_averages_p5[city_name]))

# One large figure holding a 3x3 grid of panels for Presenter 5
fig = plt.figure(figsize=(20, 16))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Top row: three panels, one per pairwise 2D projection of the three features
ax1, ax2, ax3 = (fig.add_subplot(gs[0, column]) for column in range(3))

# Index 0 = cleaner group, index 1 = more polluted group
colors = ['#2ECC71', '#E74C3C']  # Green for cleaner, Red for more polluted
cluster_labels = ['Cleaner Cities', 'More Polluted Cities']

# Mean per-city total (sum of all pollutant values) for every non-empty cluster
cluster_totals = {}
for cluster_id, _center in enumerate(kmeans_p5.centroids):
    if cluster_id not in clusters_dict_p5:
        continue
    members = clusters_dict_p5[cluster_id]
    cluster_totals[cluster_id] = sum(sum(profile) for _name, profile in members) / len(members)

# Rank clusters from lowest to highest total; the rank indexes colors / cluster_labels
sorted_clusters = sorted(cluster_totals.items(), key=lambda item: item[1])
cluster_mapping = {}
for rank, (original_id, _total) in enumerate(sorted_clusters):
    cluster_mapping[original_id] = rank

# Pairwise 2D projections of the three-feature data: PM2.5 vs PM10, PM2.5 vs NO2, PM10 vs NO2.
# Each entry is (axes, index of the x feature, index of the y feature).
feature_names = ['PM2.5', 'PM10', 'NO2']
projection_panels = [
    (ax1, 0, 1),
    (ax2, 0, 2),
    (ax3, 1, 2),
]

for panel_ax, x_idx, y_idx in projection_panels:
    # Cities, colored and labelled by their cleaner / more-polluted rank
    for cluster_id in range(len(kmeans_p5.centroids)):
        if cluster_id not in clusters_dict_p5:
            continue
        cities, values_list = zip(*clusters_dict_p5[cluster_id])
        x_vals = [vals[x_idx] for vals in values_list]
        y_vals = [vals[y_idx] for vals in values_list]

        mapped_id = cluster_mapping[cluster_id]
        panel_ax.scatter(x_vals, y_vals, c=colors[mapped_id], s=120, alpha=0.8,
                         edgecolors='black', linewidth=1, label=cluster_labels[mapped_id])

        for city, x_val, y_val in zip(cities, x_vals, y_vals):
            panel_ax.annotate(city, (x_val, y_val), xytext=(3, 3), textcoords='offset points',
                              fontsize=9, fontweight='bold')

    # Centroids as black X markers outlined in their cluster color
    for cluster_id, centroid in enumerate(kmeans_p5.centroids):
        # cluster_mapping only covers clusters that have members; a centroid
        # without members has no rank, so skip it instead of raising KeyError
        mapped_id = cluster_mapping.get(cluster_id)
        if mapped_id is None:
            continue
        panel_ax.scatter(centroid[x_idx], centroid[y_idx], c='black', marker='X', s=200,
                         edgecolors=colors[mapped_id], linewidth=3, zorder=5)

    panel_ax.set_xlabel(f'{feature_names[x_idx]} (μg/m³)', fontweight='bold')
    panel_ax.set_ylabel(f'{feature_names[y_idx]} (μg/m³)', fontweight='bold')
    panel_ax.set_title(f'{feature_names[x_idx]} vs {feature_names[y_idx]}', fontweight='bold')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

# Plot 4: Radar chart for cluster profiles
# The axes must use the polar projection; on ordinary Cartesian axes the
# angle/value pairs below would render as a plain line plot, not a radar chart.
ax4 = fig.add_subplot(gs[1, 0], projection='polar')
categories = ['PM2.5', 'PM10', 'NO2']
N = len(categories)

# One evenly spaced spoke per pollutant
angles = [n / float(N) * 2 * math.pi for n in range(N)]
angles += angles[:1]  # Complete the circle

for cluster_id, centroid in enumerate(kmeans_p5.centroids):
    # cluster_mapping only covers clusters that have members; skip a centroid
    # without members instead of raising KeyError
    mapped_id = cluster_mapping.get(cluster_id)
    if mapped_id is None:
        continue

    values = list(centroid) + [centroid[0]]  # Complete the circle

    ax4.plot(angles, values, 'o-', linewidth=2, label=f'{cluster_labels[mapped_id]}',
             color=colors[mapped_id])
    ax4.fill(angles, values, alpha=0.25, color=colors[mapped_id])

# Label each spoke with its pollutant (the duplicated closing angle is excluded)
ax4.set_xticks(angles[:-1])
ax4.set_xticklabels(categories, fontweight='bold')
ax4.set_title('Cluster Centroids Profile', fontweight='bold')
ax4.legend()
ax4.grid(True)

# Plot 5: Total pollution comparison
ax5 = fig.add_subplot(gs[1, 1])

# Per-city total = sum of that city's pollutant values
total_pollution_by_city = {name: sum(profile) for name, profile in city_averages_p5.items()}

# Order the bars from the lowest total to the highest
sorted_cities = sorted(total_pollution_by_city.items(), key=lambda pair: pair[1])
cities_sorted, totals_sorted = zip(*sorted_cities)

# Each bar takes the color of the cluster its city was assigned to
assignment_by_city = dict(zip(city_names_p5, kmeans_p5.cluster_assignments))
bar_colors = [colors[cluster_mapping[assignment_by_city[name]]] for name in cities_sorted]

bar_positions = range(len(cities_sorted))
bars = ax5.bar(bar_positions, totals_sorted, color=bar_colors, alpha=0.8, edgecolor='black')
ax5.set_xlabel('Cities', fontweight='bold')
ax5.set_ylabel('Total Pollution (μg/m³)', fontweight='bold')
ax5.set_title('Total Pollution by City', fontweight='bold')
ax5.set_xticks(bar_positions)
ax5.set_xticklabels(cities_sorted, rotation=45, ha='right')
ax5.grid(True, alpha=0.3, axis='y')

# Print the rounded total just above each bar
for bar, total in zip(bars, totals_sorted):
    bar_center_x = bar.get_x() + bar.get_width() / 2
    ax5.annotate(f'{total:.0f}',
                 xy=(bar_center_x, bar.get_height()),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontweight='bold')

# Plot 6: Cluster comparison heatmap
ax6 = fig.add_subplot(gs[1, 2])

# One heatmap row per cluster that has members: its centroid and its display label
populated_ids = [cid for cid in range(len(kmeans_p5.centroids)) if cid in clusters_dict_p5]
cluster_data = [kmeans_p5.centroids[cid] for cid in populated_ids]
cluster_names = [cluster_labels[cluster_mapping[cid]] for cid in populated_ids]

if cluster_data:
    cluster_array = np.array(cluster_data)
    # Reversed red-yellow-green colormap
    im = ax6.imshow(cluster_array, cmap='RdYlGn_r', aspect='auto')

    column_positions = range(len(pollutants_p5))
    row_positions = range(len(cluster_names))
    ax6.set_xticks(column_positions)
    ax6.set_xticklabels(pollutants_p5, fontweight='bold')
    ax6.set_yticks(row_positions)
    ax6.set_yticklabels(cluster_names, fontweight='bold')
    ax6.set_title('Cluster Centroid Heatmap', fontweight='bold')

    # Write each cell's value on top of its color
    for row_idx in row_positions:
        for col_idx in column_positions:
            ax6.text(col_idx, row_idx, f'{cluster_array[row_idx, col_idx]:.1f}',
                     ha="center", va="center", color="black", fontweight='bold')

    plt.colorbar(im, ax=ax6, label='Concentration (μg/m³)')

# Plot 7-9: Individual pollutant distributions (bottom row, one panel per pollutant)
pollutant_axes = [fig.add_subplot(gs[2, column]) for column in range(3)]

for p_idx, (pollutant, panel) in enumerate(zip(pollutants_p5, pollutant_axes)):
    for cluster_id, _center in enumerate(kmeans_p5.centroids):
        if cluster_id not in clusters_dict_p5:
            continue

        members = clusters_dict_p5[cluster_id]
        member_names = [name for name, _profile in members]
        readings = [profile[p_idx] for _name, profile in members]
        mapped_id = cluster_mapping[cluster_id]

        # All cities of a cluster share one x position (the cluster's rank)
        panel.scatter([mapped_id] * len(readings), readings,
                      c=colors[mapped_id], s=100, alpha=0.8, edgecolors='black')

        # City name to the right of each point
        for name, reading in zip(member_names, readings):
            panel.annotate(name, (mapped_id, reading), xytext=(5, 0), textcoords='offset points',
                           fontsize=8, fontweight='bold', va='center')

    panel.set_xlabel('Cluster', fontweight='bold')
    panel.set_ylabel(f'{pollutant} (μg/m³)', fontweight='bold')
    panel.set_title(f'{pollutant} Distribution by Cluster', fontweight='bold')
    panel.set_xticks(range(len(cluster_labels)))
    panel.set_xticklabels(cluster_labels, rotation=45, ha='right')
    panel.grid(True, alpha=0.3)

plt.suptitle('Presenter 5: Multi-Feature Three-Pollutant Clustering Analysis',
             fontsize=18, fontweight='bold')
plt.show()

# Text report: members of each cluster, then per-cluster averages
print("\nCluster Results - Clear Groups Based on Combined Pollutant Profiles:")
print("-" * 70)

for cluster_id, center in enumerate(kmeans_p5.centroids):
    if cluster_id not in clusters_dict_p5:
        continue

    group_label = cluster_labels[cluster_mapping[cluster_id]]
    print(f"\n🌍 {group_label.upper()} (Cluster {cluster_id+1}):")
    print(f"   Centroid Profile: PM2.5={center[0]:.2f}, PM10={center[1]:.2f}, NO2={center[2]:.2f}")
    print("   Cities in this cluster:")

    # List member cities from the lowest combined total to the highest
    ordered_members = sorted(clusters_dict_p5[cluster_id], key=lambda entry: sum(entry[1]))
    for name, profile in ordered_members:
        combined = sum(profile)
        print(f"   • {name:15}: PM2.5={profile[0]:6.2f}, PM10={profile[1]:6.2f}, NO2={profile[2]:6.2f} (Total: {combined:6.2f})")

# Mean of each pollutant over the cities in a cluster
print("\n📊 Cluster Statistics:")
for cluster_id, _center in enumerate(kmeans_p5.centroids):
    if cluster_id not in clusters_dict_p5:
        continue
    members = clusters_dict_p5[cluster_id]
    if not members:
        continue

    group_label = cluster_labels[cluster_mapping[cluster_id]]
    member_count = len(members)
    mean_pm25 = sum(profile[0] for _name, profile in members) / member_count
    mean_pm10 = sum(profile[1] for _name, profile in members) / member_count
    mean_no2 = sum(profile[2] for _name, profile in members) / member_count
    mean_total = mean_pm25 + mean_pm10 + mean_no2

    print(f"   {group_label}:")
    print(f"     Average PM2.5: {mean_pm25:.2f} μg/m³")
    print(f"     Average PM10:  {mean_pm10:.2f} μg/m³")
    print(f"     Average NO2:   {mean_no2:.2f} μg/m³")
    print(f"     Average Total: {mean_total:.2f} μg/m³")
    print(f"     Total cities:  {member_count}")

for conclusion_line in (
    "\n📊 Conclusion from Multi-Pollutant Clustering:",
    "Multi-pollutant analysis yields comprehensive pollution profiles, better capturing",
    "complexity of air quality. Clustering distinguishes cleaner vs. more polluted groups",
    "for multilayered health and environmental assessments.",
):
    print(conclusion_line)

## Section 7: Results Comparison and Analysis

Let's compare all the clustering results and analyze the effectiveness of different feature combinations.

In [None]:
# Summary and Overall Conclusions with Final Visualizations

print("="*80)
print("COMPREHENSIVE ANALYSIS SUMMARY")
print("="*80)

# Create a comprehensive summary visualization on a 2x2 grid.
# The axes are created one by one because the fourth panel is a radar chart
# and therefore needs the polar projection.
fig = plt.figure(figsize=(16, 12))
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
ax4 = fig.add_subplot(2, 2, 4, projection='polar')

# Plot 1: Feature progression comparison
presenters = ['P1: 2D\n(PM2.5,PM10)', 'P2: 1D\n(AQI)', 'P3: 1D\n(AQI)', 'P4: 2D\n(PM2.5,NO2)', 'P5: 3D\n(PM2.5,PM10,NO2)']
dimensions = [2, 1, 1, 2, 3]
k_values = [2, 3, 3, 3, 2]
colors_prog = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']

# Create bubble chart. Presenters that share the same (dimensions, K) point
# would have their labels drawn on top of each other, so each further label at
# an already used point is stacked above the previous one.
labels_at_point = {}
for presenter, dim, k, color in zip(presenters, dimensions, k_values, colors_prog):
    ax1.scatter(dim, k, s=200, c=color, alpha=0.8, edgecolors='black', linewidth=2)
    stack_level = labels_at_point.get((dim, k), 0)
    labels_at_point[(dim, k)] = stack_level + 1
    ax1.annotate(presenter, (dim, k), xytext=(0, 20 + 32 * stack_level), textcoords='offset points',
                 ha='center', fontweight='bold', fontsize=10)

ax1.set_xlabel('Number of Features (Dimensions)', fontweight='bold', fontsize=12)
ax1.set_ylabel('Number of Clusters (K)', fontweight='bold', fontsize=12)
ax1.set_title('Analysis Progression: Features vs Clusters', fontweight='bold', fontsize=14)
ax1.grid(True, alpha=0.3)
ax1.set_xticks([1, 2, 3])
ax1.set_yticks([1, 2, 3, 4])

# Plot 2: Complexity and insight progression
complexity_scores = [2, 3, 3, 4, 5]  # Subjective complexity scores
insight_scores = [3, 4, 3, 4, 5]     # Subjective insight depth scores

ax2.plot(range(1, 6), complexity_scores, 'o-', linewidth=3, markersize=10,
         color='#E74C3C', label='Analysis Complexity')
ax2.plot(range(1, 6), insight_scores, 's-', linewidth=3, markersize=10,
         color='#2ECC71', label='Insight Depth')

ax2.set_xlabel('Presenter Number', fontweight='bold', fontsize=12)
ax2.set_ylabel('Score (1-5)', fontweight='bold', fontsize=12)
ax2.set_title('Complexity vs Insight Progression', fontweight='bold', fontsize=14)
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)
ax2.set_xticks(range(1, 6))
ax2.set_xticklabels(['P1', 'P2', 'P3', 'P4', 'P5'])

# Plot 3: Application focus areas
applications = ['Health Risk\nAssessment', 'Resource\nAllocation', 'Policy\nDevelopment',
                'Environmental\nMonitoring', 'Urban\nPlanning']
relevance_scores = [5, 4, 5, 4, 3]  # How relevant each application is

bars = ax3.barh(applications, relevance_scores, color=colors_prog, alpha=0.8, edgecolor='black')
ax3.set_xlabel('Relevance Score (1-5)', fontweight='bold', fontsize=12)
ax3.set_title('Practical Applications Relevance', fontweight='bold', fontsize=14)
ax3.grid(True, alpha=0.3, axis='x')

# Add value labels at the end of each bar
for bar, score in zip(bars, relevance_scores):
    ax3.annotate(f'{score}', xy=(score, bar.get_y() + bar.get_height()/2),
                 xytext=(5, 0), textcoords='offset points',
                 va='center', fontweight='bold')

# Plot 4: Future scope radar chart (drawn on the polar axes created above)
categories = ['Temporal\nAnalysis', 'Socio-Economic\nIntegration', 'Weather\nCorrelation',
              'Advanced\nClustering', 'Real-time\nMonitoring']
scores = [4, 3, 4, 5, 3]  # Importance scores for future scope

# One evenly spaced spoke per category; repeat the first point to close the polygon
N = len(categories)
angles = [n / float(N) * 2 * math.pi for n in range(N)]
angles += angles[:1]
scores += scores[:1]

ax4.plot(angles, scores, 'o-', linewidth=2, color='#9B59B6', markersize=8)
ax4.fill(angles, scores, alpha=0.25, color='#9B59B6')
ax4.set_xticks(angles[:-1])
ax4.set_xticklabels(categories, fontweight='bold')
ax4.set_title('Future Scope Priorities', fontweight='bold', fontsize=14)
ax4.set_ylim(0, 5)
ax4.grid(True)

plt.tight_layout()
plt.show()

def _print_banner(title):
    """Print a blank line, an 80-character rule, the title, and a second rule."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


def _print_bullets(items):
    """Print each item on its own line as an indented bullet."""
    for entry in items:
        print(f"   • {entry}")


# --- What each presenter contributed: (heading, description, method, key insight) ---
print("\n🎯 PRESENTER CONTRIBUTIONS:")
print("-" * 50)

contributions = [
    ("1️⃣ PRESENTER 1", "Simple 2D Clustering (PM2.5 & PM10)", "K=2 clusters", "Basic pollution distinction"),
    ("2️⃣ PRESENTER 2", "Country-Level AQI Clustering", "K=3 clusters", "National-level policy targeting"),
    ("3️⃣ PRESENTER 3", "City-Level AQI Clustering", "K=3 clusters", "Urban pollution management"),
    ("4️⃣ PRESENTER 4", "Two-Pollutant City Clustering", "K=3 with iterations", "Convergence visualization"),
    ("5️⃣ PRESENTER 5", "Multi-Feature Three-Pollutant", "K=2 clusters", "Comprehensive profiles"),
]

for heading, summary, approach, takeaway in contributions:
    print(f"\n{heading} - {summary}")
    print(f"   • Method: {approach}")
    print(f"   • Key Insight: {takeaway}")

# --- Table of dimensionality, features and K per presenter ---
_print_banner("📈 ANALYSIS PROGRESSION")

progression_data = [
    ("Presenter 1", "2D", "PM2.5, PM10", 2, "Visual separation"),
    ("Presenter 2", "1D", "AQI", 3, "Policy targeting"),
    ("Presenter 3", "1D", "AQI", 3, "Urban management"),
    ("Presenter 4", "2D", "PM2.5, NO2", 3, "Iterative convergence"),
    ("Presenter 5", "3D", "PM2.5, PM10, NO2", 2, "Comprehensive profiles"),
]

print(f"{'Presenter':<12} {'Dims':<4} {'Features':<18} {'K':<3} {'Key Insight'}")
print("-" * 70)
for row in progression_data:
    who, dims_label, feature_list, cluster_count, takeaway = row
    print(f"{who:<12} {dims_label:<4} {feature_list:<18} {cluster_count:<3} {takeaway}")

# --- Findings, applications and future work as bullet lists ---
_print_banner("🎯 OVERALL CONCLUSIONS")

print("\n✅ KEY FINDINGS:")
findings = [
    "Multi-pollutant K-Means clustering reveals meaningful air quality patterns",
    "Progression from simple to complex features enhances analytical depth",
    "Different scales (city vs country) provide complementary insights",
    "Manual implementation demonstrates algorithmic understanding",
    "Clustering supports evidence-based environmental decision making",
]
_print_bullets(findings)

print("\n🌍 PRACTICAL APPLICATIONS:")
applications_list = [
    "Health Risk Assessment: Identify high-risk pollution zones",
    "Resource Allocation: Target interventions based on cluster profiles",
    "Policy Development: National vs local pollution management strategies",
    "Environmental Monitoring: Systematic pollution pattern recognition",
    "Urban Planning: Inform sustainable city development",
]
_print_bullets(applications_list)

print("\n🔮 FUTURE SCOPE:")
future_items = [
    "Temporal Analysis: Incorporate seasonal/yearly pollution trends",
    "Socio-Economic Integration: Add demographic and economic indicators",
    "Weather Correlation: Include meteorological factors",
    "Advanced Clustering: Compare with DBSCAN, hierarchical methods",
    "Real-time Monitoring: Dynamic clustering for live air quality data",
]
_print_bullets(future_items)

# --- Checklist of what the notebook delivered ---
_print_banner("🏆 SUCCESS METRICS")

success_metrics = {
    "Manual Implementation": "✅ Complete K-means from scratch",
    "Visual Analysis": "✅ Comprehensive plots for each presenter",
    "Convergence Tracking": "✅ Iteration visualization for P4",
    "Multi-dimensional Analysis": "✅ 1D, 2D, and 3D feature spaces",
    "Practical Insights": "✅ Actionable conclusions for each analysis",
}

for metric_name in success_metrics:
    print(f"   {metric_name:<25}: {success_metrics[metric_name]}")

# --- Sign-off ---
print("\n" + "="*80)
print("Thank you for following this comprehensive multi-pollutant clustering analysis!")
print("All presenter sections completed with detailed visualizations and insights!")
print("="*80)