# Multi-Pollutant Profile Grouping of Air Quality Data Using K-Means Clustering

## Comprehensive Analysis Covering All Presenter Sections

This notebook demonstrates a manual K-means clustering implementation on air quality data, without using sklearn. We'll analyze pollution patterns at different scales - from individual cities to whole countries - using various pollutant combinations.

### Presentation Structure:
1. **Presenter 1**: Simple 2D Clustering (PM2.5 and PM10)
2. **Presenter 2**: Country-Level AQI Clustering  
3. **Presenter 3**: City-Level AQI Clustering
4. **Presenter 4**: Two-Pollutant City Clustering (PM2.5 and NO2)
5. **Presenter 5**: Multi-Feature Three-Pollutant Clustering
6. **Results Comparison and Analysis**

## Section 1: Data Preparation and Imports

First, let's import the necessary libraries and prepare our dataset. We'll implement everything manually without using sklearn.

In [None]:
import csv
import math
import random
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

# Plot defaults shared by every figure in this notebook
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

# Manual K-Means Implementation
class ManualKMeans:
    """K-means clustering implemented with the standard library only.

    Works on 1-D data (a list of numbers) or multi-dimensional data (a list
    of equal-length numeric sequences).

    Attributes set by ``fit``:
        centroids: final list of centroids (always ``k`` entries for
            non-empty data).
        history: the centroid list recorded at start and after each update.
        final_clusters: the data points grouped per cluster index.
        cluster_assignments: cluster index of every point, in input order.
    """

    # Largest per-coordinate centroid shift still treated as "converged".
    _TOLERANCE = 1e-6

    def __init__(self, k=2, max_iters=100, random_state=42):
        self.k = k
        self.max_iters = max_iters
        self.random_state = random_state
        # Seeds the module-level RNG (kept for backward compatibility: code
        # that draws from ``random`` after construction stays reproducible).
        random.seed(random_state)

    def euclidean_distance(self, point1, point2):
        """Return the Euclidean distance between two points (scalars or sequences)."""
        if isinstance(point1, (int, float)) and isinstance(point2, (int, float)):
            return abs(point1 - point2)

        return math.sqrt(sum((a - b) ** 2 for a, b in zip(point1, point2)))

    def initialize_centroids(self, data):
        """Draw ``k`` random centroids uniformly inside the data's bounding box.

        Returns an empty list for empty data.
        """
        if not data:
            return []

        if isinstance(data[0], (int, float)):
            # 1D data
            low, high = min(data), max(data)
            return [random.uniform(low, high) for _ in range(self.k)]

        # Multi-dimensional data: the bounds do not depend on the centroid
        # being drawn, so compute them once per feature.
        n_features = len(data[0])
        bounds = [
            (min(point[j] for point in data), max(point[j] for point in data))
            for j in range(n_features)
        ]
        return [
            [random.uniform(low, high) for low, high in bounds]
            for _ in range(self.k)
        ]

    def assign_clusters(self, data, centroids):
        """Assign each point to its nearest centroid.

        Returns ``(clusters, cluster_assignments)``: ``clusters`` is a list
        of ``k`` point lists and ``cluster_assignments`` holds the chosen
        cluster index for every point. Ties go to the lowest index.
        """
        clusters = [[] for _ in range(self.k)]
        cluster_assignments = []

        for point in data:
            distances = [self.euclidean_distance(point, centroid) for centroid in centroids]
            closest_cluster = distances.index(min(distances))
            clusters[closest_cluster].append(point)
            cluster_assignments.append(closest_cluster)

        return clusters, cluster_assignments

    def update_centroids(self, clusters, previous_centroids=None):
        """Return the mean of every cluster.

        An empty cluster keeps its previous centroid so that centroid
        indices keep matching cluster indices and the result always has one
        entry per cluster. ``previous_centroids`` defaults to the instance's
        current ``centroids``; if no previous centroid is available for an
        empty cluster, that cluster is skipped.
        """
        if previous_centroids is None:
            previous_centroids = getattr(self, 'centroids', None) or []

        new_centroids = []
        for index, cluster in enumerate(clusters):
            if not cluster:
                if index < len(previous_centroids):
                    new_centroids.append(previous_centroids[index])
                continue

            size = len(cluster)
            if isinstance(cluster[0], (int, float)):
                # 1D data
                centroid = sum(cluster) / size
            else:
                # Multi-dimensional data
                centroid = [
                    sum(point[j] for point in cluster) / size
                    for j in range(len(cluster[0]))
                ]
            new_centroids.append(centroid)

        return new_centroids

    def _has_converged(self, old_centroids, new_centroids):
        """Return True when no centroid coordinate moved more than the tolerance."""
        if len(old_centroids) != len(new_centroids):
            return False

        for old, new in zip(old_centroids, new_centroids):
            if isinstance(old, (int, float)):
                if abs(old - new) > self._TOLERANCE:
                    return False
            elif any(abs(o - n) > self._TOLERANCE for o, n in zip(old, new)):
                return False
        return True

    def fit(self, data):
        """Run K-means on ``data`` and return ``self``.

        Stops early (and prints the iteration count) once the centroids stop
        moving; otherwise runs for ``max_iters`` iterations.
        """
        self.centroids = self.initialize_centroids(data)
        self.history = [self.centroids.copy()]

        for iteration in range(self.max_iters):
            clusters, _ = self.assign_clusters(data, self.centroids)
            new_centroids = self.update_centroids(clusters, self.centroids)

            if self._has_converged(self.centroids, new_centroids):
                print(f"Converged after {iteration + 1} iterations")
                break

            self.centroids = new_centroids
            self.history.append(self.centroids.copy())

        self.final_clusters, self.cluster_assignments = self.assign_clusters(data, self.centroids)
        return self

# Helper functions
def load_air_quality_data(path='s:\\AIML\\global_air_quality_data_10000.csv'):
    """Load the air quality dataset from a CSV file.

    Args:
        path: location of the CSV file. Defaults to the path originally
            hard-coded in this notebook, so existing no-argument calls
            behave as before.

    Returns:
        A list with one dict per CSV row, keyed by the header names. All
        values are the raw strings read from the file.
    """
    # newline='' lets the csv module handle line endings itself, as the
    # csv documentation requires for files passed to a reader.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.DictReader(file))

def get_city_averages(data, cities, pollutants):
    """Calculate average pollutant values for specified cities.

    Only records in which every requested pollutant parses as a number are
    used, so all averages of a city are computed over the same records.

    Args:
        data: iterable of record dicts containing a 'City' key.
        cities: city names to include; the result follows this order.
        pollutants: record keys to average.

    Returns:
        Dict mapping city name to a list of averages, one per pollutant in
        the order given. Cities without any complete record are omitted.
    """
    wanted = set(cities)  # constant-time membership test inside the loop
    city_data = defaultdict(list)

    for record in data:
        city = record['City']
        if city not in wanted:
            continue
        try:
            readings = [float(record[pollutant]) for pollutant in pollutants]
        except (ValueError, KeyError, TypeError):
            # Missing column, non-numeric text, or a None field: skip the
            # whole record rather than average over a partial one.
            continue
        city_data[city].append(readings)

    averages = {}
    for city in cities:
        rows = city_data.get(city)
        if rows:
            averages[city] = [
                sum(row[j] for row in rows) / len(rows)
                for j in range(len(pollutants))
            ]

    return averages

def calculate_aqi_simple(pm25, pm10, no2):
    """Return a simplified AQI score in the range 0-100.

    Each pollutant is divided by a fixed reference level (35 for PM2.5,
    50 for PM10, 40 for NO2), capped at 1.0 and scaled to 100; the score is
    the largest of the three.
    """
    readings_and_limits = ((pm25, 35.0), (pm10, 50.0), (no2, 40.0))
    return max(
        min(reading / limit, 1.0) * 100
        for reading, limit in readings_and_limits
    )

# Read the CSV into memory and report how many rows were found
print("Loading air quality dataset...")
air_quality_data = load_air_quality_data()
print(f"Loaded {len(air_quality_data)} records")

# Show the first few rows so the column layout is visible
print("\nFirst 3 records:")
for number, row in enumerate(air_quality_data[:3], start=1):
    print(f"Record {number}: {row}")

: 

## Section 2: Presenter 1 - Simple 2D Clustering (PM2.5 and PM10)

**Topic**: Clustering cities using two pollutants: PM2.5 and PM10  
**Dataset Focus**: Sample of 10 cities with average PM2.5 and PM10 values  
**Method**: Manual K-Means clustering with K=2 clusters

In [None]:
# Presenter 1: Simple 2D Clustering with PM2.5 and PM10

def get_city_averages(data, cities, pollutants):
    """Calculate average pollutant values for specified cities.

    A record contributes only when every requested pollutant parses as a
    number, so each city's averages describe the same set of measurements.
    (Averaging each pollutant over a different subset of records would
    produce feature vectors that no real record supports.)

    Args:
        data: iterable of record dicts containing a 'City' key.
        cities: city names to include; the result follows this order.
        pollutants: record keys to average.

    Returns:
        Dict mapping city name to a list of averages, one per pollutant in
        the order given. Cities without any complete record are omitted
        instead of being reported with a placeholder 0.
    """
    wanted = set(cities)
    totals = {}                 # city -> running sum per pollutant
    counts = defaultdict(int)   # city -> number of complete records

    for record in data:
        city = record['City']
        if city not in wanted:
            continue
        try:
            readings = [float(record[pollutant]) for pollutant in pollutants]
        except (ValueError, KeyError, TypeError):
            # Non-numeric text, missing column, or None from a short CSV row.
            continue

        running = totals.setdefault(city, [0.0] * len(pollutants))
        for j, value in enumerate(readings):
            running[j] += value
        counts[city] += 1

    return {
        city: [total / counts[city] for total in totals[city]]
        for city in cities
        if city in totals
    }

# The ten cities compared in this section
selected_cities = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris', 'Tokyo',
    'New York', 'London', 'Cairo', 'Mexico City', 'Seoul',
]

# Per-city mean of the two particulate measurements
pollutants = ['PM2.5', 'PM10']
city_averages = get_city_averages(air_quality_data, selected_cities, pollutants)

print("Presenter 1: City averages for PM2.5 and PM10")
print("=" * 50)
for city in city_averages:
    pm25_avg, pm10_avg = city_averages[city][0], city_averages[city][1]
    print(f"{city:15}: PM2.5={pm25_avg:6.2f}, PM10={pm10_avg:6.2f}")

# Feature vectors and their matching city names, in the same order
city_names = list(city_averages)
clustering_data = [city_averages[name] for name in city_names]

print(f"\nClustering data shape: {len(clustering_data)} cities × {len(pollutants)} features")

In [None]:
# Perform K-Means clustering with K=2
kmeans_p1 = ManualKMeans(k=2, random_state=42)
kmeans_p1.fit(clustering_data)

print("\nPresenter 1: K-Means Clustering Results (K=2)")
print("=" * 50)

# Display final centroids
print("Final Centroids:")
for i, centroid in enumerate(kmeans_p1.centroids):
    print(f"Cluster {i+1}: PM2.5={centroid[0]:6.2f}, PM10={centroid[1]:6.2f}")

print("\nCluster Assignments:")
for city, cluster_id in zip(city_names, kmeans_p1.cluster_assignments):
    values = city_averages[city]
    print(f"{city:15}: Cluster {cluster_id+1} (PM2.5={values[0]:6.2f}, PM10={values[1]:6.2f})")

# Group the cities per cluster once; both plots below reuse this grouping.
cities_by_cluster = {i: [] for i in range(kmeans_p1.k)}
for city, cluster_id in zip(city_names, kmeans_p1.cluster_assignments):
    cities_by_cluster[cluster_id].append(city)

# K-means numbers its clusters arbitrarily, so the descriptive names must be
# derived from the centroids: the populated cluster with the largest combined
# centroid concentration is the "higher pollution" one.
severity_labels = ['Higher Pollution Group', 'Lower Pollution Group']
populated_clusters = [
    i for i in range(len(kmeans_p1.centroids)) if cities_by_cluster.get(i)
]
ranked_clusters = sorted(
    populated_clusters,
    key=lambda idx: sum(kmeans_p1.centroids[idx]),
    reverse=True,
)
cluster_names = [severity_labels[-1]] * kmeans_p1.k
for rank, idx in enumerate(ranked_clusters):
    cluster_names[idx] = severity_labels[min(rank, len(severity_labels) - 1)]

# Create detailed visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Scatter plot with clusters
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

for i in range(kmeans_p1.k):
    cluster_cities = cities_by_cluster[i]
    cluster_data = [city_averages[city] for city in cluster_cities]

    if cluster_data:
        pm25_values = [point[0] for point in cluster_data]
        pm10_values = [point[1] for point in cluster_data]
        ax1.scatter(pm25_values, pm10_values, c=colors[i], label=f'Cluster {i+1}: {cluster_names[i]}',
                    s=120, alpha=0.8, edgecolors='black', linewidth=1)

        # Add city labels
        for city, pm25, pm10 in zip(cluster_cities, pm25_values, pm10_values):
            ax1.annotate(city, (pm25, pm10), xytext=(5, 5), textcoords='offset points',
                         fontsize=9, fontweight='bold')

# Plot centroids
for i, centroid in enumerate(kmeans_p1.centroids):
    ax1.scatter(centroid[0], centroid[1], c='black', marker='X', s=300,
                edgecolors=colors[i], linewidth=3, label=f'Centroid {i+1}')

ax1.set_xlabel('PM2.5 (μg/m³)', fontsize=12, fontweight='bold')
ax1.set_ylabel('PM10 (μg/m³)', fontsize=12, fontweight='bold')
ax1.set_title('Presenter 1: City Clustering by PM2.5 and PM10 (K=2)', fontsize=14, fontweight='bold')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Plot 2: Bar chart showing cluster statistics (empty clusters are left out)
cluster_stats = {}
for i in range(kmeans_p1.k):
    cluster_data = [city_averages[city] for city in cities_by_cluster[i]]

    if cluster_data:
        avg_pm25 = sum(point[0] for point in cluster_data) / len(cluster_data)
        avg_pm10 = sum(point[1] for point in cluster_data) / len(cluster_data)
        cluster_stats[f'Cluster {i+1}'] = {'PM2.5': avg_pm25, 'PM10': avg_pm10, 'Cities': len(cluster_data)}

x_pos = np.arange(len(cluster_stats))
pm25_values = [stats['PM2.5'] for stats in cluster_stats.values()]
pm10_values = [stats['PM10'] for stats in cluster_stats.values()]

width = 0.35
bars1 = ax2.bar(x_pos - width/2, pm25_values, width, label='PM2.5', color=colors[0], alpha=0.8)
bars2 = ax2.bar(x_pos + width/2, pm10_values, width, label='PM10', color=colors[1], alpha=0.8)

ax2.set_xlabel('Clusters', fontsize=12, fontweight='bold')
ax2.set_ylabel('Average Concentration (μg/m³)', fontsize=12, fontweight='bold')
ax2.set_title('Average Pollutant Levels by Cluster', fontsize=14, fontweight='bold')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(list(cluster_stats.keys()))
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax2.annotate(f'{height:.1f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n📊 Conclusion from Clustered Graph:")
print("The visualization reveals two distinct groups—one with higher pollution and one cleaner cluster—")
print("demonstrating a clear distinction even with just two features. This sets the foundation")
print("for more complex multi-feature clustering analyses.")

print(f"\nCluster Statistics:")
for cluster_name, stats in cluster_stats.items():
    print(f"  {cluster_name}: {stats['Cities']} cities, Avg PM2.5={stats['PM2.5']:.2f}, Avg PM10={stats['PM10']:.2f}")

## Section 3: Presenter 2 - Country-Level AQI Clustering

**Topic**: Grouping countries by average AQI into Low, Medium, and High pollution clusters  
**Dataset Focus**: 6 countries' average AQI extracted from the dataset  
**Method**: 1D K-Means clustering with K=3 on AQI values

In [None]:
# Presenter 2: Country-Level AQI Clustering

def calculate_aqi_simple(pm25, pm10, no2):
    """
    Simplified AQI score (0-100) for demonstration purposes.

    Every pollutant is expressed as a percentage of a fixed reference
    concentration, capped at 100; the worst (largest) percentage is the score.
    """
    def _sub_index(concentration, reference):
        # Share of the reference level, saturating at 100 %
        return min(concentration / reference, 1.0) * 100

    # Reference concentrations in μg/m³. NOTE(review): the notebook originally
    # described these as WHO guideline values - confirm against the current
    # WHO air quality guidelines before citing them.
    sub_indices = [
        _sub_index(pm25, 35.0),
        _sub_index(pm10, 50.0),
        _sub_index(no2, 40.0),
    ]

    # The most exceeded pollutant is the limiting factor
    aqi = max(sub_indices)
    return aqi

def get_country_aqi_averages(data, countries):
    """Return the mean simplified AQI of each requested country.

    Records whose PM2.5, PM10 or NO2 field is missing or non-numeric are
    ignored. Countries without any usable record are left out of the result.
    """
    scores_by_country = defaultdict(list)

    for row in data:
        name = row['Country']
        if name not in countries:
            continue
        try:
            score = calculate_aqi_simple(
                float(row['PM2.5']),
                float(row['PM10']),
                float(row['NO2']),
            )
        except (ValueError, KeyError):
            continue
        scores_by_country[name].append(score)

    # Mean per country, following the order of the requested list
    averages = {}
    for name in countries:
        scores = scores_by_country.get(name)
        if scores:
            averages[name] = sum(scores) / len(scores)

    return averages

# The six countries compared in this section
selected_countries = ['Thailand', 'Turkey', 'Brazil', 'India', 'France', 'USA']

# Mean simplified AQI per country
country_aqi = get_country_aqi_averages(air_quality_data, selected_countries)

print("Presenter 2: Country Average AQI Values")
print("=" * 40)
for country, aqi in sorted(country_aqi.items(), key=lambda pair: pair[1]):
    print(f"{country:12}: AQI = {aqi:6.2f}")

# 1D clustering input plus the country names in matching order
country_names_p2 = list(country_aqi)
aqi_values = [country_aqi[name] for name in country_names_p2]

print(f"\nClustering data: {len(aqi_values)} countries with AQI values")

In [None]:
# Perform K-Means clustering with K=3 for country AQI
kmeans_p2 = ManualKMeans(k=3, random_state=42)
kmeans_p2.fit(aqi_values)

print("\nPresenter 2: K-Means Clustering Results (K=3)")
print("=" * 50)

# Display final centroids, ordered from lowest to highest AQI
print("Final Centroids (AQI levels):")
centroid_labels = ['Low Pollution', 'Medium Pollution', 'High Pollution']
sorted_centroids = sorted(enumerate(kmeans_p2.centroids), key=lambda x: x[1])

for i, (orig_idx, centroid) in enumerate(sorted_centroids):
    print(f"Cluster {orig_idx+1} ({centroid_labels[i]}): AQI = {centroid:6.2f}")

print("\nCountry Cluster Assignments:")
for country, cluster_id, aqi in zip(country_names_p2, kmeans_p2.cluster_assignments, aqi_values):
    print(f"{country:12}: Cluster {cluster_id+1} - AQI = {aqi:6.2f}")

# Create comprehensive visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

colors = ['#2ECC71', '#F39C12', '#E74C3C']  # Green, Orange, Red
pollution_levels = ['Low', 'Medium', 'High']

# Cluster indices are arbitrary, so colour and Low/Medium/High wording are
# looked up through each cluster's rank in the sorted centroid list. This
# keeps every plot consistent with the centroid summary printed above.
# The number of centroids is used (not k) because the fit may return fewer.
n_clusters = len(kmeans_p2.centroids)
severity_rank = {orig_idx: rank for rank, (orig_idx, _) in enumerate(sorted_centroids)}
cluster_colors = [colors[severity_rank[i]] for i in range(n_clusters)]
cluster_levels = [pollution_levels[severity_rank[i]] for i in range(n_clusters)]

# Group countries by cluster
cluster_groups = {i: [] for i in range(kmeans_p2.k)}
for country, cluster_id, aqi in zip(country_names_p2, kmeans_p2.cluster_assignments, aqi_values):
    cluster_groups[cluster_id].append((country, aqi))

# Plot 1: 1D scatter plot along AQI axis (small vertical jitter for legibility)
y_offset = 0.1
for i in range(n_clusters):
    if cluster_groups[i]:
        countries, aqis = zip(*cluster_groups[i])
        y_positions = [i + random.uniform(-y_offset, y_offset) for _ in aqis]
        ax1.scatter(aqis, y_positions, c=cluster_colors[i], label=f'Cluster {i+1}', s=150, alpha=0.8, edgecolors='black')

        # Add country labels
        for country, aqi, y in zip(countries, aqis, y_positions):
            ax1.annotate(country, (aqi, y), xytext=(5, 0), textcoords='offset points',
                         fontsize=10, fontweight='bold', va='center')

# Plot centroids
for i, centroid in enumerate(kmeans_p2.centroids):
    ax1.axvline(x=centroid, color=cluster_colors[i], linestyle='--', linewidth=3, alpha=0.7)
    ax1.scatter(centroid, i, c='black', marker='X', s=300,
                edgecolors=cluster_colors[i], linewidth=3, zorder=5)

ax1.set_xlabel('Average AQI', fontsize=12, fontweight='bold')
ax1.set_ylabel('Cluster', fontsize=12, fontweight='bold')
ax1.set_title('Presenter 2: Country Clustering by Average AQI (K=3)', fontsize=14, fontweight='bold')
ax1.set_yticks(range(kmeans_p2.k))
ax1.set_yticklabels([f'Cluster {i+1}' for i in range(kmeans_p2.k)])
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Bar chart of AQI values by country
countries_sorted = sorted(zip(country_names_p2, aqi_values, kmeans_p2.cluster_assignments), key=lambda x: x[1])
countries, aqis_sorted, clusters = zip(*countries_sorted)

bars = ax2.bar(range(len(countries)), aqis_sorted,
               color=[cluster_colors[cluster] for cluster in clusters], alpha=0.8, edgecolor='black')
ax2.set_xlabel('Countries', fontsize=12, fontweight='bold')
ax2.set_ylabel('Average AQI', fontsize=12, fontweight='bold')
ax2.set_title('AQI Values by Country (Colored by Cluster)', fontsize=14, fontweight='bold')
ax2.set_xticks(range(len(countries)))
ax2.set_xticklabels(countries, rotation=45, ha='right')
ax2.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, aqi in zip(bars, aqis_sorted):
    ax2.annotate(f'{aqi:.1f}',
                 xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontweight='bold')

# Plot 3: Pie chart showing cluster distribution
cluster_counts = [len(cluster_groups[i]) for i in range(n_clusters)]
cluster_labels_pie = [f'Cluster {i+1}\n({cluster_levels[i]} Pollution)\n{count} countries'
                      for i, count in enumerate(cluster_counts)]

wedges, texts, autotexts = ax3.pie(cluster_counts, labels=cluster_labels_pie, colors=cluster_colors,
                                   autopct='%1.1f%%', startangle=90, textprops={'fontweight': 'bold'})
ax3.set_title('Distribution of Countries by Pollution Level', fontsize=14, fontweight='bold')

# Plot 4: Centroid comparison, bars ordered from lowest to highest AQI so the
# Low/Medium/High tick labels match the bar they sit under
centroid_values = [centroid for _, centroid in sorted_centroids]
bars4 = ax4.bar(range(n_clusters), centroid_values, color=colors[:n_clusters], alpha=0.8, edgecolor='black')
ax4.set_xlabel('Pollution Level', fontsize=12, fontweight='bold')
ax4.set_ylabel('Centroid AQI Value', fontsize=12, fontweight='bold')
ax4.set_title('Cluster Centroids Comparison', fontsize=14, fontweight='bold')
ax4.set_xticks(range(n_clusters))
ax4.set_xticklabels([f'{pollution_levels[rank]}\nPollution' for rank in range(n_clusters)])
ax4.grid(True, alpha=0.3, axis='y')

# Add value labels
for bar, value in zip(bars4, centroid_values):
    ax4.annotate(f'{value:.2f}',
                 xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n📊 Conclusion from Clustered Graph:")
print("Distinct partitions indicate national-level air quality differences. This clustering")
print("enables policymakers to focus resources on high-risk nations by identifying")
print("the pollution severity category.")

print(f"\nDetailed Cluster Analysis:")
for i in range(n_clusters):
    if cluster_groups[i]:
        print(f"  🌍 {cluster_levels[i]} Pollution Cluster ({len(cluster_groups[i])} countries):")
        print(f"     Centroid AQI: {kmeans_p2.centroids[i]:.2f}")
        print(f"     Countries: {', '.join([country for country, _ in cluster_groups[i]])}")
        avg_aqi = sum(aqi for _, aqi in cluster_groups[i]) / len(cluster_groups[i])
        print(f"     Average AQI: {avg_aqi:.2f}")
        print()

## Section 4: Presenter 3 - City-Level AQI Clustering

**Topic**: Clustering 8 cities based on AQI alone  
**Dataset Focus**: AQI values for 8 cities in the dataset  
**Method**: K=3 clusters created using manual assignment and centroid updates

In [None]:
# Presenter 3: City-Level AQI Clustering

def get_city_aqi_averages(data, cities):
    """Return the mean simplified AQI of each requested city.

    A record is skipped when its PM2.5, PM10 or NO2 value is missing or not
    numeric; a city with no usable record does not appear in the result.
    """
    aqi_by_city = defaultdict(list)

    for entry in data:
        if entry['City'] not in cities:
            continue
        try:
            particulates_fine = float(entry['PM2.5'])
            particulates_coarse = float(entry['PM10'])
            nitrogen_dioxide = float(entry['NO2'])
            aqi_by_city[entry['City']].append(
                calculate_aqi_simple(particulates_fine, particulates_coarse, nitrogen_dioxide)
            )
        except (ValueError, KeyError):
            continue

    # Average the collected scores, keeping the order of the requested cities
    return {
        name: sum(aqi_by_city[name]) / len(aqi_by_city[name])
        for name in cities
        if name in aqi_by_city and aqi_by_city[name]
    }

# The eight cities compared in this section
selected_cities_p3 = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris',
    'Tokyo', 'New York', 'London', 'Cairo',
]

# Mean simplified AQI per city
city_aqi = get_city_aqi_averages(air_quality_data, selected_cities_p3)

print("Presenter 3: City Average AQI Values")
print("=" * 40)
for city, aqi in sorted(city_aqi.items(), key=lambda pair: pair[1]):
    print(f"{city:15}: AQI = {aqi:6.2f}")

# 1D clustering input plus the city names in matching order
city_names_p3 = list(city_aqi)
city_aqi_values = [city_aqi[name] for name in city_names_p3]

print(f"\nClustering data: {len(city_aqi_values)} cities with AQI values")

In [None]:
# Perform K-Means clustering with K=3 for city AQI
kmeans_p3 = ManualKMeans(k=3, random_state=42)
kmeans_p3.fit(city_aqi_values)

print("\nPresenter 3: K-Means Clustering Results (K=3)")
print("=" * 50)

# Display final centroids, ordered from lowest to highest AQI
print("Final Centroids (AQI levels):")
centroid_labels = ['Low Pollution', 'Medium Pollution', 'High Pollution']
sorted_centroids_p3 = sorted(enumerate(kmeans_p3.centroids), key=lambda x: x[1])

for i, (orig_idx, centroid) in enumerate(sorted_centroids_p3):
    print(f"Cluster {orig_idx+1} ({centroid_labels[i]}): AQI = {centroid:6.2f}")

# Remember the label each cluster received above so the detailed listing
# below uses the same wording for the same cluster.
level_by_cluster = {
    orig_idx: centroid_labels[rank]
    for rank, (orig_idx, _) in enumerate(sorted_centroids_p3)
}

print("\nCity Cluster Assignments (Textual Output):")
print("-" * 55)

# Group cities by cluster
clusters_dict = defaultdict(list)
for city, cluster_id, aqi in zip(city_names_p3, kmeans_p3.cluster_assignments, city_aqi_values):
    clusters_dict[cluster_id].append((city, aqi))

# Display each cluster. Iterating over the centroids themselves (instead of
# range(k)) stays in bounds even if the fit returned fewer than k centroids.
for cluster_id, centroid_aqi in enumerate(kmeans_p3.centroids):
    level = level_by_cluster[cluster_id]

    print(f"\n🏙️  CLUSTER {cluster_id+1} ({level}):")
    print(f"   Centroid AQI: {centroid_aqi:.2f}")
    print("   Cities in this cluster:")

    for city, aqi in sorted(clusters_dict[cluster_id], key=lambda x: x[1]):
        print(f"   • {city:15}: AQI = {aqi:6.2f}")

print("\n📊 Conclusion from Clustering:")
print("Clusters effectively group cities by similar pollution levels, enabling identification")
print("of local pollution discrepancies and facilitating targeted urban pollution management.")

## Section 5: Presenter 4 - Two-Pollutant City Clustering (PM2.5 and NO2)

**Topic**: Multi-feature clustering for 8 cities using PM2.5 and NO2  
**Dataset Focus**: Average PM2.5 and NO2 for 8 cities in the dataset  
**Method**: Manual K-Means with K=3 clusters, showing centroid movement iterations

In [None]:
# Presenter 4: Two-Pollutant City Clustering (PM2.5 and NO2)

# The eight cities compared in this section
selected_cities_p4 = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris',
    'Tokyo', 'New York', 'London', 'Cairo',
]

# Per-city mean of fine particulates and nitrogen dioxide
pollutants_p4 = ['PM2.5', 'NO2']
city_averages_p4 = get_city_averages(air_quality_data, selected_cities_p4, pollutants_p4)

print("Presenter 4: City averages for PM2.5 and NO2")
print("=" * 50)
for city in city_averages_p4:
    pm25_avg, no2_avg = city_averages_p4[city][0], city_averages_p4[city][1]
    print(f"{city:15}: PM2.5={pm25_avg:6.2f}, NO2={no2_avg:6.2f}")

# Feature vectors and their matching city names, in the same order
city_names_p4 = list(city_averages_p4)
clustering_data_p4 = [city_averages_p4[name] for name in city_names_p4]

print(f"\nClustering data shape: {len(clustering_data_p4)} cities × {len(pollutants_p4)} features")

In [None]:
# Perform K-Means clustering with K=3, showing iterations
class IterativeKMeans(ManualKMeans):
    """K-means variant that records every iteration so it can be visualized.

    After ``fit_with_iterations`` the instance exposes:

    * ``centroids`` - the centroids used for the final assignment pass.
    * ``iteration_history`` - centroid snapshots; entry 0 is the initial set.
    * ``assignment_history`` - the assignments computed in each iteration;
      entry ``i`` was computed against ``iteration_history[i]``.
    * ``final_clusters`` / ``cluster_assignments`` - result of the final
      ``assign_clusters`` call.
    """

    # Largest per-coordinate change that still counts as "did not move".
    CONVERGENCE_TOL = 1e-6

    @staticmethod
    def _centroid_moved(old, new, tol=CONVERGENCE_TOL):
        """Return True when any coordinate of a centroid changed by more than tol.

        Scalar centroids (1-D data) are compared directly; sequence centroids
        are compared coordinate by coordinate.
        """
        if isinstance(old, (int, float)) and isinstance(new, (int, float)):
            return abs(old - new) > tol
        return any(abs(o - n) > tol for o, n in zip(old, new))

    def fit_with_iterations(self, data):
        """Fit K-means on ``data`` while recording per-iteration state.

        Args:
            data: the points to cluster, in whatever form the inherited
                ``initialize_centroids`` / ``assign_clusters`` accept.

        Returns:
            ``self``, so the call can be chained; the history is available on
            ``iteration_history`` and ``assignment_history``.
        """
        self.centroids = self.initialize_centroids(data)
        self.iteration_history = [self.centroids.copy()]
        self.assignment_history = []

        for iteration in range(self.max_iters):
            clusters, assignments = self.assign_clusters(data, self.centroids)
            self.assignment_history.append(assignments.copy())

            # Drop None centroids (the original comment says these mark empty clusters).
            # NOTE(review): after a drop, len(self.centroids) is smaller than self.k,
            # which this method never updates - callers should iterate the centroids,
            # not range(self.k).
            new_centroids = [
                c for c in self.update_centroids(clusters) if c is not None
            ]

            # Convergence is only meaningful when no centroid was dropped this round.
            same_count = len(new_centroids) == len(self.centroids)
            if same_count and not any(
                self._centroid_moved(old, new)
                for old, new in zip(self.centroids, new_centroids)
            ):
                print(f"Converged after {iteration + 1} iterations")
                break

            self.centroids = new_centroids
            self.iteration_history.append(self.centroids.copy())

        self.final_clusters, self.cluster_assignments = self.assign_clusters(data, self.centroids)
        return self

# Perform clustering with iteration tracking (fit_with_iterations returns the fitted model)
kmeans_p4 = IterativeKMeans(k=3, random_state=42).fit_with_iterations(clustering_data_p4)

for banner_line in ("\nPresenter 4: K-Means Clustering Results (K=3)", "=" * 50):
    print(banner_line)

# Display final centroids, numbered from 1 for readability
print("Final Centroids:")
for cluster_number, center in enumerate(kmeans_p4.centroids, start=1):
    pm25_center, no2_center = center[0], center[1]
    print(f"Cluster {cluster_number}: PM2.5={pm25_center:6.2f}, NO2={no2_center:6.2f}")

# One line per city: its 1-based cluster number next to its own averages
print("\nCluster Assignments:")
for city_name, assigned in zip(city_names_p4, kmeans_p4.cluster_assignments):
    city_values = city_averages_p4[city_name]
    pm25_avg, no2_avg = city_values[0], city_values[1]
    print(f"{city_name:15}: Cluster {assigned+1} (PM2.5={pm25_avg:6.2f}, NO2={no2_avg:6.2f})")

# Visualize iterations: one panel per recorded centroid snapshot on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

# One color per cluster index; defined once because it is the same for every panel
colors = ['red', 'blue', 'green', 'orange']

# Show the first snapshots that fit on the grid (or all of them if fewer)
iterations_to_show = min(len(axes), len(kmeans_p4.iteration_history))

for iter_idx in range(iterations_to_show):
    ax = axes[iter_idx]

    # Get centroids for this iteration
    current_centroids = kmeans_p4.iteration_history[iter_idx]

    # Use the recorded assignments when present; otherwise derive them from the
    # snapshot's centroids instead of painting every city as a single cluster
    if iter_idx < len(kmeans_p4.assignment_history):
        current_assignments = kmeans_p4.assignment_history[iter_idx]
    else:
        _, current_assignments = kmeans_p4.assign_clusters(clustering_data_p4, current_centroids)

    # Plot data points colored by cluster
    for i in range(len(current_centroids)):
        color = colors[i % len(colors)]  # wrap around rather than overrun the palette
        cluster_data = [
            point
            for point, cluster in zip(clustering_data_p4, current_assignments)
            if cluster == i
        ]

        if cluster_data:
            pm25_values = [point[0] for point in cluster_data]
            no2_values = [point[1] for point in cluster_data]
            ax.scatter(pm25_values, no2_values, c=color, label=f'Cluster {i+1}', s=100, alpha=0.7)

    # Plot centroids as black X markers outlined in their cluster's color
    for i, centroid in enumerate(current_centroids):
        ax.scatter(centroid[0], centroid[1], c='black', marker='X', s=200,
                   edgecolors=colors[i % len(colors)], linewidth=2)

    ax.set_xlabel('PM2.5 (μg/m³)')
    ax.set_ylabel('NO2 (μg/m³)')
    ax.set_title(f'Iteration {iter_idx + 1}')
    ax.grid(True, alpha=0.3)
    ax.legend()

# Hide grid cells that received no snapshot so the figure has no empty axes
for unused_ax in axes[iterations_to_show:]:
    unused_ax.set_visible(False)

plt.suptitle('Presenter 4: K-Means Iterations - Centroid Movement (PM2.5 vs NO2)', fontsize=16)
plt.tight_layout()
plt.show()

print("\n📊 Conclusion from Clustered Graph:")
print("Clusters capture pollutant-compositional diversity among cities. Iterative centroid")
print("updates provide insight into how clustering converges on meaningful groupings")
print("in multi-dimensional space.")

## Section 6: Presenter 5 - Multi-Feature Three-Pollutant Clustering

**Topic**: Clustering of 7 cities using PM2.5, PM10, and NO2  
**Dataset Focus**: Multi-pollutant averages from 7 cities in the dataset  
**Method**: Manual K-Means with K=2 clusters over three features using multi-dimensional Euclidean distance

In [None]:
# Presenter 5: Multi-Feature Three-Pollutant Clustering

# The three features used for clustering and the 7 cities they are averaged over
pollutants_p5 = ['PM2.5', 'PM10', 'NO2']
selected_cities_p5 = [
    'Bangkok', 'Istanbul', 'Mumbai', 'Paris',
    'Tokyo', 'New York', 'London',
]

# Mapping of city -> averaged pollutant values (printed below as PM2.5, PM10, NO2)
city_averages_p5 = get_city_averages(air_quality_data, selected_cities_p5, pollutants_p5)

for banner_line in ("Presenter 5: City averages for PM2.5, PM10, and NO2", "=" * 60):
    print(banner_line)
for city_name, averages in city_averages_p5.items():
    pm25_avg, pm10_avg, no2_avg = averages[0], averages[1], averages[2]
    print(f"{city_name:15}: PM2.5={pm25_avg:6.2f}, PM10={pm10_avg:6.2f}, NO2={no2_avg:6.2f}")

# Split the mapping into parallel lists: names for reporting, feature rows for clustering
city_names_p5 = list(city_averages_p5)
clustering_data_p5 = [city_averages_p5[name] for name in city_names_p5]

print(f"\nClustering data shape: {len(clustering_data_p5)} cities × {len(pollutants_p5)} features")

# Perform K-Means clustering with K=2 on the three-feature rows
kmeans_p5 = ManualKMeans(k=2, random_state=42)
kmeans_p5.fit(clustering_data_p5)

for banner_line in ("\nPresenter 5: K-Means Clustering Results (K=2)", "=" * 50):
    print(banner_line)

# Display final centroids, numbered from 1 for readability
print("Final Centroids (Multi-Pollutant Profiles):")
for cluster_number, profile in enumerate(kmeans_p5.centroids, start=1):
    pm25_center, pm10_center, no2_center = profile[0], profile[1], profile[2]
    print(f"Cluster {cluster_number}: PM2.5={pm25_center:6.2f}, PM10={pm10_center:6.2f}, NO2={no2_center:6.2f}")

print("\nCluster Results - Clear Groups Based on Combined Pollutant Profiles:")
print("-" * 70)

# Group cities by cluster as (city, averaged values) pairs
clusters_dict_p5 = defaultdict(list)
for city, cluster_id in zip(city_names_p5, kmeans_p5.cluster_assignments):
    clusters_dict_p5[cluster_id].append((city, city_averages_p5[city]))

# K-means numbers its clusters arbitrarily, so a label must follow the centroid's
# pollution load, not the cluster index. Rank clusters by the sum of their centroid
# values (the same "total" measure printed per city below): the lowest total is the
# cleaner group, the next one the more polluted group.
centroid_count = len(kmeans_p5.centroids)
clusters_by_pollution = sorted(
    range(centroid_count), key=lambda cid: sum(kmeans_p5.centroids[cid])
)
# Stays a list indexed by cluster id; any cluster beyond the two named groups
# falls back to a neutral "Cluster N" label.
cluster_labels = [f"Cluster {cid + 1}" for cid in range(centroid_count)]
for label, cid in zip(['Cleaner Cities', 'More Polluted Cities'], clusters_by_pollution):
    cluster_labels[cid] = label

# Display each cluster with comprehensive analysis, cleanest group first.
# Iterating the centroids that actually exist (rather than range(k)) avoids an
# IndexError if fewer than k centroids are present.
for cluster_id in clusters_by_pollution:
    centroid = kmeans_p5.centroids[cluster_id]
    print(f"\n🌍 {cluster_labels[cluster_id].upper()} (Cluster {cluster_id+1}):")
    print(f"   Centroid Profile: PM2.5={centroid[0]:.2f}, PM10={centroid[1]:.2f}, NO2={centroid[2]:.2f}")
    print("   Cities in this cluster:")

    # Cities ordered by their summed pollutant values, lowest first
    for city, values in sorted(clusters_dict_p5[cluster_id], key=lambda x: sum(x[1])):
        total_pollution = sum(values)
        print(f"   • {city:15}: PM2.5={values[0]:6.2f}, PM10={values[1]:6.2f}, NO2={values[2]:6.2f} (Total: {total_pollution:6.2f})")

# Calculate cluster statistics (per-pollutant means over the member cities)
print("\n📊 Cluster Statistics:")
for cluster_id in clusters_by_pollution:
    cluster_cities = clusters_dict_p5[cluster_id]
    if cluster_cities:  # skip empty clusters to avoid dividing by zero
        city_count = len(cluster_cities)
        pm25_avg = sum(values[0] for _, values in cluster_cities) / city_count
        pm10_avg = sum(values[1] for _, values in cluster_cities) / city_count
        no2_avg = sum(values[2] for _, values in cluster_cities) / city_count

        print(f"   {cluster_labels[cluster_id]}:")
        print(f"     Average PM2.5: {pm25_avg:.2f} μg/m³")
        print(f"     Average PM10:  {pm10_avg:.2f} μg/m³")
        print(f"     Average NO2:   {no2_avg:.2f} μg/m³")
        print(f"     Total cities:  {city_count}")

print("\n📊 Conclusion from Multi-Pollutant Clustering:")
print("Multi-pollutant analysis yields comprehensive pollution profiles, better capturing")
print("complexity of air quality. Clustering distinguishes cleaner vs. more polluted groups")
print("for multilayered health and environmental assessments.")

## Section 7: Results Comparison and Analysis

Let's compare all the clustering results and analyze the effectiveness of different feature combinations.

In [None]:
# Summary and Overall Conclusions

summary_rule = "=" * 80
print(summary_rule)
print("COMPREHENSIVE ANALYSIS SUMMARY")
print(summary_rule)

print("\n🎯 PRESENTER CONTRIBUTIONS:")
print("-" * 50)

# One row per presenter: (heading, method, features, result, insight).
# The method text embeds the number of items each presenter actually clustered.
presenter_summaries = [
    (
        "\n1️⃣ PRESENTER 1 - Simple 2D Clustering (PM2.5 & PM10)",
        f"K=2 clusters on {len(city_names)} cities",
        "2D (PM2.5, PM10)",
        "Clear separation into high/low pollution groups",
        "Basic pollution distinction with just two features",
    ),
    (
        "\n2️⃣ PRESENTER 2 - Country-Level AQI Clustering",
        f"K=3 clusters on {len(country_names_p2)} countries",
        "1D (AQI)",
        "Low/Medium/High pollution country categories",
        "National-level policy targeting capabilities",
    ),
    (
        "\n3️⃣ PRESENTER 3 - City-Level AQI Clustering",
        f"K=3 clusters on {len(city_names_p3)} cities",
        "1D (AQI)",
        "Urban pollution management groupings",
        "Local pollution discrepancy identification",
    ),
    (
        "\n4️⃣ PRESENTER 4 - Two-Pollutant City Clustering (PM2.5 & NO2)",
        f"K=3 clusters on {len(city_names_p4)} cities with iterations",
        "2D (PM2.5, NO2)",
        "Pollutant-compositional diversity capture",
        "Convergence visualization in multi-dimensional space",
    ),
    (
        "\n5️⃣ PRESENTER 5 - Multi-Feature Three-Pollutant Clustering",
        f"K=2 clusters on {len(city_names_p5)} cities",
        "3D (PM2.5, PM10, NO2)",
        "Comprehensive pollution profiles",
        "Complex air quality patterns for health assessments",
    ),
]

for heading, method, features, result, insight in presenter_summaries:
    print(heading)
    print(f"   • Method: {method}")
    print(f"   • Features: {features}")
    print(f"   • Result: {result}")
    print(f"   • Insight: {insight}")

progression_rule = "=" * 80
print(f"\n{progression_rule}")
print("📈 ANALYSIS PROGRESSION")
print(progression_rule)

# One row per presenter: (presenter, dimensionality, features, K, key insight)
progression_data = [
    ("Presenter 1", "2D", "PM2.5, PM10", 2, "Visual separation"),
    ("Presenter 2", "1D", "AQI", 3, "Policy targeting"),
    ("Presenter 3", "1D", "AQI", 3, "Urban management"),
    ("Presenter 4", "2D", "PM2.5, NO2", 3, "Iterative convergence"),
    ("Presenter 5", "3D", "PM2.5, PM10, NO2", 2, "Comprehensive profiles"),
]

# Header and rows share one left-aligned column layout; the last column is free-width
progression_row = "{:<12} {:<4} {:<18} {:<3} {}"
print(progression_row.format("Presenter", "Dims", "Features", "K", "Key Insight"))
print("-" * 70)
for progression_entry in progression_data:
    print(progression_row.format(*progression_entry))

closing_rule = "=" * 80
print(f"\n{closing_rule}")
print("🎯 OVERALL CONCLUSIONS")
print(closing_rule)

# Each closing section is a heading followed by its bullet points, printed in this order
closing_sections = [
    (
        "\n✅ KEY FINDINGS:",
        [
            "Multi-pollutant K-Means clustering reveals meaningful air quality patterns",
            "Progression from simple to complex features enhances analytical depth",
            "Different scales (city vs country) provide complementary insights",
            "Manual implementation demonstrates algorithmic understanding",
            "Clustering supports evidence-based environmental decision making",
        ],
    ),
    (
        "\n🌍 PRACTICAL APPLICATIONS:",
        [
            "Health Risk Assessment: Identify high-risk pollution zones",
            "Resource Allocation: Target interventions based on cluster profiles",
            "Policy Development: National vs local pollution management strategies",
            "Environmental Monitoring: Systematic pollution pattern recognition",
            "Urban Planning: Inform sustainable city development",
        ],
    ),
    (
        "\n🔮 FUTURE SCOPE:",
        [
            "Temporal Analysis: Incorporate seasonal/yearly pollution trends",
            "Socio-Economic Integration: Add demographic and economic indicators",
            "Weather Correlation: Include meteorological factors",
            "Advanced Clustering: Compare with DBSCAN, hierarchical methods",
            "Real-time Monitoring: Dynamic clustering for live air quality data",
        ],
    ),
]

for section_heading, section_bullets in closing_sections:
    print(section_heading)
    for bullet_text in section_bullets:
        print(f"   • {bullet_text}")

print(f"\n{closing_rule}")
print("Thank you for following this comprehensive multi-pollutant clustering analysis!")
print(closing_rule)