# Loading and Cleaning/Preparing the Data

In [13]:
import pandas as pd

csv_file = "openaq.csv"

openaq_df = pd.read_csv(csv_file, sep=';', low_memory=False)

print("Columns in dataset:", openaq_df.columns.tolist())
print(openaq_df.head())

Columns in dataset: ['Country Code', 'City', 'Location', 'Coordinates', 'Pollutant', 'Source Name', 'Unit', 'Value', 'Last Updated', 'Country Label']
  Country Code City Location                             Coordinates  \
0           CN  NaN    市八十六中             23.1047, 113.43319999999999   
1           CN  NaN     市农科院                       21.9508, 108.6553   
2           CN  NaN     市发改委                       29.8454, 114.3107   
3           CN  NaN       市委  30.457600000000003, 106.63030000000002   
4           CN  NaN     市委党校            27.731400000000004, 112.0194   

  Pollutant   Source Name   Unit  Value               Last Updated  \
0        O3  ChinaAQIData  µg/m³   36.0  2021-08-09T12:00:00+01:00   
1       SO2  ChinaAQIData  µg/m³    7.0  2020-12-31T16:00:00+00:00   
2     PM2.5  ChinaAQIData  µg/m³   26.0  2021-08-09T12:00:00+01:00   
3        O3  ChinaAQIData  µg/m³   91.0  2021-08-09T12:00:00+01:00   
4       NO2  ChinaAQIData  µg/m³   19.0  2021-08-09T12:00:00+01:00 

In [14]:
print("Unique countries in dataset:")
print(openaq_df['Country Label'].unique())

Unique countries in dataset:
['China' 'Colombia' 'Cyprus' 'Czech Republic' 'Germany' 'Denmark'
 'Ecuador' 'Estonia' 'Spain' 'Finland' 'France' 'United Kingdom' 'Greece'
 'Hong Kong, China' 'Korea, Republic of' 'Lithuania' 'Luxembourg' 'Latvia'
 'Montenegro' 'Macedonia, The former Yugoslav Rep. of' 'Mongolia' 'Malta'
 'Mexico' 'Japan' 'Netherlands' 'Norway' 'Nepal' 'Peru' 'Poland' 'India'
 'Iraq' 'Iceland' 'Italy' 'Croatia' 'Hungary' 'Israel' 'Kyrgyzstan'
 'Taiwan, China' 'United States' 'Thailand' 'Turkey' 'Serbia'
 'Russian Federation' 'Sweden' 'Singapore' 'Slovenia' 'Slovakia'
 'South Africa' nan 'West Bank and Gaza Strip' 'Portugal' 'Romania'
 'Austria' 'Australia' 'Canada' 'Switzerland' 'Chile'
 'Bosnia and Herzegovina' 'Belgium' 'Andorra' 'United Arab Emirates'
 'Argentina' 'Bulgaria' 'Brazil' 'Ghana' 'Indonesia' 'Ireland' 'Kenya'
 'Trinidad and Tobago' 'New Zealand' 'Chad' 'Puerto Rico' 'Qatar' 'Egypt'
 'Serbia and Montenegro' 'Gibraltar' 'Jordan' 'Saudi Arabia' 'Uzbekistan'
 "La

In [15]:
print("Number of unique cities:", openaq_df['City'].nunique())

print("Number of NaN city values:", openaq_df['City'].isna().sum())

print("\nUnique cities in dataset:")
print(openaq_df['City'].unique())

Number of unique cities: 4464
Number of NaN city values: 29146

Unique cities in dataset:
[nan 'Medellin' 'Αγία Μαρίνα Ξυλιάτου - Σταθμός Υποβάθρου' ... 'Svalöv'
 'LJ Bežigrad' 'Złockie']


In [16]:
print("Unique pollutants in dataset:")
print(openaq_df['Pollutant'].unique())

Unique pollutants in dataset:
['O3' 'SO2' 'PM2.5' 'NO2' 'CO' 'PM10' 'NO' 'PM1' 'RELATIVEHUMIDITY'
 'TEMPERATURE' 'NOX' 'UM003' 'BC']


In [17]:
# First convert to datetime if not done already
openaq_df['Last Updated'] = pd.to_datetime(openaq_df['Last Updated'], errors='coerce', utc=True)

# Check date range
print("Earliest date:", openaq_df['Last Updated'].min())
print("Latest date:", openaq_df['Last Updated'].max())

# List all years available
print("Years available in dataset:")
print(openaq_df['Last Updated'].dt.year.unique())

Earliest date: 2014-03-13 12:00:00+00:00
Latest date: 2025-01-31 23:00:00+00:00
Years available in dataset:
[2021 2020 2024 2022 2025 2019 2023 2018 2016 2017 2014 2015]


In [18]:
target_cities = ["Dublin", "London", "Paris", "Delhi", "Beijing"]

for city in target_cities:
    matches = openaq_df['City'].str.contains(city, case=False, na=False)
    print(f"{city}: {matches.sum()} rows")


Dublin: 8 rows
London: 45 rows
Paris: 48 rows
Delhi: 0 rows
Beijing: 2 rows


In [19]:
openaq_df[["Latitude", "Longitude"]] = (openaq_df["Coordinates"].str.split(",", expand=True).astype(float))

print(openaq_df.head())

  Country Code City Location                             Coordinates  \
0           CN  NaN    市八十六中             23.1047, 113.43319999999999   
1           CN  NaN     市农科院                       21.9508, 108.6553   
2           CN  NaN     市发改委                       29.8454, 114.3107   
3           CN  NaN       市委  30.457600000000003, 106.63030000000002   
4           CN  NaN     市委党校            27.731400000000004, 112.0194   

  Pollutant   Source Name   Unit  Value              Last Updated  \
0        O3  ChinaAQIData  µg/m³   36.0 2021-08-09 11:00:00+00:00   
1       SO2  ChinaAQIData  µg/m³    7.0 2020-12-31 16:00:00+00:00   
2     PM2.5  ChinaAQIData  µg/m³   26.0 2021-08-09 11:00:00+00:00   
3        O3  ChinaAQIData  µg/m³   91.0 2021-08-09 11:00:00+00:00   
4       NO2  ChinaAQIData  µg/m³   19.0 2021-08-09 11:00:00+00:00   

  Country Label  Latitude  Longitude  
0         China   23.1047   113.4332  
1         China   21.9508   108.6553  
2         China   29.8454   114.310

In [20]:
openaq_df["City"] = openaq_df["City"].fillna("Unknown")

print(openaq_df.head())

  Country Code     City Location                             Coordinates  \
0           CN  Unknown    市八十六中             23.1047, 113.43319999999999   
1           CN  Unknown     市农科院                       21.9508, 108.6553   
2           CN  Unknown     市发改委                       29.8454, 114.3107   
3           CN  Unknown       市委  30.457600000000003, 106.63030000000002   
4           CN  Unknown     市委党校            27.731400000000004, 112.0194   

  Pollutant   Source Name   Unit  Value              Last Updated  \
0        O3  ChinaAQIData  µg/m³   36.0 2021-08-09 11:00:00+00:00   
1       SO2  ChinaAQIData  µg/m³    7.0 2020-12-31 16:00:00+00:00   
2     PM2.5  ChinaAQIData  µg/m³   26.0 2021-08-09 11:00:00+00:00   
3        O3  ChinaAQIData  µg/m³   91.0 2021-08-09 11:00:00+00:00   
4       NO2  ChinaAQIData  µg/m³   19.0 2021-08-09 11:00:00+00:00   

  Country Label  Latitude  Longitude  
0         China   23.1047   113.4332  
1         China   21.9508   108.6553  
2         C

# Data Aggregation for Dashboard

In [21]:
country_pollution = openaq_df.groupby(["Country Label", "Pollutant"], as_index = False).agg(avg_value = ("Value", "mean"))

print(country_pollution)

                Country Label Pollutant   avg_value
0                 Afghanistan     PM2.5 -431.500000
1                     Algeria     PM2.5   14.000000
2                     Andorra        CO  500.000000
3                     Andorra        NO   16.900000
4                     Andorra       NO2   36.500000
..                        ...       ...         ...
602                  Viet Nam     PM2.5   26.000000
603  West Bank and Gaza Strip        CO    0.433333
604  West Bank and Gaza Strip       NO2    0.018467
605  West Bank and Gaza Strip        O3    0.010167
606  West Bank and Gaza Strip       SO2    0.000267

[607 rows x 3 columns]


In [22]:
openaq_df["Date"] = openaq_df["Last Updated"].dt.date

time_pollution = openaq_df.groupby(["Date", "Pollutant"], as_index = False).agg(avg_value = ("Value", "mean"))

print(time_pollution)

            Date Pollutant   avg_value
0     2014-03-13     PM2.5    5.000000
1     2014-08-12     PM2.5    7.100000
2     2015-08-21     PM2.5   36.900000
3     2016-02-09     PM2.5   10.000000
4     2016-02-16        CO  740.700000
...          ...       ...         ...
6038  2025-01-31        O3   31.038400
6039  2025-01-31       PM1   10.196597
6040  2025-01-31      PM10   15.021268
6041  2025-01-31     PM2.5    3.292397
6042  2025-01-31       SO2    4.446568

[6043 rows x 3 columns]


# Interactive Dashboard

In [23]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

pollutants = ["PM2.5", "PM10", "NO2", "SO2", "O3", "CO"]
openaq_df = openaq_df[openaq_df["Pollutant"].isin(pollutants)]

app = dash.Dash(__name__)
app.title = "Global Air Quality Dashboard"

# Layout

app.layout = html.Div([
    html.H1("Global Air Quality Dashboard", style = {"textAlign": "center"}),

    dcc.Dropdown(
        id = "pollutant dropdown",
        options = [{"label": p, "value": p} for p in pollutants],
        value = "PM2.5",
        clearable = False
    ),

    dcc.Graph(id = "map graph"),
    dcc.Graph(id = "time series graph")
])

# Callbacks

@app.callback(
    [Output("map graph", "figure"),
     Output("time series graph", "figure")],
    Input("pollutant dropdown", "value")
)

def updated_graphs(selected_pollutant):
    filtered = openaq_df[openaq_df["Pollutant"] == selected_pollutant]

    map_fig = px.scatter_geo(
        filtered,
        lat="Latitude",
        lon="Longitude",
        color="Value",
        hover_name="City",
        hover_data=["Country Label", "Value"],
        title=f"{selected_pollutant} Concentration Around the World",
        color_continuous_scale="Reds"
    )

    time_df = (
        filtered.groupby("Date", as_index=False)
                .agg(avg_value=("Value", "mean"))
    )

    time_fig = px.line(
        time_df,
        x="Date",
        y="avg_value",
        title=f"Average {selected_pollutant} Over Time"
    )

    return map_fig, time_fig

if __name__ == "__main__":
    app.run(debug=True)