In [2]:
# Import the necessary libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact, SelectMultiple
import plotly.io as pio

# Set a default template
pio.templates.default = "plotly_white"

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# Load the data from the 'city_day.csv', we use exception in case it fails to read

try:
    df = pd.read_csv("city_day.csv")
    print("Data loaded successfully!")
except FileNotFoundError:
    print("Error: city_data.csv was not found")

# Display some information about the data set and we also print the first 5 rows
if 'df' in locals():
    print(df.info())
    display(df.head())

Data loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB
None


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [None]:
# 1. Convert the 'Date' column to DateTime objects for easier use and manipulation later

df['Date'] = pd.to_datetime(df['Date'])

# 2. Check for missing values
print(df.isnull().sum())

# There are a lot of missing values, however since we are only interested in PM2.5 levels, we will fix that only!

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64


In [5]:
# 2. Handling missing PM2.5 Values
# First, we sort by City and Date
df = df.sort_values(by=['City','Date'])

# Group by 'City' anf forward-fill missing PM2.5 values
df['PM2.5'] = df.groupby('City')['PM2.5'].ffill()

# 3. Now, we will drop any other remaining null values (for example if the start value was missing ffill wouldnt handle it)
df_cleaned = df.dropna(subset=['PM2.5','City'])

print(f"Missing PM2.5 values after cleaning: {df_cleaned['PM2.5'].isnull().sum()}")
print("Data cleaning complete.")
display(df_cleaned.head())

Missing PM2.5 values after cleaning: 0
Data cleaning complete.


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
27,Ahmedabad,2015-01-28,73.24,,5.72,21.11,25.84,,5.72,36.52,62.42,0.03,0.01,1.41,,
28,Ahmedabad,2015-01-29,83.13,,6.93,28.71,33.72,,6.93,49.52,59.76,0.02,0.0,3.14,209.0,Poor
29,Ahmedabad,2015-01-30,79.84,,13.85,28.68,41.08,,13.85,48.49,97.07,0.04,0.0,4.81,328.0,Very Poor
30,Ahmedabad,2015-01-31,94.52,,24.39,32.66,52.61,,24.39,67.39,111.33,0.24,0.01,7.67,514.0,Severe
31,Ahmedabad,2015-02-01,135.99,,43.48,42.08,84.57,,43.48,75.23,102.7,0.4,0.04,25.87,782.0,Severe


In [6]:
# To get a high-level overview, we will create a box plot, to help us estimate median PM2.5 level
# and the spread of data for each city.

# Create a box plot
fig_box = px.box(df_cleaned,
                 x='City',
                 y='PM2.5',
                 title='Overall PM2.5 Distribution by City (2015 - 2020)',
                 labels={'PM2.5': 'PM2.5 (ug / m3)'})

# Ordering the cities by their median PM2.5 value for easier comparision
fig_box.update_layout(xaxis={'categoryorder': 'total descending'}, height=600)

fig_box.show()

In [None]:
# Now, we create the main interactive plot, we will use monthly averages instead of daily data (to reduce alot of noice).
# Using ipywidgets to create a dropdown to allow user interactivity.

# Resampling the data by month
# Group by City, then resample by Month and find the mean
df_monthly = df_cleaned.set_index('Date').groupby('City').resample('M')['PM2.5'].mean().reset_index()

print("Monthly average data created successfully!")
display(df_monthly.head())

Monthly average data created successfully!



'M' is deprecated and will be removed in a future version, please use 'ME' instead.



Unnamed: 0,City,Date,PM2.5
0,Ahmedabad,2015-01-31,82.6825
1,Ahmedabad,2015-02-28,109.339643
2,Ahmedabad,2015-03-31,112.486774
3,Ahmedabad,2015-04-30,101.682
4,Ahmedabad,2015-05-31,74.919355


In [None]:
# Get a sorted list of unique city names for the dropdown
city_list = sorted(df_monthly['City'].unique())

# Now, we define the interactive plotting functiion
@interact(cities=SelectMultiple(options=city_list,
                                value=['Delhi','Mumbai'],
                                description="Select Cities:"))

def plot_city_comparision(cities):
    if not cities:
        print("Please select atleast one city to visualize!")
        return
    # Filter the monthly dataframe based on the selected cities by user
    plot_data = df_monthly[df_monthly['City'].isin(cities)]

    # Create an interactive plot with plotly express
    fig = px.line(plot_data,
                  x='Date',
                  y='PM2.5',
                  color='City', # This automatically creates separate lines and also a legend
                  title="Monthly Average PM2.5 levels",
                  labels={'PM2.5': 'Monthly Average PM2.5 (ug / m3)'})
    fig.update_layout(height=500)
    fig.show()

interactive(children=(SelectMultiple(description='Select Cities:', index=(10, 20), options=('Ahmedabad', 'Aiza…

In [9]:
# To better understand when pollution is at worst, a heatmap will be very effective. We can plot the average PM2.5 level
# for each month against each year. This will reveal the seasonal patterns.

# Extract 'Year' and 'Month' from the date
df_cleaned['Year'] = df_cleaned['Date'].dt.year
df_cleaned['Month'] = df_cleaned['Date'].dt.month_name()

# Now, aggregate all data for the heatmap
df_heatmap = df_cleaned.groupby(['Year','Month'])['PM2.5'].mean().reset_index()

# Define the correct order for months (according to calender)
month_order = ['January','February','March','April','May','June','July','August','September','October','November','December']

# Pivot the data to create a matrix: Years vs Months
heatmap_pivot = df_heatmap.pivot(index='Year',columns='Month',values='PM2.5')

# Reorder the columns to follow the calender
heatmap_pivot = heatmap_pivot[month_order]

# Create the heatmap
fig_heatmap = px.imshow(heatmap_pivot,
                        title='Average PM2.5 levels by month and year',
                        labels=dict(x="Month", y="Year",color="Avg PM2.5"),
                        color_continuous_scale='YlOrRd' ) # yellow -> orange -> red

fig_heatmap.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
# Creating a pollutant correaltion heatmap to see whether 1 polutant affects the other pollutant aswell
pollutant_df= df_cleaned[['PM2.5','PM10','NO','NO2','SO2','CO','O3','AQI']]

# Calculate the correlation
corr_matrix = pollutant_df.corr()

# Plot the heatmap
fig_corr = px.imshow(corr_matrix,
                     text_auto=True,
                     title='Pollutants correlation heatmap',
                     color_continuous_scale='RdBu_r',
                     zmin=-1,
                     zmax=1)

fig_corr.show()
