In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Making new Directory for Matplotlib plot
output_dir = "matplotlib_plots"
os.makedirs(output_dir, exist_ok=True)

# Read File
df = pd.read_csv('cleaned_daily_air_data.csv')

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

In [3]:
# Helper to save and show plots (works in scripts & notebooks)
def save_fig(fig, name):
    path = os.path.join(output_dir, name)
    fig.savefig(path, bbox_inches='tight', dpi=150)
    print("Saved:", path)

In [4]:
# 1. Line plot: PM2.5 over time
fig, ax = plt.subplots()
ax.plot(df['date'], df['pm2_5'], linewidth=0.8)
ax.set_title("PM2.5 over Time")
ax.set_xlabel("Date")
ax.set_ylabel("PM2.5")
save_fig(fig, "1_line_pm2_5_time.png")
plt.close(fig)

Saved: matplotlib_plots\1_line_pm2_5_time.png


In [5]:
# 2. Line plot: SO2, NO2, PM2.5 together (multi-line)
fig, ax = plt.subplots()
ax.plot(df['date'], df['so2'], label='SO2', linewidth=0.7)
ax.plot(df['date'], df['no2'], label='NO2', linewidth=0.7)
ax.plot(df['date'], df['pm2_5'], label='PM2.5', linewidth=0.7)
ax.legend()
ax.set_title("SO2, NO2, PM2.5 over Time")
ax.set_xlabel("Date")
save_fig(fig, "2_multi_line_gases_time.png")
plt.close(fig)

Saved: matplotlib_plots\2_multi_line_gases_time.png


In [6]:
# 3. Histogram: PM2.5 distribution
fig, ax = plt.subplots()
ax.hist(df['pm2_5'].dropna(), bins=40)
ax.set_title("PM2.5 Distribution")
ax.set_xlabel("PM2.5")
save_fig(fig, "3_hist_pm2_5.png")
plt.close(fig)

Saved: matplotlib_plots\3_hist_pm2_5.png


In [7]:
# 4. Boxplot: distributions of pollutants
fig, ax = plt.subplots()
data_box = [df[col].dropna() for col in ['so2','no2','rspm','spm','pm2_5']]
ax.boxplot(data_box, labels=['SO2','NO2','RSPM','SPM','PM2.5'])
ax.set_title("Boxplots of Pollutants")
save_fig(fig, "4_boxplot_pollutants.png")
plt.close(fig)

Saved: matplotlib_plots\4_boxplot_pollutants.png


In [8]:
# 5. Scatter: NO2 vs PM2.5
fig, ax = plt.subplots()
ax.scatter(df['no2'], df['pm2_5'], s=8, alpha=0.5)
ax.set_xlabel("NO2")
ax.set_ylabel("PM2.5")
ax.set_title("NO2 vs PM2.5")
save_fig(fig, "5_scatter_no2_pm2_5.png")
plt.close(fig)

Saved: matplotlib_plots\5_scatter_no2_pm2_5.png


In [9]:
# 6. Rolling mean (30 days) of PM2.5
df['pm2_5_30d'] = df['pm2_5'].rolling(window=30, min_periods=1).mean()
fig, ax = plt.subplots()
ax.plot(df['date'], df['pm2_5_30d'])
ax.set_title("30-day Rolling Mean of PM2.5")
ax.set_xlabel("Date")
save_fig(fig, "6_rolling_mean_pm2_5.png")
plt.close(fig)

Saved: matplotlib_plots\6_rolling_mean_pm2_5.png


In [10]:
# 7. Heatmap-style: correlation matrix (as image)
corr = df[['so2','no2','rspm','spm','pm2_5']].corr()
fig, ax = plt.subplots()
cax = ax.imshow(corr, interpolation='nearest', aspect='auto')
ax.set_xticks(range(len(corr.columns))); ax.set_xticklabels(corr.columns, rotation=45)
ax.set_yticks(range(len(corr.columns))); ax.set_yticklabels(corr.columns)
fig.colorbar(cax)
ax.set_title("Correlation matrix (pollutants)")
save_fig(fig, "7_corr_matrix.png")
plt.close(fig)

Saved: matplotlib_plots\7_corr_matrix.png


In [11]:
# 8. Bar plot: average pollutant by month
df['month'] = df['date'].dt.to_period('M').astype(str)
monthly = df.groupby('month')[['pm2_5','so2','no2']].mean().reset_index()
fig, ax = plt.subplots(figsize=(10,4))
ax.bar(monthly['month'], monthly['pm2_5'])
ax.set_title("Average PM2.5 by Month")
ax.set_xticklabels(monthly['month'], rotation=45, ha='right')
save_fig(fig, "8_bar_avg_pm2_5_by_month.png")
plt.close(fig)

  ax.set_xticklabels(monthly['month'], rotation=45, ha='right')


Saved: matplotlib_plots\8_bar_avg_pm2_5_by_month.png


In [12]:
# 9. Area plot: stacked pollutants (small sample to avoid readability issues)
sample = df.set_index('date').resample('M')[['so2','no2','pm2_5']].mean().dropna()
fig, ax = plt.subplots()
ax.stackplot(sample.index.astype(str), sample['so2'], sample['no2'], sample['pm2_5'], labels=['SO2','NO2','PM2.5'])
ax.legend(loc='upper left')
ax.set_title("Monthly stacked pollutant contributions")
save_fig(fig, "9_area_stacked_monthly.png")
plt.close(fig)


  sample = df.set_index('date').resample('M')[['so2','no2','pm2_5']].mean().dropna()


Saved: matplotlib_plots\9_area_stacked_monthly.png


In [13]:
# 10. Violin-like using histogram overlay (since matplotlib has no direct violin without seaborn)
fig, ax = plt.subplots()
ax.hist(df['so2'].dropna(), bins=60, alpha=0.6)
ax.set_title("SO2 Histogram (violin-style proxy)")
save_fig(fig, "10_hist_so2.png")
plt.close(fig)

print("Matplotlib plots completed. Check folder:", output_dir)

Saved: matplotlib_plots\10_hist_so2.png
Matplotlib plots completed. Check folder: matplotlib_plots


In [14]:
import seaborn as sns

In [15]:
# Make new Directory For seaborn plot 
output_dir = "seaborn_plots"
os.makedirs(output_dir, exist_ok=True)

df['month'] = df['date'].dt.month_name().str.slice(stop=3) 


In [16]:
# 1. Line plot with seaborn: PM2.5 over time
fig = plt.figure()
sns.lineplot(data=df, x='date', y='pm2_5', linewidth=1)
plt.title("PM2.5 over time (Seaborn)")
plt.xlabel("Date")
plt.ylabel("PM2.5")
plt.savefig(os.path.join(output_dir,"1_line_pm2_5_seaborn.png"), bbox_inches='tight', dpi=150)
plt.close()

In [17]:
# 2. Scatter with regression: NO2 vs PM2.5
fig = plt.figure()
sns.regplot(data=df, x='no2', y='pm2_5', scatter_kws={'s':10, 'alpha':0.5}, line_kws={'linewidth':1})
plt.title("NO2 vs PM2.5 (regression)")
plt.savefig(os.path.join(output_dir,"2_reg_no2_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

In [18]:
# 3. Histogram + KDE: PM2.5
fig = plt.figure()
sns.histplot(df['pm2_5'].dropna(), kde=True, bins=40)
plt.title("PM2.5 Histogram + KDE")
plt.savefig(os.path.join(output_dir,"3_hist_kde_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

In [19]:
# 4. Boxplot: pollutants
fig = plt.figure()
sns.boxplot(data=df[['so2','no2','rspm','spm','pm2_5']].melt(var_name='pollutant', value_name='value'),
            x='pollutant', y='value')
plt.title("Boxplots (Seaborn)")
plt.savefig(os.path.join(output_dir,"4_boxplot_pollutants_seaborn.png"), bbox_inches='tight', dpi=150)
plt.close()

In [20]:
# 5. Violin plot: PM2.5 by month (showing seasonality)
fig = plt.figure(figsize=(12,4))
sns.violinplot(x='month', y='pm2_5', data=df, order=pd.date_range('2000-01-01', periods=12, freq='M').month_name().str.slice(stop=3).tolist())
plt.title("PM2.5 by Month (Violin)")
plt.savefig(os.path.join(output_dir,"5_violin_pm2_5_by_month.png"), bbox_inches='tight', dpi=150)
plt.close()

  sns.violinplot(x='month', y='pm2_5', data=df, order=pd.date_range('2000-01-01', periods=12, freq='M').month_name().str.slice(stop=3).tolist())


In [21]:
# 6. Pairplot: small subset to avoid huge output
small = df[['so2','no2','rspm','pm2_5']].sample(min(1000, len(df)), random_state=1).dropna()
sns.pairplot(small)
plt.savefig(os.path.join(output_dir,"6_pairplot_sample.png"), bbox_inches='tight', dpi=150)
plt.close()


In [22]:
# 7. Heatmap of correlation
fig = plt.figure(figsize=(6,5))
corr = df[['so2','no2','rspm','spm','pm2_5']].corr()
sns.heatmap(corr, annot=True, fmt=".2f")
plt.title("Correlation heatmap (Seaborn)")
plt.savefig(os.path.join(output_dir,"7_heatmap_corr.png"), bbox_inches='tight', dpi=150)
plt.close()

In [23]:
# 8. Boxen plot (more detail) for NO2
fig = plt.figure()
sns.boxenplot(x=df['no2'].dropna())
plt.title("Boxen plot for NO2")
plt.savefig(os.path.join(output_dir,"8_boxen_no2.png"), bbox_inches='tight', dpi=150)
plt.close()

In [24]:
# 9. Jointplot: PM2.5 vs SO2 with KDE
g = sns.jointplot(x='so2', y='pm2_5', data=df, kind='kde')
g.fig.suptitle("Joint KDE: SO2 vs PM2.5", y=1.02)
g.fig.savefig(os.path.join(output_dir,"9_joint_kde_so2_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

In [25]:
# 10. Rug + histogram: PM2.5
fig = plt.figure()
sns.histplot(df['pm2_5'].dropna(), kde=False, bins=40)
sns.rugplot(df['pm2_5'].dropna())
plt.title("Histogram + Rug for PM2.5")
plt.savefig(os.path.join(output_dir,"10_hist_rug_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

print("Seaborn plots completed. Check folder:", output_dir)

Seaborn plots completed. Check folder: seaborn_plots


In [27]:
import plotly.express as px
import plotly.graph_objects as go

In [28]:
# Make new Directory For Plotly plot
output_dir = "plotly_plots"
os.makedirs(output_dir, exist_ok=True)

In [29]:
# 1. Time series line (PM2.5)
fig = px.line(df, x='date', y='pm2_5', title="PM2.5 over Time")
fig.write_html(os.path.join(output_dir,"1_line_pm2_5.html"))
print("Saved:", os.path.join(output_dir,"1_line_pm2_5.html"))

Saved: plotly_plots\1_line_pm2_5.html


In [30]:
# 2. Multi-line
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['date'], y=df['so2'], mode='lines', name='SO2'))
fig.add_trace(go.Scatter(x=df['date'], y=df['no2'], mode='lines', name='NO2'))
fig.add_trace(go.Scatter(x=df['date'], y=df['pm2_5'], mode='lines', name='PM2.5'))
fig.update_layout(title="Multiple Pollutants Over Time")
fig.write_html(os.path.join(output_dir,"2_multi_line.html"))

In [32]:
# 3. Scatter with trendline
fig = px.scatter(df, x='no2', y='pm2_5', trendline='ols', title="NO2 vs PM2.5 with Trendline")
fig.write_html(os.path.join(output_dir,"3_scatter_trend.html"))

In [33]:
# 4. Histogram
fig = px.histogram(df, x='pm2_5', nbins=40, title="PM2.5 Histogram")
fig.write_html(os.path.join(output_dir,"4_hist_pm2_5.html"))

In [34]:
# 5. Box plots
fig = px.box(df.melt(value_vars=['so2','no2','rspm','spm','pm2_5'], var_name='pollutant', value_name='value'),
             x='pollutant', y='value', title="Boxplots for Pollutants")
fig.write_html(os.path.join(output_dir,"5_boxplots.html"))

In [35]:
# 6. Heatmap (correlation)
corr = df[['so2','no2','rspm','spm','pm2_5']].corr()
fig = px.imshow(corr, text_auto=True, title="Correlation Heatmap")
fig.write_html(os.path.join(output_dir,"6_corr_heatmap.html"))


In [36]:
# 7. Violin plot: PM2.5 by month
df['month'] = df['date'].dt.month_name().str.slice(stop=3)
fig = px.violin(df, x='month', y='pm2_5', category_orders={'month': ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']},
                title="PM2.5 by Month (Violin)")
fig.write_html(os.path.join(output_dir,"7_violin_pm2_5_month.html"))


In [37]:
# 8. Pairwise scatter matrix
fig = px.scatter_matrix(df[['so2','no2','rspm','pm2_5']].dropna().sample(min(1000,len(df))),
                        title="Scatter matrix (sample)")
fig.write_html(os.path.join(output_dir,"8_scatter_matrix.html"))

In [38]:
# 9. Rolling mean interactive (30-day)
df['pm2_5_30d'] = df['pm2_5'].rolling(30, min_periods=1).mean()
fig = px.line(df, x='date', y='pm2_5_30d', title="30-day Rolling Mean PM2.5 (Interactive)")
fig.write_html(os.path.join(output_dir,"9_rolling_pm2_5.html"))

In [39]:
# 10. Area (stacked) monthly
monthly = df.set_index('date').resample('M')[['so2','no2','pm2_5']].mean().dropna().reset_index()
fig = go.Figure()
fig.add_trace(go.Scatter(x=monthly['date'], y=monthly['so2'], stackgroup='one', name='SO2'))
fig.add_trace(go.Scatter(x=monthly['date'], y=monthly['no2'], stackgroup='one', name='NO2'))
fig.add_trace(go.Scatter(x=monthly['date'], y=monthly['pm2_5'], stackgroup='one', name='PM2.5'))
fig.update_layout(title="Monthly Stacked Pollutants (Area)")
fig.write_html(os.path.join(output_dir,"10_area_stacked_monthly.html"))

print("Plotly plots saved to:", output_dir)


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



Plotly plots saved to: plotly_plots


In [73]:
from bokeh.plotting import figure, output_file, save, show, reset_output
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, ColorBar, BasicTicker, PrintfTickFormatter
from bokeh.transform import transform
from bokeh.palettes import Viridis256

In [75]:
# make output dir
output_dir = "bokeh_plots"
os.makedirs(output_dir, exist_ok=True)
# drop rows with no numeric values for plotting when necessary
df_numeric = df[['so2', 'no2', 'rspm', 'spm', 'pm2_5', 'date']].copy()

In [77]:
# ----- 1) Time series PM2.5 -----
reset_output()
src = ColumnDataSource(df_numeric.dropna(subset=['date', 'pm2_5']))
p = figure(x_axis_type='datetime', title="PM2.5 over Time", width=900, height=300, tools="pan,wheel_zoom,reset,save")
p.line(x='date', y='pm2_5', source=src, line_width=2, color="steelblue", legend_label="PM2.5")
p.legend.location = "top_left"
output_file(os.path.join(output_dir, "1_time_pm2_5.html"))
save(p)

'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\1_time_pm2_5.html'

In [78]:
# ----- 2) Multi-line pollutants (SO2, NO2, PM2.5) -----
reset_output()
src = ColumnDataSource(df_numeric.dropna(subset=['date','so2','no2','pm2_5']))
p = figure(x_axis_type='datetime', title="SO2, NO2, PM2.5 over Time", width=900, height=300, tools="pan,wheel_zoom,reset,save")
p.line('date', 'so2', source=src, legend_label='SO2', line_color="orange", line_width=1)
p.line('date', 'no2', source=src, legend_label='NO2', line_color="green", line_width=1)
p.line('date', 'pm2_5', source=src, legend_label='PM2.5', line_color="red", line_width=1)
p.legend.location = "top_left"
output_file(os.path.join(output_dir, "2_multi_line.html"))
save(p)


'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\2_multi_line.html'

In [79]:
# ----- 3) Scatter NO2 vs PM2.5 with hover -----
reset_output()
sdf = df_numeric.dropna(subset=['no2','pm2_5'])
src = ColumnDataSource(sdf)
p = figure(title="NO2 vs PM2.5", width=700, height=450, tools="pan,wheel_zoom,reset,save")
# use scatter (size argument ok here)
p.scatter(x='no2', y='pm2_5', source=src, size=8, fill_alpha=0.6, line_alpha=0.6, marker="circle")
hover = HoverTool(tooltips=[("NO2", "@no2{0.2f}"), ("PM2.5", "@pm2_5{0.2f}")])
p.add_tools(hover)
p.xaxis.axis_label = "NO2"
p.yaxis.axis_label = "PM2.5"
output_file(os.path.join(output_dir, "3_scatter_no2_pm2_5.html"))
save(p)

'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\3_scatter_no2_pm2_5.html'

In [80]:
# ----- 4) PM2.5 Histogram -----
reset_output()
vals = df_numeric['pm2_5'].dropna().values
hist, edges = np.histogram(vals, bins=40)
p = figure(title="PM2.5 Histogram", width=700, height=450, tools="pan,wheel_zoom,reset,save")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="navy", line_color="white", alpha=0.7)
p.xaxis.axis_label = "PM2.5"
p.yaxis.axis_label = "Count"
output_file(os.path.join(output_dir, "4_hist_pm2_5.html"))
save(p)

'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\4_hist_pm2_5.html'

In [81]:
# ----- 5) Box-like summary for PM2.5 -----
reset_output()
series = df_numeric['pm2_5'].dropna()
q1, q2, q3 = series.quantile([0.25, 0.5, 0.75])
iqr = q3 - q1
upper = series[series <= (q3 + 1.5*iqr)].max()
lower = series[series >= (q1 - 1.5*iqr)].min()
outliers = series[(series > upper) | (series < lower)]

p = figure(title="Box-like Summary PM2.5", width=400, height=400, x_range=(-1,1), tools="save,reset")
# whiskers
p.segment(0, upper, 0, q3, line_width=2, line_color="black")
p.segment(0, lower, 0, q1, line_width=2, line_color="black")
# boxes
p.vbar(x=0, width=0.6, top=q3, bottom=q2, fill_color="#E08E79", line_color="black")
p.vbar(x=0, width=0.6, top=q2, bottom=q1, fill_color="#3B8686", line_color="black")
# median marker
p.circle(0, q2, size=8, color="black")
# outliers
if len(outliers)>0:
    p.circle(np.zeros(len(outliers)), outliers.values, size=6, color="firebrick", alpha=0.7)
p.xaxis.visible = False
p.yaxis.axis_label = "PM2.5"
output_file(os.path.join(output_dir, "5_box_pm2_5.html"))
save(p)



'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\5_box_pm2_5.html'

In [82]:
# ----- 6) Correlation Heatmap (proper mapping) -----
reset_output()
numeric_cols = ['so2','no2','rspm','spm','pm2_5']
corr = df[numeric_cols].corr()
corr_df = corr.stack().reset_index(name="correlation")
corr_df.columns = ["x", "y", "correlation"]
src_heat = ColumnDataSource(corr_df)

mapper = LinearColorMapper(palette=Viridis256, low=corr_df.correlation.min(), high=corr_df.correlation.max())

p = figure(title="Correlation Heatmap (Bokeh)", x_range=list(corr.columns), y_range=list(reversed(corr.columns)),
           x_axis_location="above", width=700, height=700, tools="hover,save,pan,box_zoom,reset",
           tooltips=[('X','@x'),('Y','@y'),('corr','@correlation{0.2f}')])
p.rect(x="x", y="y", width=1, height=1, source=src_heat, line_color=None, fill_color=transform("correlation", mapper))
color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(desired_num_ticks=9),
                     formatter=PrintfTickFormatter(format="%.2f"),
                     label_standoff=12, border_line_color=None, location=(0,0))
p.add_layout(color_bar, 'right')
output_file(os.path.join(output_dir, "6_corr_heatmap.html"))
save(p)

'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\6_corr_heatmap.html'

In [83]:
# ----- 7) KDE (violin-like) for PM2.5 -----
reset_output()
from scipy.stats import gaussian_kde
vals = df_numeric['pm2_5'].dropna().values
if len(vals) >= 10:
    kde = gaussian_kde(vals)
    xgrid = np.linspace(vals.min(), vals.max(), 300)
    yvals = kde(xgrid)
    # scale the kernel for nicer plotting
    scale = (xgrid.max() - xgrid.min()) * 0.15
    p = figure(title="PM2.5 KDE (violin-like)", width=700, height=350, tools="save,reset")
    p.patch(x=np.concatenate([xgrid, xgrid[::-1]]),
            y=np.concatenate([yvals*scale, -yvals[::-1]*scale]),
            color="orchid", alpha=0.5, line_color="black")
    p.yaxis.visible = False
    p.xaxis.axis_label = "PM2.5"
    output_file(os.path.join(output_dir, "7_kde_pm2_5.html"))
    save(p)
else:
    print("Not enough PM2.5 values for KDE (need >=10).")



In [85]:
monthly = df.set_index('date').resample('ME')['pm2_5'].mean().dropna().reset_index()
src_month = ColumnDataSource(monthly)
p8 = figure(x_axis_type="datetime", title="Average PM2.5 by Month", width=900, height=350)
p8.vbar(x='date', top='pm2_5', width=20*24*60*60*1000, color="dodgerblue", source=src_month)
output_file(os.path.join(output_dir, "8_bar_monthly_pm2_5.html"))
save(p8)

'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\8_bar_monthly_pm2_5.html'

In [86]:
# === 9. Pairwise Scatter Grid ===
sample = df[['so2', 'no2', 'rspm', 'pm2_5']].dropna().sample(min(400, len(df)))
plots = []
for col in sample.columns:
    p = figure(width=250, height=250, title=col)
    p.circle(sample.index, sample[col], size=4, color="teal")
    plots.append(p)
layout = gridplot([plots[i:i+2] for i in range(0, len(plots), 2)])
output_file(os.path.join(output_dir, "9_grid_scatter.html"))
save(layout)



'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\9_grid_scatter.html'

In [87]:
# === 10. Rolling Mean PM2.5 ===
df['pm2_5_30d'] = df['pm2_5'].rolling(30, min_periods=1).mean()
p10 = figure(x_axis_type="datetime", title="30-Day Rolling Mean PM2.5", width=900, height=300)
p10.line('date', 'pm2_5_30d', line_width=2, color="crimson", source=ColumnDataSource(df))
output_file(os.path.join(output_dir, "10_rolling_mean.html"))
save(p10)

print("✅ All 10 Bokeh plots successfully saved to:", output_dir)

✅ All 10 Bokeh plots successfully saved to: bokeh_plots
