In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Making new Directory for Matplotlib plot
output_dir = "matplotlib_plots"
os.makedirs(output_dir, exist_ok=True)

# Read File
df = pd.read_csv('cleaned_daily_air_data.csv')

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

In [3]:
# Helper to save and show plots (works in scripts & notebooks)
def save_fig(fig, name):
    path = os.path.join(output_dir, name)
    fig.savefig(path, bbox_inches='tight', dpi=150)
    print("Saved:", path)

In [4]:
# 1. Line plot: PM2.5 over time
fig, ax = plt.subplots()
ax.plot(df['date'], df['pm2_5'], linewidth=0.8)
ax.set_title("PM2.5 over Time")
ax.set_xlabel("Date")
ax.set_ylabel("PM2.5")
save_fig(fig, "1_line_pm2_5_time.png")
plt.close(fig)

Saved: matplotlib_plots\1_line_pm2_5_time.png


In [5]:
# 2. Line plot: SO2, NO2, PM2.5 together (multi-line)
fig, ax = plt.subplots()
ax.plot(df['date'], df['so2'], label='SO2', linewidth=0.7)
ax.plot(df['date'], df['no2'], label='NO2', linewidth=0.7)
ax.plot(df['date'], df['pm2_5'], label='PM2.5', linewidth=0.7)
ax.legend()
ax.set_title("SO2, NO2, PM2.5 over Time")
ax.set_xlabel("Date")
save_fig(fig, "2_multi_line_gases_time.png")
plt.close(fig)

Saved: matplotlib_plots\2_multi_line_gases_time.png


In [6]:
# 3. Histogram: PM2.5 distribution
fig, ax = plt.subplots()
ax.hist(df['pm2_5'].dropna(), bins=40)
ax.set_title("PM2.5 Distribution")
ax.set_xlabel("PM2.5")
save_fig(fig, "3_hist_pm2_5.png")
plt.close(fig)

Saved: matplotlib_plots\3_hist_pm2_5.png


In [7]:
# 4. Boxplot: distributions of pollutants
fig, ax = plt.subplots()
data_box = [df[col].dropna() for col in ['so2','no2','rspm','spm','pm2_5']]
ax.boxplot(data_box, labels=['SO2','NO2','RSPM','SPM','PM2.5'])
ax.set_title("Boxplots of Pollutants")
save_fig(fig, "4_boxplot_pollutants.png")
plt.close(fig)

Saved: matplotlib_plots\4_boxplot_pollutants.png


In [8]:
# 5. Scatter: NO2 vs PM2.5
fig, ax = plt.subplots()
ax.scatter(df['no2'], df['pm2_5'], s=8, alpha=0.5)
ax.set_xlabel("NO2")
ax.set_ylabel("PM2.5")
ax.set_title("NO2 vs PM2.5")
save_fig(fig, "5_scatter_no2_pm2_5.png")
plt.close(fig)

Saved: matplotlib_plots\5_scatter_no2_pm2_5.png


In [9]:
# 6. Rolling mean (30 days) of PM2.5
df['pm2_5_30d'] = df['pm2_5'].rolling(window=30, min_periods=1).mean()
fig, ax = plt.subplots()
ax.plot(df['date'], df['pm2_5_30d'])
ax.set_title("30-day Rolling Mean of PM2.5")
ax.set_xlabel("Date")
save_fig(fig, "6_rolling_mean_pm2_5.png")
plt.close(fig)

Saved: matplotlib_plots\6_rolling_mean_pm2_5.png


In [10]:
# 7. Heatmap-style: correlation matrix (as image)
corr = df[['so2','no2','rspm','spm','pm2_5']].corr()
fig, ax = plt.subplots()
cax = ax.imshow(corr, interpolation='nearest', aspect='auto')
ax.set_xticks(range(len(corr.columns))); ax.set_xticklabels(corr.columns, rotation=45)
ax.set_yticks(range(len(corr.columns))); ax.set_yticklabels(corr.columns)
fig.colorbar(cax)
ax.set_title("Correlation matrix (pollutants)")
save_fig(fig, "7_corr_matrix.png")
plt.close(fig)

Saved: matplotlib_plots\7_corr_matrix.png


In [11]:
# 8. Bar plot: average pollutant by month
df['month'] = df['date'].dt.to_period('M').astype(str)
monthly = df.groupby('month')[['pm2_5','so2','no2']].mean().reset_index()
fig, ax = plt.subplots(figsize=(10,4))
ax.bar(monthly['month'], monthly['pm2_5'])
ax.set_title("Average PM2.5 by Month")
ax.set_xticklabels(monthly['month'], rotation=45, ha='right')
save_fig(fig, "8_bar_avg_pm2_5_by_month.png")
plt.close(fig)

  ax.set_xticklabels(monthly['month'], rotation=45, ha='right')


Saved: matplotlib_plots\8_bar_avg_pm2_5_by_month.png


In [12]:
# 9. Area plot: stacked pollutants (small sample to avoid readability issues)
sample = df.set_index('date').resample('M')[['so2','no2','pm2_5']].mean().dropna()
fig, ax = plt.subplots()
ax.stackplot(sample.index.astype(str), sample['so2'], sample['no2'], sample['pm2_5'], labels=['SO2','NO2','PM2.5'])
ax.legend(loc='upper left')
ax.set_title("Monthly stacked pollutant contributions")
save_fig(fig, "9_area_stacked_monthly.png")
plt.close(fig)


  sample = df.set_index('date').resample('M')[['so2','no2','pm2_5']].mean().dropna()


Saved: matplotlib_plots\9_area_stacked_monthly.png


In [13]:
# 10. Violin-like using histogram overlay (since matplotlib has no direct violin without seaborn)
fig, ax = plt.subplots()
ax.hist(df['so2'].dropna(), bins=60, alpha=0.6)
ax.set_title("SO2 Histogram (violin-style proxy)")
save_fig(fig, "10_hist_so2.png")
plt.close(fig)

print("Matplotlib plots completed. Check folder:", output_dir)

Saved: matplotlib_plots\10_hist_so2.png
Matplotlib plots completed. Check folder: matplotlib_plots


In [14]:
import seaborn as sns

In [15]:
# Make new Directory For seaborn plot 
output_dir = "seaborn_plots"
os.makedirs(output_dir, exist_ok=True)

df['month'] = df['date'].dt.month_name().str.slice(stop=3) 


In [16]:
# 1. Line plot with seaborn: PM2.5 over time
fig = plt.figure()
sns.lineplot(data=df, x='date', y='pm2_5', linewidth=1)
plt.title("PM2.5 over time (Seaborn)")
plt.xlabel("Date")
plt.ylabel("PM2.5")
plt.savefig(os.path.join(output_dir,"1_line_pm2_5_seaborn.png"), bbox_inches='tight', dpi=150)
plt.close()

In [17]:
# 2. Scatter with regression: NO2 vs PM2.5
fig = plt.figure()
sns.regplot(data=df, x='no2', y='pm2_5', scatter_kws={'s':10, 'alpha':0.5}, line_kws={'linewidth':1})
plt.title("NO2 vs PM2.5 (regression)")
plt.savefig(os.path.join(output_dir,"2_reg_no2_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

In [18]:
# 3. Histogram + KDE: PM2.5
fig = plt.figure()
sns.histplot(df['pm2_5'].dropna(), kde=True, bins=40)
plt.title("PM2.5 Histogram + KDE")
plt.savefig(os.path.join(output_dir,"3_hist_kde_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

In [19]:
# 4. Boxplot: pollutants
fig = plt.figure()
sns.boxplot(data=df[['so2','no2','rspm','spm','pm2_5']].melt(var_name='pollutant', value_name='value'),
            x='pollutant', y='value')
plt.title("Boxplots (Seaborn)")
plt.savefig(os.path.join(output_dir,"4_boxplot_pollutants_seaborn.png"), bbox_inches='tight', dpi=150)
plt.close()

In [20]:
# 5. Violin plot: PM2.5 by month (showing seasonality)
fig = plt.figure(figsize=(12,4))
sns.violinplot(x='month', y='pm2_5', data=df, order=pd.date_range('2000-01-01', periods=12, freq='M').month_name().str.slice(stop=3).tolist())
plt.title("PM2.5 by Month (Violin)")
plt.savefig(os.path.join(output_dir,"5_violin_pm2_5_by_month.png"), bbox_inches='tight', dpi=150)
plt.close()

  sns.violinplot(x='month', y='pm2_5', data=df, order=pd.date_range('2000-01-01', periods=12, freq='M').month_name().str.slice(stop=3).tolist())


In [21]:
# 6. Pairplot: small subset to avoid huge output
small = df[['so2','no2','rspm','pm2_5']].sample(min(1000, len(df)), random_state=1).dropna()
sns.pairplot(small)
plt.savefig(os.path.join(output_dir,"6_pairplot_sample.png"), bbox_inches='tight', dpi=150)
plt.close()


In [22]:
# 7. Heatmap of correlation
fig = plt.figure(figsize=(6,5))
corr = df[['so2','no2','rspm','spm','pm2_5']].corr()
sns.heatmap(corr, annot=True, fmt=".2f")
plt.title("Correlation heatmap (Seaborn)")
plt.savefig(os.path.join(output_dir,"7_heatmap_corr.png"), bbox_inches='tight', dpi=150)
plt.close()

In [23]:
# 8. Boxen plot (more detail) for NO2
fig = plt.figure()
sns.boxenplot(x=df['no2'].dropna())
plt.title("Boxen plot for NO2")
plt.savefig(os.path.join(output_dir,"8_boxen_no2.png"), bbox_inches='tight', dpi=150)
plt.close()

In [24]:
# 9. Jointplot: PM2.5 vs SO2 with KDE
g = sns.jointplot(x='so2', y='pm2_5', data=df, kind='kde')
g.fig.suptitle("Joint KDE: SO2 vs PM2.5", y=1.02)
g.fig.savefig(os.path.join(output_dir,"9_joint_kde_so2_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

In [25]:
# 10. Rug + histogram: PM2.5
fig = plt.figure()
sns.histplot(df['pm2_5'].dropna(), kde=False, bins=40)
sns.rugplot(df['pm2_5'].dropna())
plt.title("Histogram + Rug for PM2.5")
plt.savefig(os.path.join(output_dir,"10_hist_rug_pm2_5.png"), bbox_inches='tight', dpi=150)
plt.close()

print("Seaborn plots completed. Check folder:", output_dir)

Seaborn plots completed. Check folder: seaborn_plots


In [27]:
import plotly.express as px
import plotly.graph_objects as go

In [28]:
# Make new Directory For Plotly plot
output_dir = "plotly_plots"
os.makedirs(output_dir, exist_ok=True)

In [29]:
# 1. Time series line (PM2.5)
fig = px.line(df, x='date', y='pm2_5', title="PM2.5 over Time")
fig.write_html(os.path.join(output_dir,"1_line_pm2_5.html"))
print("Saved:", os.path.join(output_dir,"1_line_pm2_5.html"))

Saved: plotly_plots\1_line_pm2_5.html


In [30]:
# 2. Multi-line
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['date'], y=df['so2'], mode='lines', name='SO2'))
fig.add_trace(go.Scatter(x=df['date'], y=df['no2'], mode='lines', name='NO2'))
fig.add_trace(go.Scatter(x=df['date'], y=df['pm2_5'], mode='lines', name='PM2.5'))
fig.update_layout(title="Multiple Pollutants Over Time")
fig.write_html(os.path.join(output_dir,"2_multi_line.html"))

In [32]:
# 3. Scatter with trendline
fig = px.scatter(df, x='no2', y='pm2_5', trendline='ols', title="NO2 vs PM2.5 with Trendline")
fig.write_html(os.path.join(output_dir,"3_scatter_trend.html"))

In [33]:
# 4. Histogram
fig = px.histogram(df, x='pm2_5', nbins=40, title="PM2.5 Histogram")
fig.write_html(os.path.join(output_dir,"4_hist_pm2_5.html"))

In [34]:
# 5. Box plots
fig = px.box(df.melt(value_vars=['so2','no2','rspm','spm','pm2_5'], var_name='pollutant', value_name='value'),
             x='pollutant', y='value', title="Boxplots for Pollutants")
fig.write_html(os.path.join(output_dir,"5_boxplots.html"))

In [35]:
# 6. Heatmap (correlation)
corr = df[['so2','no2','rspm','spm','pm2_5']].corr()
fig = px.imshow(corr, text_auto=True, title="Correlation Heatmap")
fig.write_html(os.path.join(output_dir,"6_corr_heatmap.html"))


In [36]:
# 7. Violin plot: PM2.5 by month
df['month'] = df['date'].dt.month_name().str.slice(stop=3)
fig = px.violin(df, x='month', y='pm2_5', category_orders={'month': ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']},
                title="PM2.5 by Month (Violin)")
fig.write_html(os.path.join(output_dir,"7_violin_pm2_5_month.html"))


In [37]:
# 8. Pairwise scatter matrix
fig = px.scatter_matrix(df[['so2','no2','rspm','pm2_5']].dropna().sample(min(1000,len(df))),
                        title="Scatter matrix (sample)")
fig.write_html(os.path.join(output_dir,"8_scatter_matrix.html"))

In [38]:
# 9. Rolling mean interactive (30-day)
df['pm2_5_30d'] = df['pm2_5'].rolling(30, min_periods=1).mean()
fig = px.line(df, x='date', y='pm2_5_30d', title="30-day Rolling Mean PM2.5 (Interactive)")
fig.write_html(os.path.join(output_dir,"9_rolling_pm2_5.html"))

In [39]:
# 10. Area (stacked) monthly
monthly = df.set_index('date').resample('M')[['so2','no2','pm2_5']].mean().dropna().reset_index()
fig = go.Figure()
fig.add_trace(go.Scatter(x=monthly['date'], y=monthly['so2'], stackgroup='one', name='SO2'))
fig.add_trace(go.Scatter(x=monthly['date'], y=monthly['no2'], stackgroup='one', name='NO2'))
fig.add_trace(go.Scatter(x=monthly['date'], y=monthly['pm2_5'], stackgroup='one', name='PM2.5'))
fig.update_layout(title="Monthly Stacked Pollutants (Area)")
fig.write_html(os.path.join(output_dir,"10_area_stacked_monthly.html"))

print("Plotly plots saved to:", output_dir)


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



Plotly plots saved to: plotly_plots


In [41]:
from bokeh.plotting import figure, output_file, save, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, HoverTool

In [42]:
# Make new Directory For Bokeh plot
output_dir = "bokeh_plots"
os.makedirs(output_dir, exist_ok=True)

df['month'] = df['date'].dt.month_name().str.slice(stop=3)
source = ColumnDataSource(df)

In [43]:
# 1. Time series PM2.5
p = figure(x_axis_type='datetime', title="PM2.5 over Time", width=900, height=300)
p.line('date', 'pm2_5', source=source)
output_file(os.path.join(output_dir,"1_time_pm2_5.html"))
save(p)

'D:\\Internship\\MILESTONE_1\\Day_15\\bokeh_plots\\1_time_pm2_5.html'

In [46]:
# 2. Multi-line pollutants
mp = figure(x_axis_type='datetime', title="SO2, NO2, PM2.5 over Time", width=900, height=300)
mp.line('date', 'so2', source=source, legend_label='SO2')
mp.line('date', 'no2', source=source, legend_label='NO2')
mp.line('date', 'pm2_5', source=source, legend_label='PM2.5')
mp.legend.location = "top_left"
output_file(os.path.join(output_dir,"2_multi_line.html"))
save(mp)

RuntimeError: Models must be owned by only a single document, Selection(id='p1007', ...) is already in a doc