In [1]:
# AQI_and_Prominent_Plots_notebook.py
# Ready-to-run notebook cells (paste into a .ipynb code cell) that:
# - auto-detects your calendar-style AQI sheets for 2023/2024/2025
# - melts Day x Month -> Date rows
# - reads prominent-pollutant sheets (auto-detected by sheet name)
# - produces and saves all requested PNG charts
# Requirements: pandas, numpy, matplotlib

# ---------------------- CELL 1: Imports & file discovery ----------------------
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import calendar

DATA_DIR = 'D:\Downloads\FieldProjectAQI_Data'
# Pattern-match files you uploaded; adjust if your filenames differ
aqi_files = sorted(glob.glob(os.path.join(DATA_DIR, '*mumbai*202*.xlsx')))
prom_files = sorted(glob.glob(os.path.join(DATA_DIR, 'Prominent*202*.xlsx')) +
                    glob.glob(os.path.join(DATA_DIR, 'prominent*202*.xlsx')) +
                    glob.glob(os.path.join(DATA_DIR, '*prominent*202*.xlsx')))

print('AQI files found:', aqi_files)
print('Prominent pollutant files found:', prom_files)

# ---------------------- CELL 2: Helpers to melt calendar-style AQI ----------------------

def melt_calendar_aqi(path, year=None):
    """Read a calendar-style sheet (columns: Day, January..December) and return long DataFrame.
    If the file contains multiple sheets, the function chooses the first sheet containing 'AQI' or uses sheet 0.
    """
    xls = pd.ExcelFile(path)
    # choose sheet containing 'AQI' (case-insensitive) else first sheet
    sheet = None
    for s in xls.sheet_names:
        if 'aqi' in s.lower():
            sheet = s
            break
    if sheet is None:
        sheet = xls.sheet_names[0]
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.rename(columns=lambda c: str(c).strip())
    if 'Day' not in df.columns:
        raise ValueError(f"Expected column 'Day' in {path} (sheet: {sheet}). Columns: {df.columns.tolist()}")
    month_cols = [c for c in df.columns if str(c).strip().lower() != 'day']
    long = df.melt(id_vars=['Day'], value_vars=month_cols, var_name='Month', value_name='AQI')
    long = long.dropna(subset=['AQI'])
    long['Day'] = pd.to_numeric(long['Day'], errors='coerce').astype('Int64')
    month_map = {calendar.month_name[i]: i for i in range(1,13)}
    month_map.update({calendar.month_abbr[i]: i for i in range(1,13)})
    long['MonthNum'] = long['Month'].map(lambda x: month_map.get(str(x).strip(), np.nan))
    # If year not provided, try to parse from filename
    if year is None:
        import re
        m = re.search(r"(20\d{2})", os.path.basename(path))
        year = int(m.group(1)) if m else None
    if year is None:
        raise ValueError('Unable to determine year for ' + path)
    def make_date(row):
        try:
            return pd.Timestamp(year=year, month=int(row['MonthNum']), day=int(row['Day']))
        except Exception:
            return pd.NaT
    long['Date'] = long.apply(make_date, axis=1)
    long = long.dropna(subset=['Date']).reset_index(drop=True)
    long['AQI'] = pd.to_numeric(long['AQI'], errors='coerce')
    long['Year'] = year
    long['Month'] = long['MonthNum']
    return long[['Date','Year','Month','Day','AQI']]

# ---------------------- CELL 3: Load all AQI files into a single DataFrame ----------------------
all_long = []
for p in aqi_files:
    try:
        df_long = melt_calendar_aqi(p)
        all_long.append(df_long)
    except Exception as e:
        print('Failed to melt', p, '->', str(e))

if not all_long:
    raise SystemExit('No AQI long-form data was created. Check file formats and file names in /mnt/data.')

all_long = pd.concat(all_long, ignore_index=True)
all_long['Month'] = all_long['Month'].astype(int)
print('Combined rows:', len(all_long))

# Optional quick preview (uncomment if running interactively)
# display(all_long.head())

# ---------------------- CELL 4: Read prominent pollutant sheets (auto-detect) ----------------------
# This function looks for a sheet whose name contains 'prominent' or 'dominant' or 'pollutant'

def read_prominent_file(path):
    xls = pd.ExcelFile(path)
    candidate = None
    for s in xls.sheet_names:
        lname = s.lower()
        if any(k in lname for k in ['prominent','dominant','pollutant','param']):
            candidate = s
            break
    if candidate is None:
        candidate = xls.sheet_names[0]
    df = pd.read_excel(path, sheet_name=candidate)
    df = df.rename(columns=lambda c: str(c).strip())
    return df, candidate

prom_long_list = []
for p in prom_files:
    try:
        dfp, sheet = read_prominent_file(p)
        print('Reading', p, 'sheet:', sheet, 'columns:', list(dfp.columns)[:10])
        # Attempt to melt similar calendar-style if it has Day + months
        if 'Day' in dfp.columns:
            months = [c for c in dfp.columns if str(c).strip().lower()!='day']
            tmp = dfp.melt(id_vars=['Day'], value_vars=months, var_name='Month', value_name='Pollutant')
            tmp['Day'] = pd.to_numeric(tmp['Day'], errors='coerce').astype('Int64')
            month_map = {calendar.month_name[i]: i for i in range(1,13)}
            month_map.update({calendar.month_abbr[i]: i for i in range(1,13)})
            tmp['MonthNum'] = tmp['Month'].map(lambda x: month_map.get(str(x).strip(), np.nan))
            # get year from filename
            import re
            m = re.search(r"(20\d{2})", os.path.basename(p))
            year = int(m.group(1)) if m else None
            def make_date(row):
                try:
                    return pd.Timestamp(year=year, month=int(row['MonthNum']), day=int(row['Day']))
                except:
                    return pd.NaT
            tmp['Date'] = tmp.apply(make_date, axis=1)
            tmp = tmp.dropna(subset=['Date']).reset_index(drop=True)
            tmp['Year'] = year
            tmp = tmp[['Date','Year','MonthNum','Day','Pollutant']].rename(columns={'MonthNum':'Month'})
            prom_long_list.append(tmp)
        else:
            print('Prominent sheet not in Day x Month layout; trying to find Date or Day columns.')
            # If it already has a Date column and a Pollutant column
            possible_date = None
            for col in dfp.columns:
                if 'date' in col.lower():
                    possible_date = col
                    break
            poll_col = None
            for col in dfp.columns:
                if any(k in col.lower() for k in ['dominant','prominent','pollutant','param','characteristic']):
                    poll_col = col
                    break
            if possible_date is not None and poll_col is not None:
                tmp = dfp[[possible_date, poll_col]].dropna()
                tmp.columns = ['Date','Pollutant']
                tmp['Date'] = pd.to_datetime(tmp['Date'], errors='coerce')
                tmp = tmp.dropna(subset=['Date']).reset_index(drop=True)
                tmp['Year'] = tmp['Date'].dt.year
                tmp['Month'] = tmp['Date'].dt.month
                prom_long_list.append(tmp[['Date','Year','Month','Pollutant']])
            else:
                print('Could not auto-parse prominent pollutant sheet', p, '- please inspect the sheet manually if parsing fails.')
    except Exception as e:
        print('Error reading prominent file', p, ':', e)

if prom_long_list:
    prom_all = pd.concat(prom_long_list, ignore_index=True)
    prom_all['Pollutant'] = prom_all['Pollutant'].astype(str).str.strip()
    print('Prominent pollutant records:', len(prom_all))
else:
    prom_all = pd.DataFrame()
    print('No prominent pollutant data parsed automatically.')

# ==============================================================================
# CELL 5 (REVISED): Plotting functions & CPCB categories
# ==============================================================================

def aqi_category(a):
    try:
        a = float(a)
    except (ValueError, TypeError):
        return np.nan
    if a <= 50: return 'Good'
    if a <= 100: return 'Satisfactory'
    if a <= 200: return 'Moderate'
    if a <= 300: return 'Poor'
    if a <= 400: return 'Very Poor'
    return 'Severe'

OUTPUT_DIR = DATA_DIR
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 3-Year Comparative Monthly Averages
monthly_avg = all_long.groupby(['Year','Month'])['AQI'].mean().reset_index()
pivot_monthly = monthly_avg.pivot(index='Month', columns='Year', values='AQI').reindex(index=range(1,13))
plt.figure(figsize=(14, 7)) # Increased figure size for better spacing
pivot_monthly.plot(kind='bar', width=0.8, ax=plt.gca()) # Use ax=plt.gca() to plot on the current figure
plt.xlabel('Month', fontsize=12)
plt.ylabel('Average AQI', fontsize=12)
plt.title('3-Year Comparative Monthly Average AQI (Mumbai)', fontsize=16)
plt.xticks(ticks=range(12), labels=[calendar.month_abbr[i+1] for i in range(12)], rotation=0)
# FIX: Move legend to a better position
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust layout to make space for legend
out1 = os.path.join(OUTPUT_DIR, '3yr_monthly_avg_aqi.png')
plt.savefig(out1)
plt.close()
print('Saved', out1)

# Overall Annual AQI Trend (bar chart for better label visibility)
annual_avg = all_long.groupby('Year')['AQI'].mean().reset_index()
plt.figure(figsize=(7, 5))
# FIX: Switched to a bar chart and adjusted x-ticks for clarity
bars = plt.bar(annual_avg['Year'], annual_avg['AQI'], color=['skyblue', 'salmon', 'lightgreen'])
plt.xlabel('Year', fontsize=12)
plt.ylabel('Average AQI', fontsize=12)
plt.title('Overall Annual Average AQI', fontsize=14)
# FIX: Ensure x-axis ticks are integers for the years
plt.xticks(annual_avg['Year'].astype(int))
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
out2 = os.path.join(OUTPUT_DIR, 'annual_avg_aqi.png')
plt.savefig(out2)
plt.close()
print('Saved', out2)

# ... (The code for Daily Trends and Heatmaps remains the same as it didn't have issues) ...
# Daily AQI Trends for Each Year
for y in sorted(all_long['Year'].unique()):
    dfy = all_long[all_long['Year']==y].sort_values('Date')
    plt.figure(figsize=(14,4))
    plt.plot(dfy['Date'], dfy['AQI'])
    plt.xlabel('Date')
    plt.ylabel('AQI')
    plt.title(f'Daily AQI Trend - {y}')
    plt.tight_layout()
    out = os.path.join(OUTPUT_DIR, f'daily_trend_{y}.png')
    plt.savefig(out)
    plt.close()
    print('Saved', out)

# AQI Heatmap (Month vs Day) for each year
for y in sorted(all_long['Year'].unique()):
    dfy = all_long[all_long['Year']==y].copy()
    heat = dfy.pivot_table(index='Month', columns='Day', values='AQI', aggfunc='mean').reindex(index=range(1,13))
    plt.figure(figsize=(14,6))
    plt.imshow(heat, aspect='auto', origin='lower')
    plt.colorbar(label='AQI')
    plt.yticks(ticks=np.arange(12), labels=[calendar.month_name[i+1] for i in range(12)])
    plt.xticks(ticks=np.arange(31), labels=list(range(1,32)))
    plt.xlabel('Day of Month')
    plt.ylabel('Month')
    plt.title(f'AQI Heatmap (Month vs Day) - {y}')
    plt.tight_layout()
    out = os.path.join(OUTPUT_DIR, f'aqi_heatmap_{y}.png')
    plt.savefig(out)
    plt.close()
    print('Saved', out)

# AQI Category Distribution by Year
all_long['Category'] = all_long['AQI'].apply(aqi_category)
cat_counts = all_long.groupby(['Year','Category']).size().reset_index(name='days')
pivot_cat = cat_counts.pivot(index='Category', columns='Year', values='days').reindex(index=['Good','Satisfactory','Moderate','Poor','Very Poor','Severe']).fillna(0)
plt.figure(figsize=(10, 6)) # Increased figure size
pivot_cat.plot(kind='bar', ax=plt.gca())
plt.xlabel('AQI Category', fontsize=12)
plt.ylabel('Days', fontsize=12)
plt.title('AQI Category Distribution by Year', fontsize=16)
# FIX: Rotate labels for better visibility
plt.xticks(rotation=45, ha='right')
# FIX: Move legend to a better position
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout for legend
out = os.path.join(OUTPUT_DIR, 'aqi_category_distribution.png')
plt.savefig(out)
plt.close()
print('Saved', out)

# Monthly AQI Distribution Boxplot
monthly_boxes = [all_long[all_long['Month']==m]['AQI'].dropna().values for m in range(1,13)]
plt.figure(figsize=(12,6))
plt.boxplot(monthly_boxes, labels=[calendar.month_abbr[m] for m in range(1,13)])
plt.xlabel('Month', fontsize=12)
plt.ylabel('AQI', fontsize=12)
plt.title('Monthly AQI Distribution (Boxplot) - All years combined', fontsize=16)
plt.tight_layout()
out = os.path.join(OUTPUT_DIR, 'monthly_aqi_boxplot.png')
plt.savefig(out)
plt.close()
print('Saved', out)


# ==============================================================================
# CELL 6 (REVISED): Pollutant-specific charts (if data exists)
# ==============================================================================
if not prom_all.empty:
    prom_all['Year'] = prom_all['Date'].dt.year
    prom_all['Month'] = prom_all['Date'].dt.month
    monthly_prom = prom_all.groupby(['Year','Month','Pollutant']).size().reset_index(name='days')
    for y in sorted(prom_all['Year'].unique()):
        dfy = monthly_prom[monthly_prom['Year']==y]
        pivot = dfy.pivot(index='Month', columns='Pollutant', values='days').reindex(index=range(1,13)).fillna(0)
        plt.figure(figsize=(12, 6)) # Increased figure size
        pivot.plot(kind='bar', stacked=True, ax=plt.gca())
        plt.xlabel('Month', fontsize=12)
        plt.ylabel('Days (count)', fontsize=12)
        plt.title(f'Monthly Prominent Pollutant Breakdown - {y}', fontsize=16)
        plt.xticks(ticks=range(12), labels=[calendar.month_abbr[i+1] for i in range(12)], rotation=0)
        # FIX: Move legend to a better position
        plt.legend(title='Pollutant', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout(rect=[0, 0, 0.8, 1]) # Adjust layout for legend
        out = os.path.join(OUTPUT_DIR, f'monthly_prominent_pollutant_{y}.png')
        plt.savefig(out)
        plt.close()
        print('Saved', out)

    # Overall pollutant dominance (pie)
    overall = prom_all['Pollutant'].value_counts()
    plt.figure(figsize=(10, 8)) # Increased figure size
    # FIX: Create a function to prevent label overlap on the pie chart
    def autopct_generator(limit):
        """Don't show percentage label for slices smaller than limit."""
        def inner_autopct(pct):
            return ('%1.1f%%' % pct) if pct > limit else ''
        return inner_autopct
    
    overall.plot(kind='pie', autopct=autopct_generator(3), ylabel='', textprops={'fontsize': 10})
    plt.title('Overall Pollutant Dominance (All years)', fontsize=16)
    # FIX: Create a legend for the pie chart instead of labels on the slices
    plt.legend(labels=overall.index, bbox_to_anchor=(1.15, 0.9), loc="upper right", title="Pollutants")
    plt.tight_layout()
    out = os.path.join(OUTPUT_DIR, 'overall_pollutant_dominance.png')
    plt.savefig(out)
    plt.close()
    print('Saved', out)
else:
    print('Prominent pollutant dataset is empty — pollutant-specific charts were skipped.')

print('\nAll done. Check the PNG files in', OUTPUT_DIR)

  DATA_DIR = 'D:\Downloads\FieldProjectAQI_Data'


AQI files found: ['D:\\Downloads\\FieldProjectAQI_Data\\AQI_daily_city_level_mumbai_2023_mumbai_2023.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\AQI_daily_city_level_mumbai_2024_mumbai_2024.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\AQI_daily_city_level_mumbai_2025_mumbai_2025.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\~$AQI_daily_city_level_mumbai_2023_mumbai_2023.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\~$AQI_daily_city_level_mumbai_2024_mumbai_2024.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\~$AQI_daily_city_level_mumbai_2025_mumbai_2025.xlsx']
Prominent pollutant files found: ['D:\\Downloads\\FieldProjectAQI_Data\\Prominent_param2023.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\Prominent_param2023.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\Prominent_param2023.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\Prominent_param2025.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\Prominent_param2025.xlsx', 'D:\\Downloads\\FieldProjectAQI_Data\\Prominent_param2025.xlsx', 'D:\\Downloa

  plt.boxplot(monthly_boxes, labels=[calendar.month_abbr[m] for m in range(1,13)])


Saved D:\Downloads\FieldProjectAQI_Data\monthly_aqi_boxplot.png
Saved D:\Downloads\FieldProjectAQI_Data\monthly_prominent_pollutant_2023.png
Saved D:\Downloads\FieldProjectAQI_Data\monthly_prominent_pollutant_2024.png
Saved D:\Downloads\FieldProjectAQI_Data\monthly_prominent_pollutant_2025.png
Saved D:\Downloads\FieldProjectAQI_Data\overall_pollutant_dominance.png

All done. Check the PNG files in D:\Downloads\FieldProjectAQI_Data
