In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import lognorm, weibull_min, gamma
import warnings
warnings.filterwarnings('ignore')

# Set plot aesthetics using Seaborn
sns.set_style("whitegrid")  # Replace plt.style.use
sns.set_context("paper", font_scale=1.5)

In [33]:
# Load 2019(pre-SEZ) and 2021 (post-SEZ) data
# Assuming your Excel files are named accordingly
pre_sez = pd.read_csv('data/sector51gurugram2021.csv', parse_dates=['Timestamp'])
post_sez = pd.read_csv('data/sector51gurugram2023.csv', parse_dates=['Timestamp'])

# Display basic information
print("Pre-SEZ Data (2019) Shape:", pre_sez.shape)
print("Post-SEZ Data (2021) Shape:", post_sez.shape)

Pre-SEZ Data (2019) Shape: (365, 25)
Post-SEZ Data (2021) Shape: (365, 25)


In [34]:
# Preview the data
print("\nPre-SEZ Data Preview:")
pre_sez.head()


Pre-SEZ Data Preview:


Unnamed: 0,Timestamp,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),Ozone (µg/m³),...,MP-Xylene (µg/m³),AT (°C),RH (%),WS (m/s),WD (deg),RF (mm),TOT-RF (mm),SR (W/mt2),BP (mmHg),VWS (m/s)
0,2021-01-01,252.92,334.37,80.28,73.49,21.54,161.44,1.71,2.63,19.22,...,16.18,10.73,77.46,0.4,230.52,0.0,0.0,28.92,,
1,2021-01-02,164.21,217.25,4.38,14.05,23.72,74.97,1.27,1.28,29.68,...,6.74,13.8,66.5,0.39,191.32,0.04,3.5,31.02,,
2,2021-01-03,97.29,142.61,7.94,11.46,15.27,62.42,1.34,1.0,4.95,...,5.46,14.11,93.14,0.42,202.4,0.68,64.5,22.85,,
3,2021-01-04,51.5,95.6,8.75,8.9,9.7,44.61,2.24,0.66,7.34,...,6.17,16.12,96.67,0.54,245.77,0.25,24.0,39.02,,
4,2021-01-05,50.22,74.24,6.56,3.18,3.09,3.6,1.49,0.69,13.61,...,6.8,17.55,92.6,0.45,175.25,0.01,0.5,39.41,,


In [35]:
# Check for missing values
print("\nMissing Values in Pre-SEZ Data:")
print(pre_sez.isnull().sum())
print("\nMissing Values in Post-SEZ Data:")
print(post_sez.isnull().sum())


Missing Values in Pre-SEZ Data:
Timestamp                0
PM2.5 (µg/m³)            0
PM10 (µg/m³)             0
NO (µg/m³)               1
NO2 (µg/m³)              1
NOx (ppb)                0
NH3 (µg/m³)              1
SO2 (µg/m³)              0
CO (mg/m³)               0
Ozone (µg/m³)            0
Benzene (µg/m³)        115
Toluene (µg/m³)        115
Xylene (µg/m³)         115
O Xylene (µg/m³)       365
Eth-Benzene (µg/m³)    115
MP-Xylene (µg/m³)        0
AT (°C)                  6
RH (%)                   4
WS (m/s)                 0
WD (deg)               115
RF (mm)                  0
TOT-RF (mm)              0
SR (W/mt2)               1
BP (mmHg)              365
VWS (m/s)              365
dtype: int64

Missing Values in Post-SEZ Data:
Timestamp                0
PM2.5 (µg/m³)            0
PM10 (µg/m³)             0
NO (µg/m³)               1
NO2 (µg/m³)              1
NOx (ppb)                0
NH3 (µg/m³)              1
SO2 (µg/m³)              0
CO (mg/m³)               0
Oz

In [36]:
def preprocess_data(df):
    """
    Preprocess air quality data by:
    1. Setting date as index
    2. Filling missing values using linear interpolation
    3. Removing any rows that still have missing values
    4. Ensuring pollutant values are numeric
    """
    # Copy to avoid modifying original
    df_clean = df.copy()
    
    # Set date as index if not already
    if 'Timestamp' in df_clean.columns:
        df_clean.set_index('Timestamp', inplace=True)
    
    # Select only pollution columns we need
    pollutants = ['PM2.5 (µg/m³)','PM10 (µg/m³)','NO (µg/m³)','NO2 (µg/m³)','CO (mg/m³)','SO2 (µg/m³)','Ozone (µg/m³)']
    df_clean = df_clean[pollutants]
    
    # Convert 'NA' strings to np.nan
    df_clean.replace('NA', np.nan, inplace=True)
    
    # Ensure all values are numeric
    for col in df_clean.columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
    
    # Fill missing values using linear interpolation
    df_clean = df_clean.interpolate(method='linear')
    
    # Drop any remaining rows with NaN values
    df_clean.dropna(inplace=True)
    
    return df_clean

# Apply preprocessing
pre_clean = preprocess_data(pre_sez)
post_clean = preprocess_data(post_sez)

# Check if preprocessing was successful
print("Pre-SEZ clean data shape:", pre_clean.shape)
print("Post-SEZ clean data shape:", post_clean.shape)


Pre-SEZ clean data shape: (365, 7)
Post-SEZ clean data shape: (365, 7)


In [37]:
pre_sez.columns

Index(['Timestamp', 'PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)',
       'NO2 (µg/m³)', 'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'Ozone (µg/m³)', 'Benzene (µg/m³)', 'Toluene (µg/m³)', 'Xylene (µg/m³)',
       'O Xylene (µg/m³)', 'Eth-Benzene (µg/m³)', 'MP-Xylene (µg/m³)',
       'AT (°C)', 'RH (%)', 'WS (m/s)', 'WD (deg)', 'RF (mm)', 'TOT-RF (mm)',
       'SR (W/mt2)', 'BP (mmHg)', 'VWS (m/s)'],
      dtype='object')

In [38]:
print("\nPre-SEZ Summary Statistics:")
pre_clean.describe()


Pre-SEZ Summary Statistics:


Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),CO (mg/m³),SO2 (µg/m³),Ozone (µg/m³)
count,365.0,365.0,365.0,365.0,365.0,365.0,365.0
mean,111.169425,218.481671,8.947671,11.421315,0.924521,3.619178,31.538548
std,73.28556,115.721985,7.674716,7.850084,0.692774,3.497029,13.904514
min,24.78,35.16,0.53,1.09,0.25,0.91,1.33
25%,53.79,115.6,4.46,6.29,0.44,2.0,20.93
50%,86.75,205.49,7.09,9.98,0.64,2.84,29.62
75%,151.46,298.09,10.9,14.6,1.22,3.84,39.16
max,542.86,635.18,80.28,73.49,3.99,39.88,95.96


In [39]:
print("\nPost-SEZ Summary Statistics:")
post_clean.describe()


Post-SEZ Summary Statistics:


Unnamed: 0,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),CO (mg/m³),SO2 (µg/m³),Ozone (µg/m³)
count,365.0,365.0,365.0,365.0,365.0,365.0,365.0
mean,115.695926,196.406607,19.412083,25.367551,1.1393,2.997641,33.759941
std,57.080458,96.48522,22.590411,18.775793,1.025102,2.281548,8.591613
min,13.379348,21.243478,1.139201,1.955694,0.195625,0.378785,3.403333
25%,76.003571,130.171875,6.334306,11.781806,0.438021,2.034167,29.187049
50%,108.760417,184.35942,10.514762,23.161778,0.736319,2.463333,35.067951
75%,142.897917,246.499653,22.250451,32.449063,1.507986,3.114931,35.819529
max,389.013768,616.342935,175.525797,179.464819,6.178125,29.430417,57.058833


In [40]:
def fit_distributions(data, pollutant):
    """
    Fit Lognormal, Weibull, and Gamma distributions to pollution data.
    Returns parameters and goodness-of-fit statistics.
    """
    # Get pollutant data
    values = data[pollutant].values
    
    # Initialize results dictionary
    results = {}
    
    # Fit Lognormal distribution
    shape_ln, loc_ln, scale_ln = stats.lognorm.fit(values)
    results['Lognormal'] = {
        'params': (shape_ln, loc_ln, scale_ln),
        'aic': stats.lognorm.nnlf((shape_ln, loc_ln, scale_ln), values) * 2 + 6  # AIC calculation
    }
    
    # Fit Weibull distribution
    shape_wb, loc_wb, scale_wb = stats.weibull_min.fit(values)
    results['Weibull'] = {
        'params': (shape_wb, loc_wb, scale_wb),
        'aic': stats.weibull_min.nnlf((shape_wb, loc_wb, scale_wb), values) * 2 + 6
    }
    
    # Fit Gamma distribution
    shape_g, loc_g, scale_g = stats.gamma.fit(values)
    results['Gamma'] = {
        'params': (shape_g, loc_g, scale_g),
        'aic': stats.gamma.nnlf((shape_g, loc_g, scale_g), values) * 2 + 6
    }
    
    return results

In [41]:
def chi_square_test(data, pollutant, dist_name, params):
    """
    Perform Chi-square goodness-of-fit test on the fitted distribution.
    Lower Chi-square values indicate better fit.
    """
    values = data[pollutant].values
    
    # Create histogram of observed values
    hist, bin_edges = np.histogram(values, bins='auto', density=False)
    
    # Calculate bin midpoints for expected calculation
    bin_midpoints = (bin_edges[1:] + bin_edges[:-1]) / 2
    
    # Calculate expected frequencies based on the fitted distribution
    if dist_name == 'Lognormal':
        cdf_values = stats.lognorm.cdf(bin_edges, *params)
    elif dist_name == 'Weibull':
        cdf_values = stats.weibull_min.cdf(bin_edges, *params)
    elif dist_name == 'Gamma':
        cdf_values = stats.gamma.cdf(bin_edges, *params)
    
    # Calculate expected frequencies in each bin
    expected = len(values) * np.diff(cdf_values)
    
    # Handle zeros in expected frequencies (add a small value to avoid division by zero)
    expected = np.where(expected < 0.001, 0.001, expected)
    
    # Calculate Chi-square statistic
    chi2_stat = np.sum((hist - expected)**2 / expected)
    
    # Calculate degrees of freedom (bins - parameters - 1)
    df = len(hist) - len(params) - 1
    
    # Calculate p-value
    p_value = 1 - stats.chi2.cdf(chi2_stat, df)
    
    return {'chi2': chi2_stat, 'p_value': p_value, 'df': df}

In [42]:
# Define pollutants to analyze
pollutants = ['PM2.5 (µg/m³)','PM10 (µg/m³)','NO (µg/m³)','NO2 (µg/m³)','CO (mg/m³)','SO2 (µg/m³)','Ozone (µg/m³)']

# Initialize dictionaries to store fitting results
pre_distributions = {}
post_distributions = {}
chi_square_results = {}

# Fit distributions to each pollutant in pre-SEZ data
for pollutant in pollutants:
    pre_distributions[pollutant] = fit_distributions(pre_clean, pollutant)
    
    # Find best distribution based on AIC
    best_dist = min(pre_distributions[pollutant], 
                    key=lambda x: pre_distributions[pollutant][x]['aic'])
    
    # Perform Chi-square test on the best distribution
    params = pre_distributions[pollutant][best_dist]['params']
    chi_result = chi_square_test(pre_clean, pollutant, best_dist, params)
    
    # Store results
    chi_square_results[f"pre_{pollutant}"] = {
        'best_distribution': best_dist,
        'chi2': chi_result['chi2'],
        'p_value': chi_result['p_value']
    }

# Fit distributions to each pollutant in post-SEZ data
for pollutant in pollutants:
    post_distributions[pollutant] = fit_distributions(post_clean, pollutant)
    
    # Find best distribution based on AIC
    best_dist = min(post_distributions[pollutant], 
                    key=lambda x: post_distributions[pollutant][x]['aic'])
    
    # Perform Chi-square test on the best distribution
    params = post_distributions[pollutant][best_dist]['params']
    chi_result = chi_square_test(post_clean, pollutant, best_dist, params)
    
    # Store results
    chi_square_results[f"post_{pollutant}"] = {
        'best_distribution': best_dist,
        'chi2': chi_result['chi2'],
        'p_value': chi_result['p_value']
    }

# Display results
for key, value in chi_square_results.items():
    print(f"{key}: Best fit = {value['best_distribution']}, Chi² = {value['chi2']:.2f}, p = {value['p_value']:.4f}")

pre_PM2.5 (µg/m³): Best fit = Gamma, Chi² = 35.20, p = 0.0023
pre_PM10 (µg/m³): Best fit = Weibull, Chi² = 19.84, p = 0.0109
pre_NO (µg/m³): Best fit = Lognormal, Chi² = 185.74, p = 0.0000
pre_NO2 (µg/m³): Best fit = Gamma, Chi² = 344.13, p = 0.0000
pre_CO (mg/m³): Best fit = Lognormal, Chi² = 29.80, p = 0.0081
pre_SO2 (µg/m³): Best fit = Lognormal, Chi² = 966.25, p = 0.0000
pre_Ozone (µg/m³): Best fit = Lognormal, Chi² = 12.80, p = 0.6175
post_PM2.5 (µg/m³): Best fit = Lognormal, Chi² = 23.76, p = 0.1261
post_PM10 (µg/m³): Best fit = Gamma, Chi² = 19.61, p = 0.1875
post_NO (µg/m³): Best fit = Lognormal, Chi² = 71.99, p = 0.0003
post_NO2 (µg/m³): Best fit = Lognormal, Chi² = 94.32, p = 0.0000
post_CO (mg/m³): Best fit = Lognormal, Chi² = 42.68, p = 0.0003
post_SO2 (µg/m³): Best fit = Lognormal, Chi² = 2161.13, p = 0.0000
post_Ozone (µg/m³): Best fit = Lognormal, Chi² = 956.94, p = 0.0000


In [43]:
# Define Indian NAAQS standards (in μg/m³)
NAAQS = {
    'PM2.5 (µg/m³)': 60,  # 24-hour average
    
    'NO2 (µg/m³)': 80,    # 24-hour average
    'SO2 (µg/m³)': 80,     # 24-hour average
    'PM10 (µg/m³)': 100,  # 24-hour average
    'NO (µg/m³)': 80,
    'CO (mg/m³)': 4.0,     # 8-hour average
    'Ozone (µg/m³)': 100      # 8-hour average (Ozone)
}

# Calculate exceedance rates for pre-SEZ
pre_exceedance = {}
for pollutant in pollutants:
    exceedance_count = (pre_clean[pollutant] > NAAQS[pollutant]).sum()
    exceedance_percent = (exceedance_count / len(pre_clean)) * 100
    pre_exceedance[pollutant] = {
        'count': exceedance_count,
        'percent': exceedance_percent
    }

# Calculate exceedance rates for post-SEZ
post_exceedance = {}
for pollutant in pollutants:
    exceedance_count = (post_clean[pollutant] > NAAQS[pollutant]).sum()
    exceedance_percent = (exceedance_count / len(post_clean)) * 100
    post_exceedance[pollutant] = {
        'count': exceedance_count,
        'percent': exceedance_percent
    }

# Print exceedance results
print("\nNAAQS Exceedance Analysis:")
print("-" * 50)
print(f"{'Pollutant':<10} | {'Pre-SEZ (2005)':<20} | {'Post-SEZ (2007)':<20}")
print("-" * 50)
for pollutant in pollutants:
    pre_pct = pre_exceedance[pollutant]['percent']
    post_pct = post_exceedance[pollutant]['percent']
    print(f"{pollutant.split()[0]:<10} | {pre_pct:.1f}% ({pre_exceedance[pollutant]['count']} days) | "
          f"{post_pct:.1f}% ({post_exceedance[pollutant]['count']} days)")


NAAQS Exceedance Analysis:
--------------------------------------------------
Pollutant  | Pre-SEZ (2005)       | Post-SEZ (2007)     
--------------------------------------------------
PM2.5      | 69.3% (253 days) | 85.8% (313 days)
PM10       | 82.7% (302 days) | 84.7% (309 days)
NO         | 0.3% (1 days) | 2.2% (8 days)
NO2        | 0.0% (0 days) | 1.9% (7 days)
CO         | 0.0% (0 days) | 1.9% (7 days)
SO2        | 0.0% (0 days) | 0.0% (0 days)
Ozone      | 0.0% (0 days) | 0.0% (0 days)


In [44]:
def plot_distribution_fit(data, pollutant, period):
    """
    Plot histogram of observed data with the fitted distributions.
    """
    # Get data values
    values = data[pollutant].values
    
    # Get distribution fitting results
    if period == 'Pre-SEZ':
        dist_results = pre_distributions[pollutant]
    else:
        dist_results = post_distributions[pollutant]
    
    # Find the best distribution
    best_dist = min(dist_results, key=lambda x: dist_results[x]['aic'])
    
    # Create figure
    plt.figure(figsize=(12, 6))
    
    # Plot histogram
    plt.hist(values, bins='auto', density=True, alpha=0.6, color='orange', 
             label='Observed Data')
    
    # Create points for plotting distributions
    x = np.linspace(min(values), max(values), 1000)
    
    # Plot the best distribution
    if best_dist == 'Lognormal':
        params = dist_results['Lognormal']['params']
        y = stats.lognorm.pdf(x, *params)
        plt.plot(x, y, 'r-', linewidth=2, label=f'Lognormal (Best Fit)')
    elif best_dist == 'Weibull':
        params = dist_results['Weibull']['params']
        y = stats.weibull_min.pdf(x, *params)
        plt.plot(x, y, 'r-', linewidth=2, label=f'Weibull (Best Fit)')
    elif best_dist == 'Gamma':
        params = dist_results['Gamma']['params']
        y = stats.gamma.pdf(x, *params)
        plt.plot(x, y, 'r-', linewidth=2, label=f'Gamma (Best Fit)')
    
    # Plot the other distributions with thinner lines
    for dist_name in dist_results:
        if dist_name != best_dist:
            params = dist_results[dist_name]['params']
            if dist_name == 'Lognormal':
                y = stats.lognorm.pdf(x, *params)
            elif dist_name == 'Weibull':
                y = stats.weibull_min.pdf(x, *params)
            elif dist_name == 'Gamma':
                y = stats.gamma.pdf(x, *params)
            plt.plot(x, y, '--', linewidth=1, alpha=0.7, label=dist_name)
    
    # Add NAAQS limit line
    plt.axvline(x=NAAQS[pollutant], color='red', linestyle='--', 
                label=f'NAAQS Limit ({NAAQS[pollutant]} μg/m³)')
    
    # Add labels and title
    plt.xlabel(pollutant)
    plt.ylabel('Probability Density')
    plt.title(f'{period} (2019 if Pre, 2021 if Post): {pollutant} Distribution Fitting')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Save plot
    plt.savefig(f'{period.lower().replace("-", "_")}_{pollutant.split()[0]}_distribution.png', dpi=300)
    plt.close()

# Generate distribution plots for each pollutant in both periods
for pollutant in pollutants:
    plot_distribution_fit(pre_clean, pollutant, 'Pre-SEZ')
    plot_distribution_fit(post_clean, pollutant, 'Post-SEZ')

In [45]:
# Calculate percentage change in pollutant concentrations
percent_change = {}
for pollutant in pollutants:
    pre_mean = pre_clean[pollutant].mean()
    post_mean = post_clean[pollutant].mean()
    change = ((post_mean - pre_mean) / pre_mean) * 100
    percent_change[pollutant] = change

# Create a comprehensive results dataframe
results_data = []
for pollutant in pollutants:
    pre_best = chi_square_results[f"pre_{pollutant}"]['best_distribution']
    post_best = chi_square_results[f"post_{pollutant}"]['best_distribution']
    
    results_data.append({
        'Pollutant': pollutant.split()[0],
        'Pre-SEZ Mean': pre_clean[pollutant].mean(),
        'Post-SEZ Mean': post_clean[pollutant].mean(),
        'Change (%)': percent_change[pollutant],
        'Pre-SEZ Distribution': pre_best,
        'Post-SEZ Distribution': post_best,
        'Distribution Changed': pre_best != post_best,
        'Pre-SEZ Exceedance (%)': pre_exceedance[pollutant]['percent'],
        'Post-SEZ Exceedance (%)': post_exceedance[pollutant]['percent'],
        'Exceedance Change (pp)': post_exceedance[pollutant]['percent'] - pre_exceedance[pollutant]['percent']
    })

# Convert to DataFrame for better display
results_df = pd.DataFrame(results_data)

# Save results to CSV
results_df.to_csv('noida_sez_impact_results.csv', index=False)

# Display the results
print("\nFinal Results Summary:")
print("-" * 100)
print(results_df.to_string(index=False))
print("-" * 100)

# Create a bar chart for concentration changes
plt.figure(figsize=(12, 6))
colors = ['yellow' if x < 0 else 'red' for x in results_df['Change (%)'].values]
plt.bar(results_df['Pollutant'], results_df['Change (%)'], color=colors)
plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
plt.title('Percentage Change in Pollutant Concentrations (Post-SEZ vs Pre-SEZ)')
plt.ylabel('Change (%)')
plt.grid(axis='y', alpha=0.3)
for i, v in enumerate(results_df['Change (%)']):
    plt.text(i, v + (5 if v > 0 else -5), f"{v:.1f}%", ha='center', va='center')
plt.savefig('pollutant_concentration_changes.png', dpi=300)
plt.close()


Final Results Summary:
----------------------------------------------------------------------------------------------------
Pollutant  Pre-SEZ Mean  Post-SEZ Mean  Change (%) Pre-SEZ Distribution Post-SEZ Distribution  Distribution Changed  Pre-SEZ Exceedance (%)  Post-SEZ Exceedance (%)  Exceedance Change (pp)
    PM2.5    111.169425     115.695926    4.071715                Gamma             Lognormal                  True               69.315068                85.753425               16.438356
     PM10    218.481671     196.406607  -10.103852              Weibull                 Gamma                  True               82.739726                84.657534                1.917808
       NO      8.947671      19.412083  116.951237            Lognormal             Lognormal                 False                0.273973                 2.191781                1.917808
      NO2     11.421315      25.367551  122.107093                Gamma             Lognormal                  True    