In [2]:
# ======================================================
# 📊 Air Quality & Weather EDA - Singapore, Thailand, Malaysia
# ======================================================

# --- 0. IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from datetime import datetime
import os
import warnings

# --- CONFIGURATION ---
# Filesystem layout: country CSVs are read from data_folder, figures are
# written to viz_folder (created here if it does not exist yet).
data_folder = '../data'
viz_folder = '../visualizations'
os.makedirs(viz_folder, exist_ok=True)

# One CSV per country; each key is later stored in the 'Country' column.
countries_files = {
    'Singapore': f'{data_folder}/singapore/SG.csv',
    'Thailand': f'{data_folder}/thailand/THAI.csv',
    'Malaysia': f'{data_folder}/malaysia/MYS.csv'
}

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Plot appearance
plt.style.use('default')
sns.set_palette("husl")

# pandas display: show all columns and print floats with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

print("✅ Libraries imported and folders ready!\n")


# ======================================================
# 1️⃣ LOAD AND COMBINE DATA
# ======================================================
# Measurement columns that every per-country frame must expose as numbers.
numeric_columns = ('AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed')

dfs = []
for country, path in countries_files.items():
    print(f"Loading {country} data...")
    df_country = pd.read_csv(path)

    # A file without a 'Date' column is rejected outright.
    if 'Date' not in df_country:
        raise ValueError(f"'Date' column missing in {country} dataset")
    # Unparseable dates become NaT instead of raising.
    df_country['Date'] = pd.to_datetime(df_country['Date'], errors='coerce')

    # Pass 1: back-fill absent measurement columns with NaN.
    for col in numeric_columns:
        if col not in df_country:
            df_country[col] = np.nan
    # Pass 2: coerce to numeric (bad values -> NaN) and round to 2 decimals.
    for col in numeric_columns:
        as_number = pd.to_numeric(df_country[col], errors='coerce')
        df_country[col] = as_number.round(2)

    # Files without a region breakdown get a placeholder label.
    if 'Region' not in df_country:
        df_country['Region'] = 'Unknown'

    # Tag every row with the country it came from.
    df_country['Country'] = country
    dfs.append(df_country)

# Stack the per-country frames under one fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
print(f"\n✅ Combined dataset shape: {df.shape}")
print(f"Countries: {', '.join(df['Country'].unique())}")


# ======================================================
# 2️⃣ FEATURE ENGINEERING
# ======================================================
# Derive calendar features from the parsed 'Date' column; rows whose date
# is NaT get missing values in each derived column.
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Quarter'] = date_parts.quarter
df['MonthName'] = date_parts.month_name()

# AQI Categories
def categorize_aqi(aqi):
    """Return the category label for a single AQI reading.

    Missing readings (anything ``pd.isna`` reports as missing) map to
    'Unknown'. Otherwise the reading is matched against inclusive upper
    bounds in ascending order; anything above the last bound (300) is
    'Hazardous'.
    """
    if pd.isna(aqi):
        return 'Unknown'

    # (inclusive upper bound, label), checked from lowest to highest.
    bands = (
        (50, 'Good'),
        (100, 'Moderate'),
        (150, 'Unhealthy for Sensitive Groups'),
        (200, 'Unhealthy'),
        (300, 'Very Unhealthy'),
    )
    for upper_bound, label in bands:
        if aqi <= upper_bound:
            return label
    return 'Hazardous'

# Label every row element-wise; NaN readings are passed to the function
# and come back as 'Unknown'.
df['AQI_Category'] = df['AQI'].map(categorize_aqi)

print("✅ Feature engineering complete!\n")


# ======================================================
# 3️⃣ DESCRIPTIVE STATISTICS
# ======================================================
# Numeric measurement columns summarised here and in later sections.
summary_cols = ['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']

print("Overall descriptive statistics:")
overall_stats = df[summary_cols].describe()
print(overall_stats.round(2))

print("\nCountry-wise statistics:")
# sort=False keeps countries in order of first appearance in df.
for country, country_data in df.groupby('Country', sort=False):
    print(f"\n{country}:")
    for col in summary_cols:
        column_values = country_data[col]
        mean_val, std_val = column_values.mean(), column_values.std()
        print(f"  {col}: {mean_val:.2f} ± {std_val:.2f}")


# ======================================================
# 4️⃣ MISSING VALUE ANALYSIS
# ======================================================
# Number of missing (NaN/NaT) cells in each column of the combined frame.
missing = df.isnull().sum()
print("\nMissing values per column:", missing, sep="\n")


# ======================================================
# 5️⃣ CORRELATION ANALYSIS
# ======================================================
# Pairwise correlation between the summary columns, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale and written to viz_folder.
corr = df[summary_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
corr_fig.tight_layout()
corr_fig.savefig(f'{viz_folder}/correlation_matrix.png', dpi=300)
plt.close(corr_fig)  # release the figure once it is on disk
print("\n✅ Correlation matrix saved!")


# ======================================================
# 6️⃣ TIME SERIES ANALYSIS
# ======================================================
# One line per country: mean AQI over all rows sharing the same date.
ts_fig, ts_ax = plt.subplots(figsize=(12,6))
for country in df['Country'].unique():
    in_country = df['Country'] == country
    country_data = df.loc[in_country].groupby('Date')['AQI'].mean()
    ts_ax.plot(country_data.index, country_data.values, label=country)

ts_ax.set_xlabel('Date')
ts_ax.set_ylabel('Average AQI')
ts_ax.set_title('Daily Average AQI by Country')
ts_ax.legend()
ts_fig.tight_layout()
ts_fig.savefig(f'{viz_folder}/time_series_aqi.png', dpi=300)
plt.close(ts_fig)  # release the figure once it is on disk
print("✅ Time series plot saved!")


# ======================================================
# 7️⃣ REGIONAL ANALYSIS
# ======================================================
# Mean of every summary column per region, rounded to two decimals.
regional_avg = df.groupby('Region')[summary_cols].mean().round(2)

# Fixed palette, cut down to at most one colour per region.
palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#FFD700', '#ADFF2F', '#FF69B4']
colors = palette[:len(regional_avg)]

# AQI by region: one bar per region with its value printed above the bar.
region_fig, region_ax = plt.subplots(figsize=(10,6))
region_ax.bar(
    regional_avg.index,
    regional_avg['AQI'],
    color=colors,
    edgecolor='black',
)
plt.xticks(rotation=45)
region_ax.set_ylabel('AQI')
region_ax.set_title('Average AQI by Region')
for i, v in enumerate(regional_avg['AQI']):
    # Place the label one AQI unit above the top of bar i.
    region_ax.text(i, v + 1, f"{v:.2f}", ha='center', va='bottom', fontweight='bold')
region_fig.tight_layout()
region_fig.savefig(f'{viz_folder}/regional_aqi.png', dpi=300)
plt.close(region_fig)  # release the figure once it is on disk
print("✅ Regional AQI plot saved!")


# ======================================================
# 8️⃣ MONTHLY TRENDS
# ======================================================
# Calendar order of the month names produced by Series.dt.month_name().
monthly_order = ['January','February','March','April','May','June','July','August','September','October','November','December']
month_rank = {name: position for position, name in enumerate(monthly_order)}

# Mean of every summary column per (month, country).
monthly_avg = df.groupby(['MonthName','Country'])[summary_cols].mean().reset_index()

# groupby returns its keys sorted lexicographically (April, August,
# December, ...), and the line plot below is drawn with sort=False, i.e. in
# row order. Put the rows into calendar order first so the x-axis runs
# January -> December instead of alphabetically.
monthly_avg = (
    monthly_avg
    .assign(_month_rank=monthly_avg['MonthName'].map(month_rank))
    .sort_values(['_month_rank', 'Country'])
    .drop(columns='_month_rank')
    .reset_index(drop=True)
)

plt.figure(figsize=(14,6))
sns.lineplot(data=monthly_avg, x='MonthName', y='AQI', hue='Country', hue_order=df['Country'].unique(), sort=False,
             markers=True, style='Country')
plt.xticks(rotation=45)
plt.xlabel('Month')
plt.ylabel('Average AQI')
plt.title('Monthly AQI Trends')
plt.tight_layout()
plt.savefig(f'{viz_folder}/monthly_aqi_trends.png', dpi=300)
plt.close()
print("✅ Monthly trends plot saved!")


# ======================================================
# 9️⃣ SUMMARY TABLES
# ======================================================
# Top 5 regions by AQI
# Rank regions from highest to lowest mean AQI and keep the first five.
ranked_regions = regional_avg['AQI'].sort_values(ascending=False)
top_regions = ranked_regions.head(5)
print("\nTop 5 Regions by Average AQI:")
print(top_regions)

# Persist the combined, feature-enriched frame (without the row index)
# so later analysis can start from a single file.
df.to_csv(f'{data_folder}/FINAL_ANALYSIS.csv', index=False)
print("\n✅ Combined dataset saved!")


✅ Libraries imported and folders ready!

Loading Singapore data...
Loading Thailand data...
Loading Malaysia data...

✅ Combined dataset shape: (61581, 7)
Countries: Singapore, Thailand, Malaysia
✅ Feature engineering complete!

Overall descriptive statistics:
           AQI  Temperature  RelativeHumidity  WindSpeed
count 61049.00     61581.00          61581.00   61581.00
mean     70.70        27.10             80.78       9.16
std      51.87         1.67              9.18       4.23
min       0.00        11.38             26.79       0.70
25%      36.00        26.14             76.49       5.75
50%      52.00        27.04             82.58       8.91
75%      84.44        28.09             87.21      11.81
max     500.00        35.96             99.80      39.48

Country-wise statistics:

Singapore:
  AQI: 46.96 ± 12.13
  Temperature: 28.05 ± 1.15
  RelativeHumidity: 79.53 ± 5.75
  WindSpeed: 5.12 ± 2.24

Thailand:
  AQI: 132.28 ± 53.14
  Temperature: 27.29 ± 2.35
  RelativeHumidity: 