In [7]:
# Exploratory Data Analysis (EDA) - Singapore Air Quality & Weather
# This notebook performs comprehensive data analysis and saves visualizations for reporting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from datetime import datetime
import warnings
import os

# Configure settings
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Set pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Create visualization folder if it doesn't exist
viz_folder = '../visualizations/malaysia_eda'
os.makedirs(viz_folder, exist_ok=True)

print("✅ Libraries imported and visualization folder created!")

# ============================================================================
# 1. LOAD AND PREPARE DATA
# ============================================================================

print("\n" + "="*80)
print("LOADING DATASET")
print("="*80)

# Load the dataset
df = pd.read_csv('../data/malaysia/MYS.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Extract time-based features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter
df['MonthName'] = df['Date'].dt.month_name()

print(f"✅ Dataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"   Regions: {', '.join(df['Region'].unique())}")

# ============================================================================
# 2. DATA OVERVIEW
# ============================================================================

print("\n" + "="*80)
print("DATA SUMMARY")
print("="*80)

print("\n📊 Descriptive Statistics:")
print(df[['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']].describe())

print("\n🌍 Regional Statistics:")
for region in sorted(df['Region'].unique()):
    region_data = df[df['Region'] == region]
    print(f"\n{region}:")
    print(f"  Records: {len(region_data):,}")
    region_data['AQI'] = pd.to_numeric(region_data['AQI'], errors='coerce')
    print(f"  AQI: {region_data['AQI'].mean():.2f} ± {region_data['AQI'].std():.2f}")

    print(f"  Temp: {region_data['Temperature'].mean():.2f}°C ± {region_data['Temperature'].std():.2f}")

# ============================================================================
# 3. CORRELATION ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

numeric_cols = ['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']
correlation_matrix = df[numeric_cols].corr()

print("\nCorrelation Matrix:")
print(correlation_matrix)

# Statistical significance tests
print("\n📊 Correlation Significance Tests:")
variables = ['Temperature', 'RelativeHumidity', 'WindSpeed']
for var in variables:
    valid_data = df[['AQI', var]].dropna()
    if len(valid_data) > 0:
        corr, p_value = pearsonr(valid_data['AQI'], valid_data[var])
        sig = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
        print(f"  AQI vs {var}: r={corr:.4f}, p={p_value:.6f} {sig}")

# Save: Correlation Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8},
            fmt='.3f', vmin=-1, vmax=1)
plt.title('Correlation Matrix - Weather and Air Quality Variables', 
          fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(f'{viz_folder}/01_correlation_heatmap.png', dpi=300, bbox_inches='tight')
print(f"\n✅ Saved: 01_correlation_heatmap.png")
plt.close()

# ============================================================================
# 4. DISTRIBUTION ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("DISTRIBUTION ANALYSIS")
print("="*80)
df['AQI'] = pd.to_numeric(df['AQI'], errors='coerce')
df['Temperature'] = pd.to_numeric(df['Temperature'], errors='coerce')
df['RelativeHumidity'] = pd.to_numeric(df['RelativeHumidity'], errors='coerce')
df['WindSpeed'] = pd.to_numeric(df['WindSpeed'], errors='coerce')
# Categorize AQI
def categorize_aqi(aqi):
    if pd.isna(aqi):
        return 'Unknown'
    elif aqi <= 50:
        return 'Good'
    elif aqi <= 100:
        return 'Moderate'
    elif aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif aqi <= 200:
        return 'Unhealthy'
    elif aqi <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

df['AQI_Category'] = df['AQI'].apply(categorize_aqi)

print("\nAQI Category Distribution:")
aqi_counts = df['AQI_Category'].value_counts()
print(aqi_counts)
print(f"\nPercentages:")
print((aqi_counts / len(df) * 100).round(2))

# Save: Distribution of Variables
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Distribution of Variables', fontsize=16, fontweight='bold')

# AQI Distribution
axes[0, 0].hist(df['AQI'].dropna(), bins=50, color='skyblue', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('AQI Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('AQI')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(df['AQI'].mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: {df["AQI"].mean():.2f}')
axes[0, 0].axvline(df['AQI'].median(), color='green', linestyle='--', linewidth=2, 
                   label=f'Median: {df["AQI"].median():.2f}')
axes[0, 0].legend()

# Temperature Distribution
axes[0, 1].hist(df['Temperature'].dropna(), bins=50, color='orange', edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Temperature Distribution', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Temperature (°C)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].axvline(df['Temperature'].mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: {df["Temperature"].mean():.2f}')
axes[0, 1].legend()

# Humidity Distribution
axes[1, 0].hist(df['RelativeHumidity'].dropna(), bins=50, color='lightgreen', edgecolor='black', alpha=0.7)
axes[1, 0].set_title('Relative Humidity Distribution', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Relative Humidity (%)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].axvline(df['RelativeHumidity'].mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: {df["RelativeHumidity"].mean():.2f}')
axes[1, 0].legend()

# Wind Speed Distribution
axes[1, 1].hist(df['WindSpeed'].dropna(), bins=50, color='lightcoral', edgecolor='black', alpha=0.7)
axes[1, 1].set_title('Wind Speed Distribution', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Wind Speed (km/h)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].axvline(df['WindSpeed'].mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: {df["WindSpeed"].mean():.2f}')
axes[1, 1].legend()

plt.tight_layout()
plt.savefig(f'{viz_folder}/02_variable_distributions.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 02_variable_distributions.png")
plt.close()

# Save: Box Plots by Region
fig, axes = plt.subplots(2, 2, figsize=(18, 14))
fig.suptitle('Distribution by Region', fontsize=16, fontweight='bold')

regions = sorted(df['Region'].unique())

# AQI by Region
axes[0, 0].boxplot([df[df['Region']==r]['AQI'].dropna() for r in regions], labels=regions)
axes[0, 0].set_title('AQI by Region', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('AQI')
axes[0, 0].grid(True, alpha=0.3)

# Temperature by Region
axes[0, 1].boxplot([df[df['Region']==r]['Temperature'].dropna() for r in regions], labels=regions)
axes[0, 1].set_title('Temperature by Region', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Temperature (°C)')
axes[0, 1].grid(True, alpha=0.3)

# Humidity by Region
axes[1, 0].boxplot([df[df['Region']==r]['RelativeHumidity'].dropna() for r in regions], labels=regions)
axes[1, 0].set_title('Relative Humidity by Region', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Relative Humidity (%)')
axes[1, 0].grid(True, alpha=0.3)

# Wind Speed by Region
axes[1, 1].boxplot([df[df['Region']==r]['WindSpeed'].dropna() for r in regions], labels=regions)
axes[1, 1].set_title('Wind Speed by Region', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Wind Speed (km/h)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{viz_folder}/03_regional_boxplots.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 03_regional_boxplots.png")
plt.close()

# ============================================================================
# 5. TEMPORAL ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("TEMPORAL ANALYSIS")
print("="*80)

# Save: Time Series - Overall Trends
fig, axes = plt.subplots(4, 1, figsize=(20, 16))
fig.suptitle('Time Series Analysis - Overall Trends (2016-2024)', fontsize=16, fontweight='bold')

daily_avg = df.groupby('Date')[['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']].mean()

# AQI over time
axes[0].plot(daily_avg.index, daily_avg['AQI'], linewidth=0.5, alpha=0.5, color='skyblue')
rolling_aqi = daily_avg['AQI'].rolling(window=30).mean()
axes[0].plot(rolling_aqi.index, rolling_aqi, color='blue', linewidth=2, label='30-Day Moving Average')
axes[0].set_title('AQI Trend Over Time', fontsize=14, fontweight='bold')
axes[0].set_ylabel('AQI', fontsize=12)
axes[0].grid(True, alpha=0.3)
axes[0].legend()

# Temperature over time
axes[1].plot(daily_avg.index, daily_avg['Temperature'], linewidth=0.5, alpha=0.5, color='orange')
rolling_temp = daily_avg['Temperature'].rolling(window=30).mean()
axes[1].plot(rolling_temp.index, rolling_temp, color='red', linewidth=2, label='30-Day Moving Average')
axes[1].set_title('Temperature Trend Over Time', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Temperature (°C)', fontsize=12)
axes[1].grid(True, alpha=0.3)
axes[1].legend()

# Humidity over time
axes[2].plot(daily_avg.index, daily_avg['RelativeHumidity'], linewidth=0.5, alpha=0.5, color='lightgreen')
rolling_hum = daily_avg['RelativeHumidity'].rolling(window=30).mean()
axes[2].plot(rolling_hum.index, rolling_hum, color='green', linewidth=2, label='30-Day Moving Average')
axes[2].set_title('Relative Humidity Trend Over Time', fontsize=14, fontweight='bold')
axes[2].set_ylabel('Relative Humidity (%)', fontsize=12)
axes[2].grid(True, alpha=0.3)
axes[2].legend()

# Wind Speed over time
axes[3].plot(daily_avg.index, daily_avg['WindSpeed'], linewidth=0.5, alpha=0.5, color='lightcoral')
rolling_wind = daily_avg['WindSpeed'].rolling(window=30).mean()
axes[3].plot(rolling_wind.index, rolling_wind, color='purple', linewidth=2, label='30-Day Moving Average')
axes[3].set_title('Wind Speed Trend Over Time', fontsize=14, fontweight='bold')
axes[3].set_ylabel('Wind Speed (km/h)', fontsize=12)
axes[3].set_xlabel('Date', fontsize=12)
axes[3].grid(True, alpha=0.3)
axes[3].legend()

plt.tight_layout()
plt.savefig(f'{viz_folder}/04_time_series_trends.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 04_time_series_trends.png")
plt.close()

# Save: Yearly Trends
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('Yearly Trends (2016-2024)', fontsize=16, fontweight='bold')

yearly_avg = df.groupby('Year')[['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']].mean()

axes[0, 0].plot(yearly_avg.index, yearly_avg['AQI'], marker='o', linewidth=2, markersize=8)
axes[0, 0].set_title('Average AQI by Year', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('AQI', fontsize=12)
axes[0, 0].set_xlabel('Year', fontsize=12)
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].plot(yearly_avg.index, yearly_avg['Temperature'], marker='o', linewidth=2, markersize=8, color='orange')
axes[0, 1].set_title('Average Temperature by Year', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Temperature (°C)', fontsize=12)
axes[0, 1].set_xlabel('Year', fontsize=12)
axes[0, 1].grid(True, alpha=0.3)

axes[1, 0].plot(yearly_avg.index, yearly_avg['RelativeHumidity'], marker='o', linewidth=2, markersize=8, color='green')
axes[1, 0].set_title('Average Relative Humidity by Year', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Relative Humidity (%)', fontsize=12)
axes[1, 0].set_xlabel('Year', fontsize=12)
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].plot(yearly_avg.index, yearly_avg['WindSpeed'], marker='o', linewidth=2, markersize=8, color='purple')
axes[1, 1].set_title('Average Wind Speed by Year', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Wind Speed (km/h)', fontsize=12)
axes[1, 1].set_xlabel('Year', fontsize=12)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{viz_folder}/05_yearly_trends.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 05_yearly_trends.png")
plt.close()

# Save: Monthly Patterns
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('Monthly Patterns - Seasonality Analysis', fontsize=16, fontweight='bold')

monthly_avg = df.groupby('Month')[['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']].mean()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

axes[0, 0].bar(monthly_avg.index, monthly_avg['AQI'], color='skyblue', edgecolor='black')
axes[0, 0].set_title('Average AQI by Month', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('AQI', fontsize=12)
axes[0, 0].set_xlabel('Month', fontsize=12)
axes[0, 0].set_xticks(range(1, 13))
axes[0, 0].set_xticklabels(month_names, rotation=45)
axes[0, 0].grid(True, alpha=0.3, axis='y')

axes[0, 1].bar(monthly_avg.index, monthly_avg['Temperature'], color='orange', edgecolor='black')
axes[0, 1].set_title('Average Temperature by Month', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Temperature (°C)', fontsize=12)
axes[0, 1].set_xlabel('Month', fontsize=12)
axes[0, 1].set_xticks(range(1, 13))
axes[0, 1].set_xticklabels(month_names, rotation=45)
axes[0, 1].grid(True, alpha=0.3, axis='y')

axes[1, 0].bar(monthly_avg.index, monthly_avg['RelativeHumidity'], color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Average Relative Humidity by Month', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Relative Humidity (%)', fontsize=12)
axes[1, 0].set_xlabel('Month', fontsize=12)
axes[1, 0].set_xticks(range(1, 13))
axes[1, 0].set_xticklabels(month_names, rotation=45)
axes[1, 0].grid(True, alpha=0.3, axis='y')

axes[1, 1].bar(monthly_avg.index, monthly_avg['WindSpeed'], color='lightcoral', edgecolor='black')
axes[1, 1].set_title('Average Wind Speed by Month', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Wind Speed (km/h)', fontsize=12)
axes[1, 1].set_xlabel('Month', fontsize=12)
axes[1, 1].set_xticks(range(1, 13))
axes[1, 1].set_xticklabels(month_names, rotation=45)
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(f'{viz_folder}/06_monthly_seasonality.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 06_monthly_seasonality.png")
plt.close()

# ============================================================================
# 6. REGIONAL COMPARISON
# ============================================================================

print("\n" + "="*80)
print("REGIONAL COMPARISON")
print("="*80)

# Save: Regional Comparison
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('Regional Comparison - Average Values', fontsize=16, fontweight='bold')

regional_avg = df.groupby('Region')[['AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']].mean()
regions = sorted(regional_avg.index)
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

axes[0, 0].bar(regions, regional_avg.loc[regions, 'AQI'], color=colors, edgecolor='black')
axes[0, 0].set_title('Average AQI by Region', fontsize=14, fontweight='bold')
axes[0, 0].set_ylabel('AQI', fontsize=12)
axes[0, 0].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(regional_avg.loc[regions, 'AQI']):
    axes[0, 0].text(i, v + 1, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

axes[0, 1].bar(regions, regional_avg.loc[regions, 'Temperature'], color=colors, edgecolor='black')
axes[0, 1].set_title('Average Temperature by Region', fontsize=14, fontweight='bold')
axes[0, 1].set_ylabel('Temperature (°C)', fontsize=12)
axes[0, 1].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(regional_avg.loc[regions, 'Temperature']):
    axes[0, 1].text(i, v + 0.1, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

axes[1, 0].bar(regions, regional_avg.loc[regions, 'RelativeHumidity'], color=colors, edgecolor='black')
axes[1, 0].set_title('Average Relative Humidity by Region', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Relative Humidity (%)', fontsize=12)
axes[1, 0].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(regional_avg.loc[regions, 'RelativeHumidity']):
    axes[1, 0].text(i, v + 0.5, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

axes[1, 1].bar(regions, regional_avg.loc[regions, 'WindSpeed'], color=colors, edgecolor='black')
axes[1, 1].set_title('Average Wind Speed by Region', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('Wind Speed (km/h)', fontsize=12)
axes[1, 1].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(regional_avg.loc[regions, 'WindSpeed']):
    axes[1, 1].text(i, v + 0.1, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{viz_folder}/07_regional_comparison.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 07_regional_comparison.png")
plt.close()

# ============================================================================
# 7. AQI CATEGORY ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("AQI CATEGORY ANALYSIS")
print("="*80)

# Save: AQI Category Distribution
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
fig.suptitle('AQI Category Distribution', fontsize=16, fontweight='bold')

colors = ['#00E400', '#FFFF00', '#FF7E00', '#FF0000', '#8F3F97', '#7E0023']
category_order = ['Good', 'Moderate', 'Unhealthy for Sensitive Groups', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
aqi_counts_ordered = df['AQI_Category'].value_counts().reindex(category_order, fill_value=0)

# Pie chart
axes[0].pie(aqi_counts_ordered, labels=aqi_counts_ordered.index, autopct='%1.1f%%', 
            colors=colors, startangle=90, textprops={'fontsize': 10, 'fontweight': 'bold'})
axes[0].set_title('AQI Category Distribution', fontsize=14, fontweight='bold')

# Bar chart
axes[1].bar(range(len(aqi_counts_ordered)), aqi_counts_ordered.values, color=colors, edgecolor='black')
axes[1].set_xticks(range(len(aqi_counts_ordered)))
axes[1].set_xticklabels(aqi_counts_ordered.index, rotation=45, ha='right')
axes[1].set_ylabel('Count', fontsize=12)
axes[1].set_title('AQI Category Frequency', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(aqi_counts_ordered):
    axes[1].text(i, v + 1000, f'{v:,}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{viz_folder}/08_aqi_categories.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 08_aqi_categories.png")
plt.close()

# ============================================================================
# 8. SCATTER PLOTS - AQI VS WEATHER
# ============================================================================

print("\n" + "="*80)
print("RELATIONSHIP ANALYSIS")
print("="*80)

# Save: Scatter Plots
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('AQI vs Weather Variables - Relationship Analysis', fontsize=16, fontweight='bold')

# AQI vs Temperature
temp_aqi_data = df[['Temperature', 'AQI']].dropna()
axes[0].scatter(temp_aqi_data['Temperature'], temp_aqi_data['AQI'], alpha=0.3, s=10)
axes[0].set_xlabel('Temperature (°C)', fontsize=12)
axes[0].set_ylabel('AQI', fontsize=12)
axes[0].set_title(f'AQI vs Temperature\nCorrelation: {df["AQI"].corr(df["Temperature"]):.2f}', fontsize=12)
axes[0].grid(True, alpha=0.3)
if len(temp_aqi_data) > 0:
    z = np.polyfit(temp_aqi_data['Temperature'], temp_aqi_data['AQI'], 1)
    p = np.poly1d(z)
    axes[0].plot(temp_aqi_data['Temperature'].sort_values(), 
                 p(temp_aqi_data['Temperature'].sort_values()), 
                 "r--", linewidth=2, label='Trend Line')
    axes[0].legend()

# AQI vs Humidity
humidity_aqi_data = df[['RelativeHumidity', 'AQI']].dropna()
axes[1].scatter(humidity_aqi_data['RelativeHumidity'], humidity_aqi_data['AQI'], alpha=0.3, s=10, color='green')
axes[1].set_xlabel('Relative Humidity (%)', fontsize=12)
axes[1].set_ylabel('AQI', fontsize=12)
axes[1].set_title(f'AQI vs Humidity\nCorrelation: {df["AQI"].corr(df["RelativeHumidity"]):.2f}', fontsize=12)
axes[1].grid(True, alpha=0.3)
if len(humidity_aqi_data) > 0:
    z = np.polyfit(humidity_aqi_data['RelativeHumidity'], humidity_aqi_data['AQI'], 1)
    p = np.poly1d(z)
    axes[1].plot(humidity_aqi_data['RelativeHumidity'].sort_values(), 
                 p(humidity_aqi_data['RelativeHumidity'].sort_values()), 
                 "r--", linewidth=2, label='Trend Line')
    axes[1].legend()

# AQI vs Wind Speed
wind_aqi_data = df[['WindSpeed', 'AQI']].dropna()
axes[2].scatter(wind_aqi_data['WindSpeed'], wind_aqi_data['AQI'], alpha=0.3, s=10, color='orange')
axes[2].set_xlabel('Wind Speed (km/h)', fontsize=12)
axes[2].set_ylabel('AQI', fontsize=12)
axes[2].set_title(f'AQI vs Wind Speed\nCorrelation: {df["AQI"].corr(df["WindSpeed"]):.2f}', fontsize=12)
axes[2].grid(True, alpha=0.3)
if len(wind_aqi_data) > 0:
    z = np.polyfit(wind_aqi_data['WindSpeed'], wind_aqi_data['AQI'], 1)
    p = np.poly1d(z)
    axes[2].plot(wind_aqi_data['WindSpeed'].sort_values(), 
                 p(wind_aqi_data['WindSpeed'].sort_values()), 
                 "r--", linewidth=2, label='Trend Line')
    axes[2].legend()

plt.tight_layout()
plt.savefig(f'{viz_folder}/09_aqi_weather_relationships.png', dpi=300, bbox_inches='tight')
print(f"✅ Saved: 09_aqi_weather_relationships.png")
plt.close()

# ============================================================================
# 9. KEY FINDINGS SUMMARY
# ============================================================================

print("\n" + "="*80)
print("KEY FINDINGS SUMMARY")
print("="*80)

print("\n📊 DATA OVERVIEW:")
print(f"  • Total Records: {len(df):,}")
print(f"  • Date Range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"  • Regions: {', '.join(sorted(df['Region'].unique()))}")
print(f"  • Years Covered: {df['Year'].min()} - {df['Year'].max()}")

print("\n🌡️ WEATHER PATTERNS:")
print(f"  • Average Temperature: {df['Temperature'].mean():.2f}°C (Range: {df['Temperature'].min():.2f}°C - {df['Temperature'].max():.2f}°C)")
print(f"  • Average Humidity: {df['RelativeHumidity'].mean():.2f}% (Range: {df['RelativeHumidity'].min():.2f}% - {df['RelativeHumidity'].max():.2f}%)")
print(f"  • Average Wind Speed: {df['WindSpeed'].mean():.2f} km/h (Range: {df['WindSpeed'].min():.2f} - {df['WindSpeed'].max():.2f} km/h)")

print("\n🏭 AIR QUALITY PATTERNS:")
print(f"  • Average AQI: {df['AQI'].mean():.2f}")
print(f"  • Median AQI: {df['AQI'].median():.2f}")
print(f"  • AQI Range: {df['AQI'].min():.2f} - {df['AQI'].max():.2f}")
print(f"  • Most Common Category: {df['AQI_Category'].mode()[0]}")
print(f"  • Good Air Quality Days: {(df['AQI_Category'] == 'Good').sum():,} ({(df['AQI_Category'] == 'Good').sum()/len(df)*100:.1f}%)")

print("\n🔗 CORRELATIONS WITH AQI:")
print(f"  • Temperature: {df['AQI'].corr(df['Temperature']):.4f} {'(Positive)' if df['AQI'].corr(df['Temperature']) > 0 else '(Negative)'}")
print(f"  • Humidity: {df['AQI'].corr(df['RelativeHumidity']):.4f} {'(Positive)' if df['AQI'].corr(df['RelativeHumidity']) > 0 else '(Negative)'}")
print(f"  • Wind Speed: {df['AQI'].corr(df['WindSpeed']):.4f} {'(Positive)' if df['AQI'].corr(df['WindSpeed']) > 0 else '(Negative)'}")

print("\n🌍 REGIONAL INSIGHTS:")
regional_aqi = df.groupby('Region')['AQI'].mean().sort_values(ascending=False)
print(f"  • Highest AQI Region: {regional_aqi.index[0]} (AQI: {regional_aqi.values[0]:.2f})")
print(f"  • Lowest AQI Region: {regional_aqi.index[-1]} (AQI: {regional_aqi.values[-1]:.2f})")
print(f"  • Regional Variation: {regional_aqi.std():.2f} (Standard Deviation)")

print("\n📅 TEMPORAL PATTERNS:")
monthly_aqi = df.groupby('Month')['AQI'].mean()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
print(f"  • Worst Month for AQI: {month_names[monthly_aqi.idxmax()-1]} (AQI: {monthly_aqi.max():.2f})")
print(f"  • Best Month for AQI: {month_names[monthly_aqi.idxmin()-1]} (AQI: {monthly_aqi.min():.2f})")

yearly_aqi = df.groupby('Year')['AQI'].mean()
print(f"  • Worst Year for AQI: {yearly_aqi.idxmax()} (AQI: {yearly_aqi.max():.2f})")
print(f"  • Best Year for AQI: {yearly_aqi.idxmin()} (AQI: {yearly_aqi.min():.2f})")

print("\n⚠️ DATA QUALITY:")
missing_data = {
    'AQI': df['AQI'].isna().sum(),
    'Temperature': df['Temperature'].isna().sum(),
    'Humidity': df['RelativeHumidity'].isna().sum(),
    'Wind Speed': df['WindSpeed'].isna().sum()
}
for var, count in missing_data.items():
    if count > 0:
        print(f"  • Missing {var}: {count:,} ({count/len(df)*100:.2f}%)")
    else:
        print(f"  • Missing {var}: None ✓")

# Save summary to text file
summary_file = f'{viz_folder}/00_analysis_summary.txt'
with open(summary_file, 'w') as f:
    f.write("="*80 + "\n")
    f.write("EXPLORATORY DATA ANALYSIS - KEY FINDINGS SUMMARY\n")
    f.write("Singapore Air Quality & Weather (2016-2024)\n")
    f.write("="*80 + "\n\n")
    
    f.write("DATA OVERVIEW:\n")
    f.write(f"  • Total Records: {len(df):,}\n")
    f.write(f"  • Date Range: {df['Date'].min().date()} to {df['Date'].max().date()}\n")
    f.write(f"  • Regions: {', '.join(sorted(df['Region'].unique()))}\n")
    f.write(f"  • Years Covered: {df['Year'].min()} - {df['Year'].max()}\n\n")
    
    f.write("WEATHER PATTERNS:\n")
    f.write(f"  • Average Temperature: {df['Temperature'].mean():.2f}°C\n")
    f.write(f"  • Average Humidity: {df['RelativeHumidity'].mean():.2f}%\n")
    f.write(f"  • Average Wind Speed: {df['WindSpeed'].mean():.2f} km/h\n\n")
    
    f.write("AIR QUALITY PATTERNS:\n")
    f.write(f"  • Average AQI: {df['AQI'].mean():.2f}\n")
    f.write(f"  • Median AQI: {df['AQI'].median():.2f}\n")
    f.write(f"  • Most Common Category: {df['AQI_Category'].mode()[0]}\n")
    f.write(f"  • Good Air Quality Days: {(df['AQI_Category'] == 'Good').sum()/len(df)*100:.1f}%\n\n")
    
    f.write("CORRELATIONS WITH AQI:\n")
    f.write(f"  • Temperature: {df['AQI'].corr(df['Temperature']):.4f}\n")
    f.write(f"  • Humidity: {df['AQI'].corr(df['RelativeHumidity']):.4f}\n")
    f.write(f"  • Wind Speed: {df['AQI'].corr(df['WindSpeed']):.4f}\n\n")
    
    f.write("REGIONAL INSIGHTS:\n")
    for region in sorted(regional_aqi.index):
        f.write(f"  • {region}: AQI {regional_aqi[region]:.2f}\n")
    
    f.write("\nTEMPORAL PATTERNS:\n")
    f.write(f"  • Worst Month: {month_names[monthly_aqi.idxmax()-1]} (AQI: {monthly_aqi.max():.2f})\n")
    f.write(f"  • Best Month: {month_names[monthly_aqi.idxmin()-1]} (AQI: {monthly_aqi.min():.2f})\n")
    f.write(f"  • Worst Year: {yearly_aqi.idxmax()} (AQI: {yearly_aqi.max():.2f})\n")
    f.write(f"  • Best Year: {yearly_aqi.idxmin()} (AQI: {yearly_aqi.min():.2f})\n\n")
    
    

print(f"\n✅ Saved: 00_analysis_summary.txt")

print("\n" + "="*80)
print("✅ EXPLORATORY DATA ANALYSIS COMPLETE!")
print("="*80)
print(f"\n📁 All visualizations saved to: {viz_folder}/")
print("\n📊 Generated Files:")
print("  01_correlation_heatmap.png")
print("  02_variable_distributions.png")
print("  03_regional_boxplots.png")
print("  04_time_series_trends.png")
print("  05_yearly_trends.png")
print("  06_monthly_seasonality.png")
print("  07_regional_comparison.png")
print("  08_aqi_categories.png")
print("  09_aqi_weather_relationships.png")
print("  00_analysis_summary.txt")

print("\n" + "="*80)

✅ Libraries imported and visualization folder created!

LOADING DATASET
✅ Dataset loaded successfully!
   Shape: (29316, 11)
   Date range: 2014-01-01 00:00:00 to 2024-12-29 00:00:00
   Regions: AlorSetar, KotaKinabalu, KualaLumpur, KualaTerengganu, Kuching, Langkawi, PetalingJaya, Seremban

DATA SUMMARY

📊 Descriptive Statistics:
       Temperature  RelativeHumidity  WindSpeed
count     29316.00          29316.00   29316.00
mean         26.54             85.04      11.55
std           1.00              6.29       3.52
min          21.97             46.49       3.26
25%          25.87             82.19       9.11
50%          26.49             86.22      10.97
75%          27.16             89.17      13.38
max          31.43             99.49      39.48

🌍 Regional Statistics:

AlorSetar:
  Records: 3,670
  AQI: 40.50 ± 18.18
  Temp: 27.04°C ± 1.18

KotaKinabalu:
  Records: 3,651
  AQI: 33.46 ± 14.47
  Temp: 26.62°C ± 0.94

KualaLumpur:
  Records: 3,680
  AQI: 52.03 ± 20.56
  Temp: 26

ValueError: could not convert string to float: ' '