# Air Quality PM2.5 Prediction - Exploratory Data Analysis

**Author:** Chukwuahachie Sylvester  
**Date:** December 1, 2025  
**Purpose:** Initial exploration of the air quality dataset

## Objectives
1. Understand the dataset structure
2. Identify data quality issues (missing values, outliers)
3. Analyze feature distributions
4. Explore relationships between features and target (PM2.5)
5. Generate insights for feature engineering

In [None]:


# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Plot styling: seaborn grid theme plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Pandas display: show every column, cap rows at 100, three-decimal floats
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', lambda value: '%.3f' % value),
):
    pd.set_option(option_name, option_value)

# Confirm the environment is ready and record the library versions in use
print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

In [None]:
# Relative path to the raw air quality CSV
DATA_PATH = '../data/raw/air_quality.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(DATA_PATH)

# Report the dimensions of the loaded frame
row_count, column_count = df.shape
print(f"\nDataset Shape: {df.shape}")
print(f"Rows: {row_count:,}")
print(f"Columns: {column_count}")

In [None]:
# Show the first five records under a banner heading
print("=" * 60, "FIRST 5 ROWS", "=" * 60, sep="\n")
display(df.head(n=5))

In [None]:
# Show the last five records under a banner heading (preceded by a blank line)
heading_rule = "=" * 60
print(f"\n{heading_rule}")
print("LAST 5 ROWS")
print(heading_rule)
display(df.tail(n=5))

In [None]:
# Show five randomly drawn records; the fixed seed keeps the sample repeatable
heading_rule = "=" * 60
print(f"\n{heading_rule}")
print("RANDOM SAMPLE (5 ROWS)")
print(heading_rule)
random_rows = df.sample(n=5, random_state=42)
display(random_rows)

In [None]:
# Dataset information: column names, non-null counts and dtypes
print('=' * 60)
print('DATASET INFORMATION')
print('=' * 60)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" afterwards.
df.info()


In [None]:
# Report the frame's total memory footprint in MB; deep=True also measures
# the contents of object columns rather than only their pointers
print('=' * 60, 'MEMORY USAGE', '=' * 60, sep='\n')
total_bytes = df.memory_usage(deep=True).sum()
print(f"Total memory usage: {total_bytes / 1024**2:.2f} MB")


In [None]:
# Statistical summary of the numerical columns (count, mean, std, quartiles)
print('=' * 60)
# Heading spelling corrected (was "STASTISTICAL") so it matches the
# categorical summary heading printed by the next cell.
print('STATISTICAL SUMMARY - NUMERICAL FEATURES')
print('=' * 60)
display(df.describe())

In [None]:
# Statistical summary of the object-dtype (categorical/text) columns
print("\n" + "=" * 60)
print("STATISTICAL SUMMARY - CATEGORICAL FEATURES")
print("=" * 60)

# describe(include=['object']) raises ValueError when the frame has no
# object-dtype columns, so check first and report that case explicitly.
if df.select_dtypes(include=['object']).shape[1] > 0:
    display(df.describe(include=['object']))
else:
    print("\n No categorical (object) columns found")

In [None]:
# Missing-value audit: per-column counts and percentages, plus a heatmap
print("=" * 60, "MISSING VALUES ANALYSIS", "=" * 60, sep="\n")

# Per-column null counts and their share of all rows
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100

# One row per column, ordered from most to fewest missing values
missing_table = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing.values,
    'Missing_Percentage': missing_percent.values
})
missing_df = missing_table.sort_values('Missing_Count', ascending=False)

# Keep only the columns that actually have gaps
missing_df_filtered = missing_df.loc[missing_df['Missing_Count'] > 0]

# A column has gaps exactly when the total null count is positive, so the
# table and the heatmap share a single branch.
if missing.sum() > 0:
    print("\n Columns with Missing Values:")
    display(missing_df_filtered)

    # Row-by-column map of where the nulls sit in the frame
    plt.figure(figsize=(12, 6))
    sns.heatmap(df.isnull(), cbar=True, cmap='viridis', yticklabels=False)
    plt.title('Missing Values Heatmap', fontsize=16, fontweight='bold')
    plt.xlabel('Columns')
    plt.tight_layout()
    plt.show()
else:
    print("\n No missing values found!")
    print("\n No missing values to visualize")

In [None]:
# ============================================
# TARGET VARIABLE ANALYSIS (PM2.5)
# ============================================

# Name of the target column; later cells read this constant
TARGET = 'target'

print("=" * 60)
print(f"TARGET VARIABLE ANALYSIS: {TARGET}")
print("=" * 60)

if TARGET in df.columns:
    # Drop nulls once; the pandas statistics below skip NaN anyway, and the
    # matplotlib/scipy plots need a NaN-free series.
    target_values = df[TARGET].dropna()

    print(f"\n Summary Statistics for {TARGET}:")
    print(target_values.describe())

    print("\n Additional Statistics:")
    print(f"Skewness: {target_values.skew():.3f}")
    print(f"Kurtosis: {target_values.kurtosis():.3f}")
    print(f"Range: {target_values.min():.2f} - {target_values.max():.2f}")
    print(f"IQR: {target_values.quantile(0.75) - target_values.quantile(0.25):.2f}")

    # Four views of the target distribution on a 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Histogram (the density curve is drawn separately in panel 3)
    axes[0, 0].hist(target_values, bins=50, edgecolor='black', alpha=0.7)
    axes[0, 0].set_xlabel(TARGET)
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title(f'Distribution of {TARGET}', fontweight='bold')
    axes[0, 0].grid(alpha=0.3)

    # 2. Box plot
    axes[0, 1].boxplot(target_values, vert=True)
    axes[0, 1].set_ylabel(TARGET)
    axes[0, 1].set_title(f'Box Plot of {TARGET}', fontweight='bold')
    axes[0, 1].grid(alpha=0.3)

    # 3. KDE plot
    target_values.plot(kind='kde', ax=axes[1, 0], linewidth=2)
    axes[1, 0].set_xlabel(TARGET)
    axes[1, 0].set_ylabel('Density')
    axes[1, 0].set_title(f'Kernel Density Plot of {TARGET}', fontweight='bold')
    axes[1, 0].grid(alpha=0.3)

    # 4. Q-Q plot against a normal distribution
    stats.probplot(target_values, dist="norm", plot=axes[1, 1])
    axes[1, 1].set_title(f'Q-Q Plot of {TARGET}', fontweight='bold')
    axes[1, 1].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    # Previously this case produced no output at all, hiding a wrong TARGET name
    print(f"\n Column '{TARGET}' not found in the dataset; skipping target analysis")
    


In [None]:


# Split the columns into numerical, categorical and datetime groups
print("=" * 60)
print("FEATURE TYPE IDENTIFICATION")
print("=" * 60)

# 'number' selects every numeric dtype; the previous ['int64', 'float64']
# list skipped other widths such as int32 or float32.
numerical_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# Only columns already stored with a datetime dtype are detected here
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

# The target is analysed separately, so keep it out of the feature list
if TARGET in numerical_cols:
    numerical_cols.remove(TARGET)

print(f"\n Numerical Features ({len(numerical_cols)}):")
for i, col in enumerate(numerical_cols, 1):
    print(f"   {i}. {col}")

print(f"\n Categorical Features ({len(categorical_cols)}):")
for i, col in enumerate(categorical_cols, 1):
    print(f"   {i}. {col} - {df[col].nunique()} unique values")

if datetime_cols:
    print(f"\n DateTime Features ({len(datetime_cols)}):")
    for i, col in enumerate(datetime_cols, 1):
        print(f"   {i}. {col}")

print(f"\n Target Variable: {TARGET}")

In [None]:
# ============================================
# NUMERICAL FEATURES - DISTRIBUTION ANALYSIS
# ============================================

print("=" * 60)
print("NUMERICAL FEATURES DISTRIBUTION ANALYSIS")
print("=" * 60)

print(f"\nAnalyzing {len(numerical_cols)} numerical features...")

# Select the numeric columns once instead of re-slicing for every statistic
numeric_frame = df[numerical_cols]

# One row per feature with its location, spread, shape and missing share
stats_summary = pd.DataFrame({
    'Mean': numeric_frame.mean(),
    'Median': numeric_frame.median(),
    'Std': numeric_frame.std(),
    'Min': numeric_frame.min(),
    'Max': numeric_frame.max(),
    'Skewness': numeric_frame.skew(),
    'Missing_%': (numeric_frame.isnull().sum() / len(df) * 100)
}).round(3)

print("\n Top 10 Most Variable Features (by Std Dev):")
display(stats_summary.nlargest(10, 'Std')[['Mean', 'Std', 'Skewness', 'Missing_%']])

# Rank by absolute skewness: ranking the signed value would list only
# right-skewed features and ignore strongly left-skewed ones.
most_skewed = stats_summary['Skewness'].abs().nlargest(10).index
print("\n Top 10 Most Skewed Features:")
display(stats_summary.loc[most_skewed, ['Mean', 'Skewness', 'Missing_%']])

In [None]:
# ============================================
# WEATHER FEATURES ANALYSIS
# ============================================

print("=" * 60)
print("WEATHER FEATURES ANALYSIS")
print("=" * 60)

# Candidate weather columns
weather_features = [
    'precipitable_water_entire_atmosphere',
    'relative_humidity_2m_above_ground',
    'specific_humidity_2m_above_ground',
    'temperature_2m_above_ground',
    'u_component_of_wind_10m_above_ground',
    'v_component_of_wind_10m_above_ground'
]

# Keep only the candidates that exist in the loaded frame
weather_features = [col for col in weather_features if col in df.columns]

print(f"\n Analyzing {len(weather_features)} weather features")

# Grid layout: three plots per row, as many rows as needed
n_features = len(weather_features)
n_cols = 3
n_rows = (n_features + n_cols - 1) // n_cols

if n_features == 0:
    # plt.subplots(0, 3) would raise, so skip plotting when nothing matched
    print("\n None of the expected weather columns are present; skipping plots")
else:
    # squeeze=False always returns a 2-D array of Axes, so flatten() works for
    # any feature count (the old "[axes]" fallback wrapped an array of Axes).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for idx, feature in enumerate(weather_features):
        ax = axes[idx]
        values = df[feature].dropna()

        # Histogram on the primary axis, density curve on a secondary y-axis
        values.hist(bins=50, ax=ax, alpha=0.7, edgecolor='black')
        values.plot(kind='kde', ax=ax, secondary_y=True, color='red', linewidth=2)

        ax.set_xlabel(feature.replace('_', ' ').title())
        ax.set_ylabel('Frequency')
        ax.set_title(f'Distribution: {feature[:30]}...', fontweight='bold', fontsize=10)
        ax.grid(alpha=0.3)

        # Summary statistics in the top-left corner of the panel
        stats_text = f"Mean: {values.mean():.2f}\nStd: {values.std():.2f}\nSkew: {values.skew():.2f}"
        ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
                fontsize=8)

    # Hide the unused panels of the last row
    for idx in range(n_features, len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()



In [None]:
# ============================================
# CORRELATION ANALYSIS
# ============================================

print("=" * 60)
print("CORRELATION ANALYSIS")
print("=" * 60)

# Pairwise correlation of all numeric columns
corr_matrix = df.corr(numeric_only=True)

# Correlation of every column with the target, strongest positive first.
# NaN entries (for example from zero-variance columns) are dropped because
# sort_values puts them last, where they were shown as "most negative".
target_corr = corr_matrix[TARGET].dropna().sort_values(ascending=False)

# Remove the target's self-correlation by label rather than by position
feature_corr = target_corr.drop(labels=TARGET, errors='ignore')

print("\n Top 15 Features Most Correlated with Target:")
print(feature_corr.head(15))

print("\n Top 15 Features Most Negatively Correlated with Target:")
print(feature_corr.tail(15))

# Heatmap of the 20 highest entries of target_corr (this includes the target)
top_features = target_corr.head(20).index.tolist()

plt.figure(figsize=(14, 12))
# Reuse the matrix computed above instead of correlating the columns again
sns.heatmap(corr_matrix.loc[top_features, top_features],
            annot=True,
            fmt='.2f',
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix - Top 20 Features vs Target',
          fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# ============================================
# OUTLIER DETECTION
# ============================================

# Section heading for the outlier analysis output
print("=" * 60, "OUTLIER DETECTION", "=" * 60, sep="\n")

def detect_outliers_iqr(data, column):
    """Count the values of ``data[column]`` outside the 1.5 * IQR fences.

    A value is an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th percentiles
    of the column.

    Args:
        data: DataFrame that holds the column.
        column: Name of the column to inspect.

    Returns:
        A ``(count, percentage)`` tuple: the number of outlying rows and
        that number as a percentage of all rows in ``data`` (rows with a
        missing value count towards the total). An empty frame yields
        ``(0, 0.0)`` instead of raising ZeroDivisionError.
    """
    total_rows = len(data)
    if total_rows == 0:
        return 0, 0.0

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Count through a boolean mask rather than copying the matching rows;
    # comparisons with NaN are False, so missing values are never outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(is_outlier.sum())

    return outlier_count, (outlier_count / total_rows) * 100

# Outliers in the target variable (uses the shared TARGET constant instead of
# repeating the column name as a literal)
outliers_count, outliers_pct = detect_outliers_iqr(df, TARGET)
print("\n Target Variable")
print(f"   Count: {outliers_count}")
print(f"   Percentage: {outliers_pct:.2f}%")

# Outliers in the weather features selected earlier in the notebook
print("\n Weather Features Outliers:")
outlier_summary = []

for feature in weather_features:
    count, pct = detect_outliers_iqr(df, feature)
    outlier_summary.append({
        'Feature': feature,
        'Outlier_Count': count,
        'Outlier_Percentage': round(pct, 2)
    })

# Naming the columns explicitly keeps sort_values working when the list is empty
outlier_df = pd.DataFrame(
    outlier_summary,
    columns=['Feature', 'Outlier_Count', 'Outlier_Percentage']
).sort_values('Outlier_Percentage', ascending=False)
display(outlier_df)

# One boxplot per weather feature, three per row
if weather_features:
    box_cols = 3
    box_rows = (len(weather_features) + box_cols - 1) // box_cols
    # squeeze=False always yields a 2-D array of Axes, so flatten() is safe
    fig, axes = plt.subplots(box_rows, box_cols,
                             figsize=(18, 5 * box_rows), squeeze=False)
    axes = axes.flatten()

    for idx, feature in enumerate(weather_features):
        ax = axes[idx]
        df.boxplot(column=feature, ax=ax, patch_artist=True)
        ax.set_title(f'Outliers: {feature[:30]}...', fontweight='bold', fontsize=10)
        ax.set_ylabel('Value')
        ax.grid(alpha=0.3)

    # Hide grid cells that have no feature to show
    for idx in range(len(weather_features), len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("\n No weather features available for boxplots")

In [None]:
# ============================================
# FEATURE VS TARGET RELATIONSHIPS
# ============================================

print("=" * 60)
print("FEATURE VS TARGET RELATIONSHIPS")
print("=" * 60)


def _plot_target_relationships(features, scatter_color=None):
    """Draw one scatter plot per feature against the target, three per row.

    Each panel shows the rows where both the feature and the target are
    present, a dashed least-squares trend line, and the correlation between
    the two in the title. Unused panels of the grid are hidden.

    Args:
        features: Names of the feature columns to plot.
        scatter_color: Optional colour for the scatter points; when None the
            matplotlib default is used.
    """
    if not features:
        return

    grid_cols = 3
    grid_rows = (len(features) + grid_cols - 1) // grid_cols
    _, grid = plt.subplots(grid_rows, grid_cols,
                           figsize=(18, 5 * grid_rows), squeeze=False)
    panels = grid.flatten()
    scatter_kwargs = {} if scatter_color is None else {'color': scatter_color}

    for ax, feature in zip(panels, features):
        # Keep only rows where both values are present
        pairs = df[[feature, TARGET]].dropna()
        x_values = pairs[feature]
        y_values = pairs[TARGET]

        ax.scatter(x_values, y_values, alpha=0.3, s=10, **scatter_kwargs)

        # A straight-line fit needs at least two distinct x values
        if x_values.nunique() > 1:
            trend = np.poly1d(np.polyfit(x_values, y_values, 1))
            x_sorted = x_values.sort_values()
            ax.plot(x_sorted, trend(x_sorted), "r--", linewidth=2, label='Trend')
            ax.legend()

        corr_val = x_values.corr(y_values)
        ax.set_xlabel(feature[:40], fontsize=9)
        ax.set_ylabel('Target (PM2.5)')
        ax.set_title(f'{feature[:30]}...\nCorr: {corr_val:.3f}',
                     fontweight='bold', fontsize=9)
        ax.grid(alpha=0.3)

    # Hide the grid cells that have no feature to show
    for ax in panels[len(features):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


# Five strongest positive and five strongest negative correlations; the
# target's own entry is removed by label so it is never plotted against itself.
feature_corr = target_corr.drop(labels=TARGET, errors='ignore')
top_positive_features = feature_corr.head(5).index.tolist()
top_negative_features = feature_corr.tail(5).index.tolist()

print("\n Plotting relationships for top correlated features...")

# Positive correlations use the default colour, negative ones use orange
_plot_target_relationships(top_positive_features)
_plot_target_relationships(top_negative_features, scatter_color='orange')

In [None]:
# ============================================
# CATEGORICAL FEATURES ANALYSIS
# ============================================

print("=" * 60, "CATEGORICAL FEATURES ANALYSIS", "=" * 60, sep="\n")

print(f"\nAnalyzing {len(categorical_cols)} categorical features:")

# For each categorical column: distinct-value count and the ten most frequent
section_rule = '=' * 50
for feature in categorical_cols:
    column_values = df[feature]

    print(f"\n{section_rule}")
    print(f"Feature: {feature}")
    print(section_rule)

    # Frequencies come back sorted from most to least common
    frequencies = column_values.value_counts()
    print(f"\nUnique values: {column_values.nunique()}")
    print("\nTop 10 most frequent values:")
    print(frequencies.head(10))
    

In [None]:
# ============================================
# TEMPORAL ANALYSIS
# ============================================

print("=" * 60)
print("TEMPORAL ANALYSIS")
print("=" * 60)

# Convert Date column to datetime if it exists
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Extract temporal features
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear
    
    print("\nTemporal features extracted")
    
    # Time series plot
    plt.figure(figsize=(16, 6))
    
    # Daily average PM2.5
    daily_avg = df.groupby('Date')['target'].mean()
    plt.plot(daily_avg.index, daily_avg.values, alpha=0.6, linewidth=1)
    
    # Add 7-day moving average
    plt.plot(daily_avg.index, daily_avg.rolling(window=7).mean(), 
             color='red', linewidth=2, label='7-day Moving Average')
    
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('PM2.5 (μg/m³)', fontsize=12)
    plt.title('PM2.5 Concentration Over Time', fontsize=16, fontweight='bold')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    # Monthly patterns
    plt.figure(figsize=(14, 5))
    
    plt.subplot(1, 2, 1)
    monthly_stats = df.groupby('Month')['target'].mean()
    monthly_stats.plot(kind='bar', color='steelblue', edgecolor='black')
    plt.xlabel('Month')
    plt.ylabel('Mean Target')
    plt.title('Average Target by Month', fontweight='bold')
    plt.xticks(rotation=0)
    plt.grid(alpha=0.3)
    
    plt.subplot(1, 2, 2)
    dow_stats = df.groupby('DayOfWeek')['target'].mean()
    dow_stats.plot(kind='bar', color='coral', edgecolor='black')
    plt.xlabel('Day of Week (0=Monday)')
    plt.ylabel('Mean Target')
    plt.title('Average target by Day of Week', fontweight='bold')
    plt.xticks(rotation=0)
    plt.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
else:
    print("\n No Date column found for temporal analysis")