# Air Quality PM2.5 Prediction - Exploratory Data Analysis

**Author:** Chukwuahachie Sylvester  
**Date:** December 1, 2025  
**Purpose:** Initial exploration of the air quality dataset

## Objectives
1. Understand the dataset structure
2. Identify data quality issues (missing values, outliers)
3. Analyze feature distributions
4. Explore relationships between features and target (PM2.5)
5. Generate insights for feature engineering

In [None]:


# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Silence library warnings so the cell outputs stay readable
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn grid theme first, then the matplotlib defaults
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Pandas display: show every column, cap rows at 100, floats to 3 decimals
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', lambda x: '%.3f' % x),
):
    pd.set_option(_option, _value)

# Confirm the environment is ready and record the library versions in use
print("Libraries imported successfully!")
for _label, _module in (("Pandas", pd), ("NumPy", np)):
    print(f"{_label} version: {_module.__version__}")

In [None]:
# Location of the raw CSV, relative to this notebook
DATA_PATH = '../data/raw/air_quality.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(DATA_PATH)

# Report the dataset dimensions
n_rows, n_cols = df.shape
print(f"\nDataset Shape: {df.shape}")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")

In [None]:
# Preview the first rows of the dataset under a banner heading
banner = "=" * 60
print(banner, "FIRST 5 ROWS", banner, sep="\n")
display(df.head())

In [None]:
# Preview the last rows of the dataset under a banner heading
banner = "=" * 60
print(f"\n{banner}", "LAST 5 ROWS", banner, sep="\n")
display(df.tail())

In [None]:
# Show a reproducible random sample of rows (fixed seed)
banner = "=" * 60
print(f"\n{banner}", "RANDOM SAMPLE (5 ROWS)", banner, sep="\n")
random_rows = df.sample(5, random_state=42)
display(random_rows)

In [None]:
# Dataset information: column names, non-null counts and dtypes
print('=' * 60)
print('DATASET INFORMATION')
print('=' * 60)
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in display() would render a stray "None".
df.info()


In [None]:
# Report the DataFrame's total memory footprint in megabytes
banner = '=' * 60
print(banner)
print('MEMORY USAGE')
print(banner)

# deep=True asks pandas to include the contents of object columns
total_bytes = df.memory_usage(deep=True).sum()
print(f"Total memory usage: {total_bytes / 1024**2:.2f} MB")


In [None]:
# Statistical summary of the numerical features
print('=' * 60)
# Header spelling fixed ("STASTISTICAL" -> "STATISTICAL") so it matches the
# categorical summary header.
print('STATISTICAL SUMMARY - NUMERICAL FEATURES')
print('=' * 60)
display(df.describe())

In [None]:
# Statistical summary of the categorical (object-dtype) features
print("\n" + "=" * 60)
print("STATISTICAL SUMMARY - CATEGORICAL FEATURES")
print("=" * 60)

# describe(include=['object']) raises ValueError when no column matches,
# so check for object columns first and report their absence instead.
object_columns = df.select_dtypes(include=['object']).columns
if len(object_columns) > 0:
    display(df.describe(include=['object']))
else:
    print("\n No categorical (object) columns to summarize")

In [None]:
# Missing-value audit: per-column counts and percentages, then a heatmap
banner = "=" * 60
print(banner, "MISSING VALUES ANALYSIS", banner, sep="\n")

# Boolean mask of null cells, reused for the counts and the heatmap
null_mask = df.isnull()

# Per-column null counts and their share of all rows
missing = null_mask.sum()
missing_percent = missing / len(df) * 100

# One summary row per column, ordered by descending missing count
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing.values,
    'Missing_Percentage': missing_percent.values,
})
missing_df = missing_df.sort_values('Missing_Count', ascending=False)

# Keep only the columns that actually contain nulls
has_gaps = missing_df['Missing_Count'] > 0
missing_df_filtered = missing_df[has_gaps]

if missing_df_filtered.empty:
    print("\n No missing values found!")
else:
    print("\n Columns with Missing Values:")
    display(missing_df_filtered)

# Heatmap of null cells; skipped when there is nothing missing
if missing.sum() <= 0:
    print("\n No missing values to visualize")
else:
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, cbar=True, cmap='viridis', yticklabels=False)
    plt.title('Missing Values Heatmap', fontsize=16, fontweight='bold')
    plt.xlabel('Columns')
    plt.tight_layout()
    plt.show()

In [None]:
# ============================================
# TARGET VARIABLE ANALYSIS (PM2.5)
# ============================================

# Define your target variable name
TARGET = 'target'

print("=" * 60)
print(f"TARGET VARIABLE ANALYSIS: {TARGET}")
print("=" * 60)

if TARGET in df.columns:
    target_series = df[TARGET]
    # The plots need the nulls removed; drop them once and reuse the result
    target_values = target_series.dropna()

    print(f"\n Summary Statistics for {TARGET}:")
    print(target_series.describe())

    print("\n Additional Statistics:")
    print(f"Skewness: {target_series.skew():.3f}")
    print(f"Kurtosis: {target_series.kurtosis():.3f}")
    print(f"Range: {target_series.min():.2f} - {target_series.max():.2f}")
    print(f"IQR: {target_series.quantile(0.75) - target_series.quantile(0.25):.2f}")

    # Visualizations: 2x2 grid of distribution diagnostics
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Histogram (the density curve is drawn separately in panel 3)
    axes[0, 0].hist(target_values, bins=50, edgecolor='black', alpha=0.7)
    axes[0, 0].set_xlabel(TARGET)
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title(f'Distribution of {TARGET}', fontweight='bold')
    axes[0, 0].grid(alpha=0.3)

    # 2. Box plot
    axes[0, 1].boxplot(target_values, vert=True)
    axes[0, 1].set_ylabel(TARGET)
    axes[0, 1].set_title(f'Box Plot of {TARGET}', fontweight='bold')
    axes[0, 1].grid(alpha=0.3)

    # 3. KDE plot
    target_values.plot(kind='kde', ax=axes[1, 0], linewidth=2)
    axes[1, 0].set_xlabel(TARGET)
    axes[1, 0].set_ylabel('Density')
    axes[1, 0].set_title(f'Kernel Density Plot of {TARGET}', fontweight='bold')
    axes[1, 0].grid(alpha=0.3)

    # 4. Q-Q plot against a normal distribution
    stats.probplot(target_values, dist="norm", plot=axes[1, 1])
    axes[1, 1].set_title(f'Q-Q Plot of {TARGET}', fontweight='bold')
    axes[1, 1].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    # Say so explicitly instead of silently skipping the whole analysis
    print(f"\n Column '{TARGET}' not found in the dataset - update TARGET.")
    print(f" Available columns: {list(df.columns)}")
    


In [None]:


# Group the columns by dtype and list each group
banner = "=" * 60
print(banner, "FEATURE TYPE IDENTIFICATION", banner, sep="\n")

# Split the columns into numerical, categorical and datetime groups
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

# Take the target out of the numerical feature list, if it is in there
try:
    numerical_cols.remove(TARGET)
except ValueError:
    # TARGET is not among the numerical columns; nothing to remove
    pass

print(f"\n Numerical Features ({len(numerical_cols)}):")
for position, column in enumerate(numerical_cols, start=1):
    print(f"   {position}. {column}")

print(f"\n Categorical Features ({len(categorical_cols)}):")
for position, column in enumerate(categorical_cols, start=1):
    distinct = df[column].nunique()
    print(f"   {position}. {column} - {distinct} unique values")

# The datetime section is printed only when such columns exist
if len(datetime_cols) > 0:
    print(f"\n DateTime Features ({len(datetime_cols)}):")
    for position, column in enumerate(datetime_cols, start=1):
        print(f"   {position}. {column}")

print(f"\n Target Variable: {TARGET}")