In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
sns.set_palette("husl")

In [None]:
def load_data(file_path):
    try:
        
        encodings = ['utf-8', 'latin-1', 'ISO-8859-1']
        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, encoding=encoding)
                print(f"Dataset loaded successfully with {encoding} encoding")
                return df
            except UnicodeDecodeError:
                continue
        
    
        df = pd.read_csv(file_path)
        print("Dataset loaded successfully (auto encoding detection)")
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

file_path = "UrbanAirPollutionDataset.csv" 
df = load_data(file_path)


Dataset loaded successfully with utf-8 encoding


In [25]:
if df is not None:
    print(f"Dataset shape: {df.shape}")
    print("\nFirst few rows:")
    display(df.head())

Dataset shape: (175205, 15)

First few rows:


Unnamed: 0,Station_ID,DateTime,PM2.5,PM10,NO₂,SO₂,CO,O₃,Temp_C,Humidity_%,Wind_Speed_mps,Wind_Direction_deg,Pressure_hPa,Rain_mm,AQI_Target
0,1,2020-01-01 00:00:00,86.397213,111.814972,26.599649,3.875088,0.570793,42.903768,31.491409,45.704988,3.114026,166.148254,1012.641964,1.0,59.759255
1,1,2020-01-01 01:00:00,73.311679,110.919391,18.585412,11.820056,0.562121,15.127661,24.191965,41.544655,3.480094,282.15806,1011.779447,0.0,45.256996
2,1,2020-01-01 02:00:00,61.359818,47.063317,20.605215,16.531417,0.466105,32.752213,30.719383,73.849227,4.94946,109.450898,1004.118484,0.0,43.411916
3,1,2020-01-01 03:00:00,54.081632,122.981322,14.682654,14.453442,0.607025,18.910033,18.24315,42.086443,1.834147,229.796006,1009.154244,0.0,35.227619
4,1,2020-01-01 04:00:00,43.221175,102.259959,41.515463,17.49567,0.537119,39.582884,33.949777,42.393921,1.987593,159.50137,1021.100094,0.0,41.981803


In [27]:
    print("\nDataset info:")
    df.info()
    


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175205 entries, 0 to 175204
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Station_ID          175205 non-null  int64  
 1   DateTime            175205 non-null  object 
 2   PM2.5               175205 non-null  float64
 3   PM10                175205 non-null  float64
 4   NO₂                 175205 non-null  float64
 5   SO₂                 175205 non-null  float64
 6   CO                  175205 non-null  float64
 7   O₃                  175205 non-null  float64
 8   Temp_C              175205 non-null  float64
 9   Humidity_%          175205 non-null  float64
 10  Wind_Speed_mps      175205 non-null  float64
 11  Wind_Direction_deg  175205 non-null  float64
 12  Pressure_hPa        175205 non-null  float64
 13  Rain_mm             175205 non-null  float64
 14  AQI_Target          175205 non-null  float64
dtypes: float64(13), int

In [29]:
  print("\nSummary statistics:")
  display(df.describe())


Summary statistics:


Unnamed: 0,Station_ID,PM2.5,PM10,NO₂,SO₂,CO,O₃,Temp_C,Humidity_%,Wind_Speed_mps,Wind_Direction_deg,Pressure_hPa,Rain_mm,AQI_Target
count,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0,175205.0
mean,3.0,60.028063,89.970108,24.949886,10.002496,0.599942,30.001676,27.005798,64.957909,2.50336,179.974158,1009.99952,0.094649,43.499332
std,1.414218,19.993724,30.053351,9.964945,3.995001,0.199632,10.002468,4.992958,15.001583,1.003688,103.973354,4.996531,0.238989,10.627132
min,1.0,-30.550414,-33.232821,-23.08719,-6.575932,-0.357247,-11.472321,3.083805,-3.501162,-1.857991,0.000541,987.584963,0.0,-8.410248
25%,2.0,46.556391,69.602188,18.246506,7.307356,0.465172,23.26106,23.647684,54.867365,1.827088,89.873992,1006.60921,0.0,36.388595
50%,3.0,59.95078,89.972045,24.966761,10.006975,0.599811,30.028232,27.017236,64.970205,2.503869,179.918778,1010.013031,0.0,43.470426
75%,4.0,73.515683,110.225852,31.685532,12.707325,0.734121,36.713979,30.370403,75.068157,3.177856,270.079199,1013.371262,0.0,50.647114
max,5.0,145.072926,224.750097,68.019015,28.328038,1.49989,75.843519,48.162096,130.720247,7.156243,359.998065,1033.106645,1.0,89.239073


In [31]:
def preprocess_data(df):
   
    df_clean = df.copy()
    
    print("Missing values before handling:")
    print(df_clean.isnull().sum())
    
    if 'timestamp' in df_clean.columns:
        df_clean.sort_values('timestamp', inplace=True)
    
    numerical_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
    
    if len(numerical_cols) > 0:
        imputer = KNNImputer(n_neighbors=5)
        df_clean[numerical_cols] = imputer.fit_transform(df_clean[numerical_cols])
    
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            mode_val = df_clean[col].mode()[0]
            df_clean[col].fillna(mode_val, inplace=True)
    
    print("\nMissing values after handling:")
    print(df_clean.isnull().sum())
    
    date_columns = ['timestamp', 'date', 'time', 'datetime']  
    for col in date_columns:
        if col in df_clean.columns:
            df_clean[col] = pd.to_datetime(df_clean[col])
    
    
    if 'timestamp' in df_clean.columns:
        df_clean['hour'] = df_clean['timestamp'].dt.hour
        df_clean['day_of_week'] = df_clean['timestamp'].dt.dayofweek
        df_clean['month'] = df_clean['timestamp'].dt.month
        df_clean['season'] = (df_clean['timestamp'].dt.month % 12 + 3) // 3
        df_clean['is_weekend'] = df_clean['day_of_week'].isin([5, 6]).astype(int)
    
    
    for col in categorical_cols:
        if df_clean[col].nunique() <= 10:  
            df_clean = pd.get_dummies(df_clean, columns=[col], prefix=col, drop_first=True)
        else:  
            le = LabelEncoder()
            df_clean[col] = le.fit_transform(df_clean[col].astype(str))
    
    
    initial_count = len(df_clean)
    df_clean.drop_duplicates(inplace=True)
    final_count = len(df_clean)
    print(f"\nRemoved {initial_count - final_count} duplicate rows")
    
   
    for col in numerical_cols:
        if col in df_clean.columns:
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            
            df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
            df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
    
    return df_clean


if df is not None:
    df_processed = preprocess_data(df)
    print(f"\nProcessed dataset shape: {df_processed.shape}")
    print("\nProcessed dataset columns:")
    print(df_processed.columns.tolist())

Missing values before handling:
Station_ID            0
DateTime              0
PM2.5                 0
PM10                  0
NO₂                   0
SO₂                   0
CO                    0
O₃                    0
Temp_C                0
Humidity_%            0
Wind_Speed_mps        0
Wind_Direction_deg    0
Pressure_hPa          0
Rain_mm               0
AQI_Target            0
dtype: int64

Missing values after handling:
Station_ID            0
DateTime              0
PM2.5                 0
PM10                  0
NO₂                   0
SO₂                   0
CO                    0
O₃                    0
Temp_C                0
Humidity_%            0
Wind_Speed_mps        0
Wind_Direction_deg    0
Pressure_hPa          0
Rain_mm               0
AQI_Target            0
dtype: int64

Removed 0 duplicate rows

Processed dataset shape: (175205, 15)

Processed dataset columns:
['Station_ID', 'DateTime', 'PM2.5', 'PM10', 'NO₂', 'SO₂', 'CO', 'O₃', 'Temp_C', 'Humidity_%', 'Wi

In [37]:
import pandas as pd

def engineer_features(df):
   
    df_eng = df.copy()

    pollutant_columns = [col for col in df_eng.columns 
                         if any(x in col.lower() for x in ['pm2.5', 'pm10', 'no2', 'so2', 'co', 'o3'])]
    
    if not pollutant_columns:
        print("No pollutant columns found!")
        return df_eng
    
    print(f"Found pollutant columns: {pollutant_columns}")

    
    for col in pollutant_columns:
        if df_eng[col].std() != 0:
            df_eng[f'{col}_norm'] = (df_eng[col] - df_eng[col].mean()) / df_eng[col].std()
    
   
    weights = {'pm2.5': 0.3, 'pm10': 0.2, 'no2': 0.15, 'so2': 0.15, 'co': 0.1, 'o3': 0.1}
    aqi_components = []
    for poll, weight in weights.items():
        for col in pollutant_columns:
            if poll in col.lower() and f'{col}_norm' in df_eng.columns:
                aqi_components.append(df_eng[f'{col}_norm'] * weight)
    
    if aqi_components:
        df_eng['aqi'] = sum(aqi_components)
        df_eng['aqi'] = (df_eng['aqi'] - df_eng['aqi'].min()) / (df_eng['aqi'].max() - df_eng['aqi'].min()) * 500

    if 'aqi' in df_eng.columns:
        bins = [0, 50, 100, 150, 200, 300, 500]
        labels = ['Good', 'Moderate', 'Unhealthy for Sensitive Groups', 
                  'Unhealthy', 'Very Unhealthy', 'Hazardous']
        df_eng['pollution_level'] = pd.cut(df_eng['aqi'], bins=bins, labels=labels, include_lowest=True)

    if 'hour' in df_eng.columns:
        df_eng['morning_rush'] = ((df_eng['hour'] >= 7) & (df_eng['hour'] <= 9)).astype(int)
        df_eng['evening_rush'] = ((df_eng['hour'] >= 16) & (df_eng['hour'] <= 18)).astype(int)
    if 'day_of_week' in df_eng.columns:
        df_eng['is_weekday'] = (df_eng['day_of_week'] < 5).astype(int)

    if all(col in df_eng.columns for col in ['timestamp', 'station_id']):
        df_eng.sort_values(['station_id', 'timestamp'], inplace=True)
        for poll in pollutant_columns[:3]:
            if poll in df_eng.columns:
                df_eng[f'{poll}_lag_1'] = df_eng.groupby('station_id')[poll].shift(1)
                df_eng[f'{poll}_lag_24'] = df_eng.groupby('station_id')[poll].shift(24)

    return df_eng


if df is not None:
    df_final = engineer_features(df_processed)
    print(f"\nFinal dataset shape after feature engineering: {df_final.shape}")


Found pollutant columns: ['PM2.5', 'PM10', 'CO']

Final dataset shape after feature engineering: (175205, 20)
