In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [3]:
station_day = pd.read_csv('station_day.csv\station_day.csv')
stations = pd.read_csv('station_day.csv\stations.csv')

print("Station day shape:", station_day.shape)
print("Stations shape:", stations.shape)

Station day shape: (108035, 16)
Stations shape: (230, 5)


In [None]:
print("Common columns:", set(station_day.columns).intersection(stations.columns))

if 'station_id' in station_day.columns and 'station_id' in stations.columns:
    data = station_day.merge(stations, on='station_id', how='left')
else:
    data = station_day.copy()

print("Merged shape:", data.shape)

Common columns: {'StationId'}
Merged shape: (108035, 16)


In [5]:
drop_cols = ['StationId', 'StationName', 'state', 'city', 'location', 'Date', 'date']
drop_cols = [c for c in drop_cols if c in data.columns]
data = data.drop(columns=drop_cols, errors='ignore')

### Handling Missing Values

In [7]:
missing_percent = data.isna().mean().sort_values(ascending=False)
print("\nMissing value percentages:\n", missing_percent)

cols_high_missing = missing_percent[missing_percent > 0.7].index.tolist()
print("\nColumns with >70% missing:", cols_high_missing)


Missing value percentages:
 Xylene        0.788050
NH3           0.445272
PM10          0.395298
Toluene       0.358236
Benzene       0.291156
O3            0.236664
SO2           0.233295
PM2.5         0.200167
AQI_Bucket    0.194474
AQI           0.194474
NO            0.158338
NO2           0.153163
NOx           0.143472
CO            0.120313
dtype: float64

Columns with >70% missing: ['Xylene']


In [8]:
# Create two datasets:
#   1. one including xylene (after imputation)
#   2. one excluding xylene

data_with_xylene = data.copy()
data_without_xylene = data.drop(columns=cols_high_missing, errors='ignore')

In [9]:
# numeric columns

numeric_cols_with = data_with_xylene.select_dtypes(include=[np.number]).columns
numeric_cols_without = data_without_xylene.select_dtypes(include=[np.number]).columns

In [10]:
# SimpleImputer for median filling
imputer = SimpleImputer(strategy='median')

data_with_xylene[numeric_cols_with] = imputer.fit_transform(data_with_xylene[numeric_cols_with])
data_without_xylene[numeric_cols_without] = imputer.fit_transform(data_without_xylene[numeric_cols_without])

In [None]:
# Handle categorical columns if any i.e., one-hot encoding
cat_cols = data_with_xylene.select_dtypes(exclude=[np.number]).columns

if len(cat_cols) > 0:
    data_with_xylene = pd.get_dummies(data_with_xylene, columns=cat_cols, drop_first=True)
    data_without_xylene = pd.get_dummies(data_without_xylene, columns=cat_cols, drop_first=True)

### Scaling

In [13]:
X_with = data_with_xylene.drop(columns=['AQI', 'AQI_Bucket'], errors='ignore')
y_with = data_with_xylene['AQI']

X_without = data_without_xylene.drop(columns=['AQI', 'AQI_Bucket'], errors='ignore')
y_without = data_without_xylene['AQI']

In [14]:
# Standardize features
scaler_with = StandardScaler()
scaler_without = StandardScaler()

X_with_scaled = pd.DataFrame(
    scaler_with.fit_transform(X_with),
    columns=X_with.columns
)

X_without_scaled = pd.DataFrame(
    scaler_without.fit_transform(X_without),
    columns=X_without.columns
)

In [15]:
# Final Cleaned Outputs
print("\n✅ Data cleaning & feature engineering done!")
print("With Xylene:", X_with_scaled.shape)
print("Without Xylene:", X_without_scaled.shape)

X_with_scaled['AQI'] = y_with.values
X_without_scaled['AQI'] = y_without.values

X_with_scaled.to_csv("cleaned_with_xylene.csv", index=False)
X_without_scaled.to_csv("cleaned_without_xylene.csv", index=False)

print("Saved: cleaned_with_xylene.csv & cleaned_without_xylene.csv")


✅ Data cleaning & feature engineering done!
With Xylene: (108035, 17)
Without Xylene: (108035, 16)
Saved: cleaned_with_xylene.csv & cleaned_without_xylene.csv
