In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings emitted by pandas or by the
# ARIMA fits below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Column names used by the preprocessing below.
# `pollutants` lists the columns that get imputed and standardized; it must
# stay a list because it is used directly as a DataFrame column selector.
pollutants = ['CO', 'NO2', 'SO2', 'O3', 'PM10', 'PM2.5']
# Target column: rows where it is missing are dropped, and it is kept unscaled.
aqi_column = 'AQI'

def impute_arima(series, order=(2, 1, 2)):
    """Fill the missing values of *series* with ARIMA in-sample predictions.

    The model is fitted on the full series with the gaps left in place, so the
    spacing between observations is preserved and a prediction exists at every
    position, including the missing ones. Observed values are never altered.

    Args:
        series: pandas Series of numeric observations in time order; may
            contain NaN.
        order: ARIMA ``(p, d, q)`` order passed to the model.

    Returns:
        ``series`` itself when it has no missing values; otherwise a new
        Series with the same index in which each NaN is replaced by the
        model's prediction for that position. The input is not modified.

    Raises:
        ValueError: if every value in ``series`` is missing, since there is
            nothing to fit a model on.
    """
    missing = series.isnull().to_numpy()
    if not missing.any():
        return series
    if missing.all():
        raise ValueError("cannot impute a series with no observed values")

    # Keep the NaNs in the fitted data: the state-space ARIMA estimator
    # treats them as missing observations. Dropping them first would remove
    # the gap positions from the model index, leaving no prediction to read
    # back for exactly the points that need filling.
    fitted = ARIMA(series, order=order).fit()

    # predict() with no bounds covers the whole sample; work positionally so
    # the result does not depend on the index having a frequency or being
    # unique.
    # NOTE(review): predictions for gaps at the very start of the series rest
    # on the model's initial state and may be poor — confirm this is acceptable.
    predicted = np.asarray(fitted.predict())

    filled = series.copy()
    filled.iloc[np.flatnonzero(missing)] = predicted[missing]
    return filled

def preprocess_data(file_path):
    """Load a CSV of measurements and return model-ready features.

    Steps, in order:
      1. Read ``file_path`` with the 'datetime' column parsed as dates and
         used as the index.
      2. Impute each pollutant column via ``impute_arima``.
      3. Discard rows whose AQI value is missing.
      4. Standardize the pollutant columns with a freshly fitted
         ``StandardScaler`` and attach the AQI column unscaled.

    Returns:
        A ``(df_scaled, scaler)`` tuple: the scaled pollutant columns plus the
        raw AQI column, and the fitted scaler.
    """
    frame = pd.read_csv(file_path, index_col='datetime', parse_dates=['datetime'])

    # Imputation runs before the AQI filter, i.e. on every row of the file.
    for name in pollutants:
        frame[name] = impute_arima(frame[name])

    has_aqi = frame[aqi_column].notna()
    frame = frame[has_aqi]

    scaler = StandardScaler()
    standardized = scaler.fit_transform(frame[pollutants])
    df_scaled = pd.DataFrame(standardized, index=frame.index, columns=pollutants)

    # The target is carried over as-is so it stays in its original units.
    df_scaled[aqi_column] = frame[aqi_column].values

    return df_scaled, scaler

# Run the preprocessing on 'epa_data.csv' (resolved against the current
# working directory); keep the fitted scaler alongside the processed frame.
df_processed, scaler = preprocess_data('epa_data.csv')
