In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor

# Enable experimental feature to use IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Read the competition's train and test splits (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/zindi-african-air-quality-prediction-challenge/Train.csv',
        '/kaggle/input/zindi-african-air-quality-prediction-challenge/Test.csv',
    )
)

In [None]:
def add_date_features(df):
    """Attach calendar-derived columns to ``df``, modifying it in place.

    The ``date`` column is parsed with ``pd.to_datetime`` and overwritten with
    the parsed values. The following columns are then added (or replaced), in
    this order: ``year``, ``month``, ``day``, ``weekday``, ``is_weekend``,
    ``month_sin`` and ``month_cos``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    None
        The frame passed in is mutated; nothing is returned.
    """
    df['date'] = pd.to_datetime(df['date'])
    stamps = df['date'].dt

    # Plain calendar components read straight off the datetime accessor.
    for part in ('year', 'month', 'day', 'weekday'):
        df[part] = getattr(stamps, part)

    # pandas numbers weekdays from Monday=0, so 5 and 6 are Saturday/Sunday.
    df['is_weekend'] = df['weekday'].ge(5).astype(int)

    # Map the month onto a 12-step circle so December and January end up adjacent.
    month_angle = 2 * np.pi * df['month'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

# Enrich both splits with the date-derived columns; add_date_features edits
# each frame in place, so its return value is not needed.
for data in (train_data, test_data):
    add_date_features(data)

In [None]:
# Column roles used when building the model inputs.
drop_columns = ['id', 'site_id', 'date']  # removed from both feature matrices
categorical_features = ['city', 'country']
numerical_features = [
    'year', 'month', 'day', 'weekday',
    'is_weekend', 'month_sin', 'month_cos',
]

# Training split: target is pm2_5, features are everything else that is kept.
y_train = train_data['pm2_5']
X_train = train_data.drop(columns=[*drop_columns, 'pm2_5'])

# Test split: keep the ids aside so they can be attached to the predictions.
ids_test = test_data['id']
X_test = test_data.drop(columns=drop_columns)

In [None]:
# Preprocessing building blocks.

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Numeric columns: impute, expand to degree-2 polynomial terms (no bias
# column), then standardise.
numeric_pipeline = make_pipeline(
    IterativeImputer(random_state=0, max_iter=10),
    PolynomialFeatures(include_bias=False, degree=2),
    StandardScaler(),
)

# Route each column group through its own sub-pipeline.
# NOTE(review): no `remainder` argument is passed, so scikit-learn's default
# applies; columns of X that are in neither list are presumably dropped and
# never reach the model - confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('num', numeric_pipeline, numerical_features),
    ],
)

In [None]:
# Base learners whose predictions feed the stacking meta-model.
estimators = [
    ('rf', RandomForestRegressor(random_state=42, n_estimators=100)),
    ('svr', SVR(gamma='scale', C=100)),
]

# Stacking regressor: a default-configured SVR combines the base learners' outputs.
stack_model = StackingRegressor(final_estimator=SVR(), estimators=estimators)

In [None]:
# End-to-end estimator: preprocessing first, then the stacked regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('stack', stack_model),
    ],
)

In [None]:
# Fit the full pipeline on the training split and report the in-sample error.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_train)

# Take the square root of the MSE explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases. np.sqrt works on every version.
rmse = np.sqrt(mean_squared_error(y_train, y_pred))

# This score is computed on the same rows the model was fitted on, so it is an
# optimistic estimate rather than a measure of generalisation.
print("Training RMSE:", rmse)

# Predict the test split with the fitted pipeline and write the submission file.
final_predictions = pipeline.predict(X_test)

# Pair each prediction with the id of the test row it belongs to.
predictions_df = pd.DataFrame({'id': ids_test, 'pm2_5': final_predictions})

predictions_df.to_csv(
    '/kaggle/working/test_predictions_svr_optimized6.csv',
    index=False,
)