In [14]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.svm import SVR

In [15]:
train_data = pd.read_csv('/kaggle/input/zindi-african-air-quality-prediction-challenge/Train.csv')
test_data = pd.read_csv('/kaggle/input/zindi-african-air-quality-prediction-challenge/Test.csv')

In [16]:
# Preprocessing functions
def add_date_features(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.weekday

def preprocess_data(df):
    # Add date features
    add_date_features(df)
    # Fill numeric missing values
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        if df[col].isnull().any():
            df[col].fillna(df[col].median(), inplace=True)

In [17]:
# Apply preprocessing
for data in [train_data, test_data]:
    preprocess_data(data)

In [18]:
# Define columns to drop and feature categories
drop_columns = ['id', 'site_id', 'date']
categorical_features = ['city', 'country']
numerical_features = ['year', 'month', 'day']

# Prepare training and test data
X_train = train_data.drop(columns=drop_columns + ['pm2_5'])
y_train = train_data['pm2_5']
X_test = test_data.drop(columns=drop_columns)

# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

In [19]:
# Define base models with provided parameters
lgbm_model = lgb.LGBMRegressor(learning_rate=0.05, n_estimators=300, num_leaves=30)
xgb_model = xgb.XGBRegressor(learning_rate=0.2, max_depth=4, n_estimators=200)
catboost_model = CatBoostRegressor(depth=6, learning_rate=0.2, n_estimators=200, verbose=0)
svr_model = SVR(kernel='rbf', C=359.9463714999908, gamma=0.47732892632361296)

In [20]:
# Create base models list
base_models = [
    ('lgbm', lgbm_model),
    ('xgb', xgb_model),
    ('catboost', catboost_model),
    ('svr', svr_model)
]

In [21]:
# Create stacking regressor with Voting Regressor as final estimator
stacking_regressor = VotingRegressor(estimators=base_models)

# Pipeline setup
stacking_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('stacking_regressor', stacking_regressor)])

# Fit the model
stacking_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 11
[LightGBM] [Info] Start training from score 24.639296


In [22]:
# Predict
predictions = stacking_pipeline.predict(X_test)

# Create DataFrame for submission
predictions_df = pd.DataFrame({'id': test_data['id'], 'pm2_5': predictions})
predictions_df.to_csv('/kaggle/working/final_predictions_stacked.csv', index=False)