In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Enable experimental feature to use IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Load the competition's train and test splits from the Kaggle input directory.
# The generator is consumed in order, so Train.csv is read before Test.csv.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/zindi-african-air-quality-prediction-challenge/Train.csv',
        '/kaggle/input/zindi-african-air-quality-prediction-challenge/Test.csv',
    )
)

In [None]:
def add_date_features(df):
    """Add calendar-derived columns to ``df`` in place.

    The ``date`` column is parsed with ``pd.to_datetime`` and overwritten with
    the parsed values. These columns are then written, in this order:
    ``year``, ``month``, ``day``, ``weekday`` (pandas convention: Monday=0),
    ``is_weekend`` (1 when ``weekday`` is 5 or 6, otherwise 0), and
    ``month_sin`` / ``month_cos`` (the month number mapped onto a 12-step
    circle).

    The frame is mutated; the function returns ``None``.
    """
    parsed = pd.to_datetime(df['date'])
    df['date'] = parsed

    calendar = parsed.dt
    df['year'] = calendar.year
    df['month'] = calendar.month
    df['day'] = calendar.day
    df['weekday'] = calendar.weekday
    df['is_weekend'] = df['weekday'].ge(5).astype(int)

    # Cyclical encoding so that month 12 and month 1 end up next to each other.
    month_angle = 2 * np.pi * df['month'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

# Enrich both splits in place so train and test carry the same date columns.
for data in (train_data, test_data):
    add_date_features(data)

In [None]:
# Define columns to be used
# Identifier-like columns and the raw timestamp are removed before modelling.
drop_columns = ['id', 'site_id', 'date']
categorical_features = ['city', 'country']
# Calendar columns created by add_date_features.
date_features = ['year', 'month', 'day', 'weekday', 'is_weekend', 'month_sin', 'month_cos']

X_train = train_data.drop(columns=drop_columns + ['pm2_5'])
y_train = train_data['pm2_5']
X_test = test_data.drop(columns=drop_columns)
ids_test = test_data['id']

# The ColumnTransformer built from these lists keeps only the listed columns
# (its default is remainder='drop'). Listing just the calendar columns would
# silently throw away every other numeric column left in X_train, so the
# numeric list is derived from the data instead. Calendar columns stay first;
# a column is only added when it is also present in the test split.
# NOTE(review): if the dataset has many numeric columns, the imputer and the
# degree-2 polynomial expansion downstream get considerably more expensive.
other_numeric_features = [
    col
    for col in X_train.select_dtypes(include='number').columns
    if col not in date_features and col in X_test.columns
]
numerical_features = date_features + other_numeric_features

In [None]:
# Pipeline components
# Numeric branch: model-based imputation, then pairwise/squared terms, then
# standardisation of the expanded feature set.
numeric_pipeline = make_pipeline(
    IterativeImputer(random_state=0, max_iter=10),
    PolynomialFeatures(include_bias=False, degree=2),
    StandardScaler(),
)

# Categorical branch: one-hot encode; categories unseen during fit are encoded
# as all zeros instead of raising.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Route each column group to its branch. Columns not named in either list are
# dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ('cat', categorical_pipeline, categorical_features),
    ('num', numeric_pipeline, numerical_features),
])

In [None]:
# LightGBM Model
# Grid keys carry the 'lgbm__' prefix so GridSearchCV routes them to the
# pipeline step of that name.
lgbm_params = {
    'lgbm__num_leaves': [20, 30, 40],
    'lgbm__learning_rate': [0.05, 0.1, 0.2],
    'lgbm__n_estimators': [100, 200, 300],
}

lgbm_model = LGBMRegressor()
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm_model),
])

# 3-fold exhaustive search. The scorer is negated RMSE, so "higher is better".
lgbm_grid = GridSearchCV(
    estimator=lgbm_pipeline,
    param_grid=lgbm_params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
lgbm_grid.fit(X_train, y_train)

print("Best parameters for LightGBM:", lgbm_grid.best_params_)
# Flip the sign back to report a conventional, positive RMSE.
print("Best RMSE for LightGBM:", -lgbm_grid.best_score_)

In [None]:
# XGBoost Model
# Grid keys carry the 'xgb__' prefix so GridSearchCV routes them to the
# pipeline step of that name.
xgb_params = {
    'xgb__max_depth': [3, 4, 5],
    'xgb__learning_rate': [0.05, 0.1, 0.2],
    'xgb__n_estimators': [100, 200, 300],
}

xgb_model = XGBRegressor()
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', xgb_model),
])

# 3-fold exhaustive search. The scorer is negated RMSE, so "higher is better".
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
xgb_grid.fit(X_train, y_train)

print("Best parameters for XGBoost:", xgb_grid.best_params_)
# Flip the sign back to report a conventional, positive RMSE.
print("Best RMSE for XGBoost:", -xgb_grid.best_score_)

In [None]:
# CatBoost Model
# verbose=0 silences CatBoost's per-iteration training log. With the default
# setting every one of the grid-search fits prints a line per boosting round,
# which buries the results printed at the end of this cell.
catboost_model = CatBoostRegressor(verbose=0)

# Grid keys carry the 'catboost__' prefix so GridSearchCV routes them to the
# pipeline step of that name.
catboost_params = {
    'catboost__depth': [4, 6, 8],
    'catboost__learning_rate': [0.05, 0.1, 0.2],
    'catboost__n_estimators': [100, 200, 300]
}

catboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('catboost', catboost_model)
])

# 3-fold exhaustive search. The scorer is negated RMSE, so "higher is better".
catboost_grid = GridSearchCV(catboost_pipeline, param_grid=catboost_params, cv=3, scoring='neg_root_mean_squared_error')
catboost_grid.fit(X_train, y_train)

print("Best parameters for CatBoost:", catboost_grid.best_params_)
# Flip the sign back to report a conventional, positive RMSE.
print("Best RMSE for CatBoost:", -catboost_grid.best_score_)


In [None]:
# Predictions using the best model for each algorithm
# best_estimator_ is the full pipeline (preprocessing + regressor) with the
# winning hyper-parameters from each grid search.
lgbm_best_model = lgbm_grid.best_estimator_
xgb_best_model = xgb_grid.best_estimator_
catboost_best_model = catboost_grid.best_estimator_

# Predictions on test data
lgbm_predictions = lgbm_best_model.predict(X_test)
xgb_predictions = xgb_best_model.predict(X_test)
catboost_predictions = catboost_best_model.predict(X_test)

# Taking average predictions from all models (equal weights)
final_predictions = (lgbm_predictions + xgb_predictions + catboost_predictions) / 3

# Regressors can output negative values, but pm2_5 is a concentration and
# cannot be below zero. Clipping at 0 removes impossible predictions and can
# only reduce the error against a non-negative target.
final_predictions = np.clip(final_predictions, 0, None)

# Creating DataFrame for predictions
predictions_df = pd.DataFrame({
    'id': ids_test,
    'pm2_5': final_predictions
})

# Saving predictions to a CSV file
predictions_df.to_csv('/kaggle/working/test_predictions_ensemble.csv', index=False)
