In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Read the competition's training and test splits (in that order) from the
# Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/zindi-african-air-quality-prediction-challenge/Train.csv',
        '/kaggle/input/zindi-african-air-quality-prediction-challenge/Test.csv',
    )
)

In [None]:
def add_date_features(df):
    """Add calendar features derived from the ``date`` column, in place.

    The ``date`` column is converted to pandas datetimes, then the columns
    ``year``, ``month``, ``day``, ``weekday`` and ``is_weekend`` are written
    to ``df`` (appended if they do not already exist).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.
        It is modified in place.

    Returns
    -------
    None
    """
    timestamps = pd.to_datetime(df['date'])
    df['date'] = timestamps

    calendar = timestamps.dt
    for field in ('year', 'month', 'day', 'weekday'):
        df[field] = getattr(calendar, field)

    # pandas numbers weekdays Monday=0 .. Sunday=6, so 5 and 6 are the weekend.
    df['is_weekend'] = df['weekday'].ge(5).astype(int)

# Derive the calendar columns on both splits; add_date_features mutates each
# frame in place, so nothing needs to be reassigned here.
for data in (train_data, test_data):
    add_date_features(data)


In [None]:
# Columns removed from the model inputs: two identifier columns and the raw
# date (its calendar parts are carried by the derived columns listed below).
drop_columns = ['id', 'site_id', 'date']

# Column groups that the preprocessing step selects by name.
categorical_features = ['city', 'country']
numerical_features = ['year', 'month', 'day', 'weekday', 'is_weekend']
# NOTE(review): X_train / X_test keep every other input column as well, but
# only the columns named in the two lists above are referenced by the
# preprocessor -- confirm whether the remaining columns were meant to be used.

# Training target and inputs.
y_train = train_data['pm2_5']
X_train = train_data.drop(columns=[*drop_columns, 'pm2_5'])

# Test inputs, plus the ids needed to label the submission rows.
ids_test = test_data['id']
X_test = test_data.drop(columns=drop_columns)

In [None]:
# Numeric branch: fill missing values from the 5 nearest neighbours, then
# standardise each column.
numeric_pipeline = make_pipeline(KNNImputer(n_neighbors=5), StandardScaler())

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Route each column group to its branch. `remainder` is left at its default,
# which (per the scikit-learn docs) drops every column not listed here.
preprocessor = ColumnTransformer([
    ('cat', categorical_pipeline, categorical_features),
    ('num', numeric_pipeline, numerical_features),
])

In [None]:
svr_model = SVR()

# Create a pipeline: preprocessing and the regressor bundled as one estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svr', svr_model)
])

# Hyperparameter tuning
# One sub-grid per kernel. SVR's `gamma` is the kernel coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels and is ignored by 'linear', so crossing
# it with the linear kernel would only repeat identical fits. GridSearchCV
# accepts a list of dicts and explores each sub-grid separately.
param_grid = [
    {
        'svr__kernel': ['rbf'],
        'svr__C': [1, 10, 100],
        'svr__gamma': ['scale', 'auto'],
    },
    {
        'svr__kernel': ['linear'],
        'svr__C': [1, 10, 100],
    },
]

In [None]:
# Cross-validated (5-fold) search over param_grid, scored by negated MSE so
# that a higher score means a lower error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
# best_score_ is the negated MSE: flip the sign, then take the square root.
print("Lowest RMSE:", (-grid_search.best_score_) ** 0.5)

In [None]:
# Predict the test split with the best pipeline found by the grid search.
best_model = grid_search.best_estimator_
final_predictions = best_model.predict(X_test)

# Build the submission table: one row per test id with its predicted pm2_5.
predictions_df = pd.DataFrame({'id': ids_test})
predictions_df['pm2_5'] = final_predictions

# index=False keeps the DataFrame index out of the written file.
predictions_df.to_csv('/kaggle/working/test_predictions_svr_optimized.csv', index=False)
