In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import GroupKFold
import lightgbm as lgb 
from sklearn.metrics import root_mean_squared_error
import random
random.seed(42)
import pickle
pd.options.mode.chained_assignment = None

#### Import Data

In [2]:
# Read the modelling table and the sample submission file.
# full_data carries both labelled and unlabelled rows; they are separated
# further down using the pm2_5 column.
full_data, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('full_data.csv', 'SampleSubmission.csv')
)

#### Feature Engineering

In [3]:
# Feature engineering: parse the date column once, then derive calendar
# features (day of year, year, quarter) from the parsed values.
parsed_dates = pd.to_datetime(full_data['date'])
full_data = full_data.assign(
    date=parsed_dates,
    day_of_year=parsed_dates.dt.day_of_year,
    year=parsed_dates.dt.year,
    quarter=parsed_dates.dt.quarter,
)

#### Modelling

In [4]:
# Choose the model inputs: every column except the target (pm2_5), the fold
# label, the raw date, the row id, and any stray 'Unnamed' index columns.
excluded_columns = {'folds', 'pm2_5', 'date', 'id'}

selected_features = [
    name for name in full_data.columns
    if name not in excluded_columns and 'Unnamed' not in name
]

print(selected_features)

['site_id', 'site_latitude', 'site_longitude', 'city', 'country', 'hour', 'sulphurdioxide_so2_column_number_density', 'sulphurdioxide_so2_column_number_density_amf', 'sulphurdioxide_so2_slant_column_number_density', 'sulphurdioxide_cloud_fraction', 'sulphurdioxide_sensor_azimuth_angle', 'sulphurdioxide_sensor_zenith_angle', 'sulphurdioxide_solar_azimuth_angle', 'sulphurdioxide_solar_zenith_angle', 'sulphurdioxide_so2_column_number_density_15km', 'month', 'carbonmonoxide_co_column_number_density', 'carbonmonoxide_h2o_column_number_density', 'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude', 'carbonmonoxide_sensor_azimuth_angle', 'carbonmonoxide_sensor_zenith_angle', 'carbonmonoxide_solar_azimuth_angle', 'carbonmonoxide_solar_zenith_angle', 'nitrogendioxide_no2_column_number_density', 'nitrogendioxide_tropospheric_no2_column_number_density', 'nitrogendioxide_stratospheric_no2_column_number_density', 'nitrogendioxide_no2_slant_column_number_density', 'nitrogendioxide_tropopa

In [5]:
# Split full_data into train and test.
# Rows whose pm2_5 equals -1 are the ones this notebook predicts for (test);
# every other row is used for training.
# Explicit copies are taken because later cells add/overwrite columns on both
# frames; without .copy() those writes target slices of full_data and only run
# quietly because the chained-assignment warning is disabled at import time.
is_test_row = full_data['pm2_5'] == -1
train = full_data[~is_test_row].copy()
test = full_data[is_test_row].copy()

In [6]:
# Categorical features = the selected features stored with object dtype.
feature_dtypes = full_data.dtypes.loc[selected_features]
cat_cols = [name for name, dtype in feature_dtypes.items() if dtype == 'object']
cat_cols

['site_id', 'city', 'country']

In [7]:
# Cast every categorical feature to pandas 'category' dtype in both frames.
category_dtypes = {name: 'category' for name in cat_cols}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

In [8]:
# Cross validation configuration: grouped 4-fold split, grouped by city so
# that all rows of a city land in the same validation fold.

gkf = GroupKFold(n_splits = 4)

# GroupKFold.split yields *positional* row indices. train is a filtered subset
# of full_data, so its index labels are not guaranteed to be 0..n-1; writing
# through train.loc[test_index, ...] would treat positions as labels.
# Fill a positional array instead and attach it as a column in one step.
fold_ids = np.zeros(len(train), dtype=int)

# Generate the splits (fold numbers start at 1)
for fold, (train_index, test_index) in enumerate(gkf.split(train, groups=train['city']), 1):
    fold_ids[test_index] = fold

# Every row must belong to exactly one validation fold.
assert (fold_ids > 0).all(), 'some rows were not assigned to a fold'

train['folds'] = fold_ids

# Row count per (fold, city). observed=False keeps the zero-count
# combinations for the categorical city column, as in the original output.
train.groupby(['folds', 'city'], observed=False)['pm2_5'].agg(['count'])

  train.groupby(['folds', 'city'])['pm2_5'].agg({'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,count
folds,city,Unnamed: 2_level_1
1,Bujumbura,0
1,Kampala,5596
1,Lagos,0
1,Nairobi,0
2,Bujumbura,0
2,Kampala,0
2,Lagos,0
2,Nairobi,1500
3,Bujumbura,0
3,Kampala,0


In [9]:

# Train one LightGBM model per validation fold, report the per-fold RMSE,
# and average the per-fold predictions on the test rows.

CV_score = 0  # running sum of per-fold RMSE; divided by the fold count below
all_test_predictions = []


# defining hyperparameters
# Values are plain scalars: the single-element lists used previously were not
# needed for a single lgb.train call.
SEARCH_PARAMS = {'learning_rate': 0.02,
                 'max_depth': 8,  # original inline note: "4" (presumably an earlier setting)
                 'boosting_type': 'gbdt',
                 'num_leaves': 600,
                 'feature_fraction': 0.8,
                 # NOTE(review): per the LightGBM parameter docs, subsample
                 # (bagging_fraction) only takes effect when bagging_freq is
                 # non-zero; bagging_freq is not set here - confirm intent.
                 'subsample': 0.2,
                 'early_stopping_rounds': 200,
                 'verbosity': -1
                 }


# Use the fold labels actually present in train rather than a hard-coded list,
# so this loop and the final average stay in sync with the CV split.
folds = sorted(train['folds'].unique())

# The test feature matrix is the same for every fold.
test_data = test[selected_features]

for fold in folds:

    in_validation = train['folds'] == fold
    train_data = train[~in_validation]  # select train data
    val_data = train[in_validation]     # select validation data

    # put train and validation data into a lightgbm dataset format
    lgb_train = lgb.Dataset(train_data[selected_features], train_data['pm2_5'],
                            feature_name=selected_features, categorical_feature=cat_cols)

    lgb_eval = lgb.Dataset(val_data[selected_features], val_data['pm2_5'],
                           feature_name=selected_features, categorical_feature=cat_cols)

    # The validation fold doubles as the early-stopping set.
    model = lgb.train(params=SEARCH_PARAMS,
                      train_set=lgb_train,
                      num_boost_round=300,
                      valid_sets=[lgb_eval],
                      )

    X_test = val_data[selected_features]
    y_test = val_data['pm2_5']

    # Label the fold by the first city found in its validation rows.
    city = val_data['city'].unique()[0]

    rmse = root_mean_squared_error(y_test, model.predict(X_test))
    print(f'RMSE Loss for {city}: {rmse:.3f}')

    # test set
    test_predictions = model.predict(test_data)
    all_test_predictions.append(test_predictions)
    CV_score += rmse

# Element-wise mean of the per-fold test predictions.
averaged_preds = np.average(all_test_predictions, axis=0)
print(f'CV_Score: {CV_score/len(folds):.3f}')



RMSE Loss for Kampala: 14.277
RMSE Loss for Nairobi: 24.952
RMSE Loss for Lagos: 52.475
RMSE Loss for Bujumbura: 16.622
CV_Score: 27.082


#### Saving to File

In [10]:
# Store the fold-averaged predictions on the test frame and write the
# id / pm2_5 pairs to the predictions file.
submission_columns = ['id', 'pm2_5']

test['pm2_5'] = averaged_preds
test.loc[:, submission_columns].to_csv('predictions_2.csv', index=False)

In [11]:
# save the full_data to file
# index=False: otherwise the row index is written as an extra unnamed column,
# which pd.read_csv loads back as 'Unnamed: 0' the next time this notebook
# runs (the feature-selection cell has to filter those columns out).
# NOTE(review): this overwrites the same 'full_data.csv' that is read at the
# top of the notebook - confirm that replacing the input file is intended.

full_data.to_csv('full_data.csv', index=False)

In [12]:
# Pickle the trained model to disk.
# NOTE(review): `model` is whatever the training loop assigned last, i.e. the
# final fold's booster only, whereas the saved predictions average all folds.

with open('lightgbm_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)