# Let's do better than the baseline

In [7]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.model_selection import KFold, cross_val_score, train_test_split, TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.metrics import get_scorer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

def impute_missing_values(df):
    """Fill missing values with the per-location mean of each column.

    Rows are grouped by their ``(latitude, longitude)`` pair and every NaN in
    the other columns is replaced by that column's mean within the group.
    When a column is entirely NaN for a location its group mean is NaN too,
    so those gaps stay NaN and must be handled downstream.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns plus the
            columns to impute.

    Returns:
        A new DataFrame holding the imputed columns together with the
        ``latitude`` and ``longitude`` columns copied from ``df``. ``df``
        itself is not modified.
    """
    location_keys = ['latitude', 'longitude']

    # One pass over the groups: each column's NaNs are filled with the mean of
    # that column inside the same location. (A separate table of per-location
    # means used to be computed here and never read; it has been dropped.)
    df_imputed = df.groupby(location_keys).transform(lambda x: x.fillna(x.mean()))

    # Put the grouping keys back on the result so callers still see them.
    for key in location_keys:
        df_imputed[key] = df[key]
    return df_imputed

def feature_engineering(df):
    """Run the feature-preparation steps on ``df`` and return the result.

    The only step at present is missing-value imputation via
    ``impute_missing_values``.
    """
    return impute_missing_values(df)

def train_model(model, raw_df, features, target):
    """Fit ``model`` on the engineered features of ``raw_df``.

    Args:
        model: Estimator exposing ``fit(X, y)``.
        raw_df: DataFrame holding both the feature columns and the target.
        features: Column names to pass through ``feature_engineering``.
        target: Name of the column to predict.

    Returns:
        The same ``model`` object, after fitting.
    """
    design_matrix = feature_engineering(raw_df[features])
    labels = raw_df[target]

    model.fit(design_matrix, labels)
    return model

def get_predictions(model, raw_df, features, target_column=None):
    """Predict the target for ``raw_df`` and return it alongside the raw columns.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        raw_df: DataFrame containing at least the ``features`` columns.
        features: Column names to pass through ``feature_engineering``.
        target_column: Name of the column that receives the predictions.
            Defaults to the notebook-level ``target`` so existing
            three-argument calls behave as before.

    Returns:
        A copy of ``raw_df`` with the predictions stored in ``target_column``.
        ``raw_df`` itself is left untouched.
    """
    if target_column is None:
        # Fall back to the notebook-wide target name (looked up at call time).
        target_column = target

    predictions_df = raw_df.copy()
    X_test = feature_engineering(raw_df[features])
    predictions_df[target_column] = model.predict(X_test)
    return predictions_df

def magical_constant(df, constant=1.07, target_column=None):
    """Return a copy of ``df`` with the target column multiplied by ``constant``.

    The input frame is no longer modified in place: scaling a copy keeps the
    function idempotent, so re-running a cell cannot compound the factor on an
    already-scaled frame. Callers must use the returned frame.

    Args:
        df: DataFrame containing the column to scale.
        constant: Multiplicative factor applied to the column.
            NOTE(review): presumably an empirically chosen post-processing
            factor — the origin of 1.07 is not documented here; confirm.
        target_column: Column to scale. Defaults to the notebook-level
            ``target`` so existing calls behave as before.

    Returns:
        A new DataFrame with the scaled column.
    """
    if target_column is None:
        # Fall back to the notebook-wide target name (looked up at call time).
        target_column = target

    scaled_df = df.copy()
    scaled_df[target_column] = scaled_df[target_column] * constant
    return scaled_df

def save_submission(df, path="submission.csv", columns=None):
    """Write the submission columns of ``df`` to a CSV file without the index.

    Args:
        df: DataFrame containing the columns to export.
        path: Destination file. Defaults to ``"submission.csv"`` in the
            current working directory, as before.
        columns: Column names to export, in order. Defaults to the
            notebook-level ``[id_columns, target]`` so existing one-argument
            calls behave as before.
    """
    if columns is None:
        # Fall back to the notebook-wide identifier and target column names.
        columns = [id_columns, target]

    # Best-effort removal of a previous file: a missing file (or any other
    # OS-level failure to delete) is deliberately ignored, and the write below
    # then proceeds or fails on its own.
    try:
        os.remove(path)
    except OSError:
        pass

    df[list(columns)].to_csv(path, index=False)

# --- Column roles and reproducibility -------------------------------------
id_columns = "ID_LAT_LON_YEAR_WEEK"
target = "emission"
RANDOM_SEED = 128

# --- Raw data ---------------------------------------------------------------
data_path = "./data/"
df_train = pd.read_csv(f"{data_path}train.csv")
df_test = pd.read_csv(f"{data_path}test.csv")

# Every column of the test file except the row identifier is a candidate feature.
features = df_test.columns.tolist()
features.remove(id_columns)

# --- Candidate feature sets -------------------------------------------------
# Set 1: the minimal location + time columns.
feature_set_1 = ['latitude', 'longitude', 'year', 'week_no']

# Set 2: set 1 plus every column whose name starts with "Uv"
# (chosen from EDA and XGBoost feature importance).
feature_set_2 = [*feature_set_1, *(col for col in features if col.startswith('Uv'))]

uv_layer = "UvAerosolLayerHeight_"
uv_index = "UvAerosolIndex_"

# Set 3: set 1 plus a hand-picked subset of the Uv columns.
feature_set_3 = [
    *feature_set_1,
    *(
        uv_layer + suffix
        for suffix in (
            "solar_azimuth_angle",
            "solar_zenith_angle",
            "aerosol_height",
            "aerosol_optical_depth",
            "sensor_zenith_angle",
            "sensor_azimuth_angle",
        )
    ),
    uv_index + "solar_azimuth_angle",
]

# Set 4: set 3 plus one ozone column and the aerosol pressure column.
feature_set_4 = [
    *feature_set_3,
    "Ozone_O3_column_number_density",
    uv_layer + "aerosol_pressure",
]

# Only these sets are evaluated in the comparison below.
feature_sets = [feature_set_1, feature_set_3]

In [8]:
params = {
    "objective": "reg:squarederror",
    "max_depth": 20,
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "n_jobs": -1,
    # Seeded like the random forest below so both models are reproducible.
    "random_state": RANDOM_SEED,
}
xgboost_model = xgb.XGBRegressor(**params)

models = {
    "XGBoost": xgboost_model,
    "RandomForestRegressor": RandomForestRegressor(n_estimators=2000, random_state=RANDOM_SEED, n_jobs=-1)
}


def split_data(df, holdout_year=2021, holdout_after_week=15):
    """Split ``df`` into ``(train, validation)`` parts by time.

    The validation part is every row of ``holdout_year`` whose ``week_no`` is
    greater than ``holdout_after_week``; all remaining rows (including the
    earlier weeks of ``holdout_year``) form the training part.

    A boolean mask is used rather than an index lookup so the split is still
    correct when ``df`` has a non-unique index.
    """
    is_holdout = (df['year'] == holdout_year) & (df['week_no'] > holdout_after_week)
    return df[~is_holdout], df[is_holdout]


# One row per (model, feature set) run; turned into a DataFrame afterwards.
data = []
data_columns = ['model', 'feature_set', 'rmse_train', 'rmse_validation']

for name, model in models.items():
    for i, feature_set in enumerate(feature_sets):
        pipeline = make_pipeline(
            SimpleImputer(strategy='median'),
            model
        )

        # NOTE(review): imputation runs on the full frame before the split, so
        # the per-location means also include the validation rows. Confirm this
        # is acceptable for the validation score.
        df_train_imputed = feature_engineering(df_train[feature_set])
        df_train_imputed[target] = df_train[target]

        train, test = split_data(df_train_imputed)

        X_train = train.drop(columns=[target])
        y_train = train[target]

        X_test = test.drop(columns=[target])
        y_test = test[target]
        print(f"Training {name} on feature set {i}...")
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # RMSE on the data the model was fitted on and on the held-out weeks.
        rmse_train = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
        rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))
        data.append([name, i, rmse_train, rmse_validation])

Training XGBoost on feature set 0...
Training XGBoost on feature set 1...
Training RandomForestRegressor on feature set 0...
Training RandomForestRegressor on feature set 1...


In [9]:
# Collect the per-run metrics into a table and show it ordered by
# validation RMSE, lowest first.
df_results = pd.DataFrame(data=data, columns=data_columns)
df_results.sort_values('rmse_validation', ascending=True)

Unnamed: 0,model,feature_set,rmse_train,rmse_validation
0,XGBoost,0,0.006819,15.502037
2,RandomForestRegressor,0,5.277988,16.342146
3,RandomForestRegressor,1,5.165526,22.783129
1,XGBoost,1,0.003665,23.942887


In [10]:
# Final model: median imputation followed by a random forest, fitted on the
# whole training file using the minimal feature set.
final_forest = RandomForestRegressor(
    n_estimators=2000,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)
pipeline = make_pipeline(SimpleImputer(strategy='median'), final_forest)

print("Training model")
model = train_model(pipeline, df_train, feature_set_1, target)

print("Getting predictions")
output_test_df = get_predictions(model, df_test, feature_set_1)

# Rescale the predictions by the fixed post-processing factor.
print("Applying magical constant")
output_test_df = magical_constant(output_test_df, 1.07)

print("Saving submission file")
save_submission(output_test_df)

Training model
Getting predictions
Applying magical constant
Saving submission file


In [6]:
#! kaggle competitions submit -c playground-series-s3e20 -f submission.csv -m "First submission"