In [50]:
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV


def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])


def _merge_external_data(X):
    file_path = "external_data.csv"
    df_ext = pd.read_csv(file_path, parse_dates=["date"])

    X_comb = X.join(df_ext.set_index("date"), on="date", rsuffix="right") 
    X_comb.fillna(method="ffill", inplace=True)
    
    return X_comb.drop(["Unnamed: 0", "date_only"], axis=1)


def get_estimator():
    date_encoder = FunctionTransformer(_encode_dates)
    #imputer = KNNImputer(n_neighbors=3, add_indicator=True)
    date_cols = ['year', 'month', 'day', 'weekday', 'hour']

    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", "site_name", "wind_dir"]
    numerical_cols = ['site_id', 'latitude', 'longitude', 'Temperature (C)', 'wind_speed',
                    'Humidity', 'Visibility', 'pressure1', "Precipitation", "sunshine_time",
                     "suntime", "new_cases"]
     
    
    preprocessor = ColumnTransformer(
        [
            ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
            ("cat", categorical_encoder, categorical_cols),
            ("scaler", StandardScaler(), numerical_cols)
        ]
    )

    
    regressor = SVR()
    
    params = {
        "C": np.arange(1,5),
        "gamma": ["scale", "auto"],
        "epsilon": np.linspace(0.05,0.2, 4)
    }
    print(regressor.get_params().keys())
    search = RandomizedSearchCV(regressor, 
                               param_distributions=params,
                               scoring='neg_mean_squared_error', 
                               verbose=10,
                               )

    pipe = make_pipeline(
            FunctionTransformer(_merge_external_data, validate=False),
            date_encoder,
            #imputer,
            preprocessor, 
            search)
    
    return pipe



In [51]:
import problem

X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

In [None]:
X_train_comb = _merge_external_data(X_train)
pipe = get_estimator()
pipe.fit(X_train_comb, y_train)

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START C=2, epsilon=0.1, gamma=scale..............................


In [13]:
file_path = "external_data.csv"
df_ext = pd.read_csv(file_path, parse_dates=["date"])

X_comb = X_train.join(df_ext.set_index("date"), on="date") 


In [24]:
X_comb.columns[X_comb.isna().any()].tolist()

['Unnamed: 0',
 'Temperature (C)',
 'wind_dir',
 'wind_speed',
 'Humidity',
 'Visibility',
 'Precipitation',
 'pressure1',
 'sunshine_time',
 'suntime',
 'new_cases',
 'date_only',
 'holidays2']