# Data Science Capstone 2

## Weather Forecaster

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import TimeSeriesSplit, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [3]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor

In [4]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, so
# deprecation notices raised by the libraries used below will not be shown.
warnings.filterwarnings('ignore')

In [5]:
#Load the finalized weather dataset (the preview below shows rows for several cities)
weather_dataset = pd.read_csv("modified_data/weather_prediction_dataset_finalized.csv")

#This dataset is optional and provides a template showing all the times the weather is appropriate for a picnic
weather_for_picnic = pd.read_csv("raw_data/weather_prediction_bbq_labels.csv")

In [6]:
#Load the training dataset
train = pd.read_csv("modified_data/weather_prediction_training_dataset.csv")

#Load the testing dataset
test = pd.read_csv("modified_data/weather_prediction_testing_dataset.csv")

In [7]:
#Initial details of the dataset: its (rows, columns) shape, then a preview of the first 30 rows
print("Shape of dataset:", weather_dataset.shape)
weather_dataset.head(30)

Shape of dataset: (65754, 14)


Unnamed: 0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,BASEL,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,DUSSELDORF,1,8.0,0.92,1.024,0.12,0.22,0.0,4.2,2.5,6.9,6.5,2.5,5.9
2,HEATHROW,1,7.0,0.94,1.0245,0.18,0.0,0.4,7.0,4.9,10.8,7.9,,
3,KASSEL,1,,0.93,1.0237,0.06,0.13,0.0,3.5,1.5,5.0,2.3,2.5,8.2
4,LJUBLJANA,1,6.0,0.83,1.0294,0.57,0.0,5.2,-4.8,-9.1,-1.3,-0.9,0.4,
5,DE_BILT,1,7.0,0.97,1.024,0.11,0.1,0.0,6.1,3.5,8.1,7.3,2.5,8.0
6,MAASTRICHT,1,8.0,0.98,1.0251,0.06,0.17,0.0,5.6,4.1,6.9,6.2,3.1,7.0
7,MALMO,1,,,,,0.27,,2.9,0.9,3.6,3.7,2.5,
8,TOURS,1,,0.97,1.0275,0.25,0.04,,8.5,7.2,9.8,7.9,1.6,
9,MUENCHEN,1,8.0,0.91,1.0273,0.2,0.2,0.0,1.7,-0.5,2.6,1.9,2.6,9.4


In [8]:
#Preview the first 30 rows of the training split
train.head(30)

Unnamed: 0,DATE,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,2000-01-01,BASEL,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,2000-01-02,BASEL,1,8.0,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8,2.2,,
2,2000-01-03,BASEL,1,5.0,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8,3.9,,
3,2000-01-04,BASEL,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
4,2000-01-05,BASEL,1,5.0,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,
5,2000-01-06,BASEL,1,3.0,0.85,1.0244,0.56,0.0,5.7,4.2,1.9,6.9,4.7,,
6,2000-01-07,BASEL,1,8.0,0.84,1.0267,0.2,0.0,0.0,4.7,1.8,6.2,5.6,,
7,2000-01-08,BASEL,1,4.0,0.79,1.0248,0.54,0.0,4.3,5.6,4.1,8.4,4.6,,
8,2000-01-09,BASEL,1,8.0,0.88,1.0243,0.11,0.65,0.0,4.6,3.8,5.7,2.4,,
9,2000-01-10,BASEL,1,8.0,0.91,1.0337,0.06,0.09,0.0,2.4,1.4,3.8,3.2,,


In [9]:
#Preview the first 30 rows of the testing split
test.head(30)

Unnamed: 0,DATE,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,2008-01-01,BASEL,1,8.0,0.86,1.0257,0.22,0.0,0.0,-0.5,-2.2,0.9,-1.4,,
1,2008-01-02,BASEL,1,8.0,0.67,1.0181,0.25,0.0,0.0,-1.4,-2.1,-0.8,-0.5,,
2,2008-01-03,BASEL,1,7.0,0.68,1.0076,0.28,0.0,0.1,-0.5,-3.1,3.2,-0.3,,
3,2008-01-04,BASEL,1,8.0,0.8,1.0131,0.14,0.0,0.0,-0.3,-3.6,2.3,6.3,,
4,2008-01-05,BASEL,1,8.0,0.74,1.0126,0.1,1.14,0.0,6.3,1.7,11.6,6.4,,
5,2008-01-06,BASEL,1,7.0,0.88,1.0157,0.19,0.54,0.1,6.4,4.6,8.3,7.0,,
6,2008-01-07,BASEL,1,5.0,0.79,1.0211,0.28,0.37,1.2,7.0,2.3,11.5,3.7,,
7,2008-01-08,BASEL,1,4.0,0.76,1.0251,0.6,0.0,6.8,3.7,0.9,6.9,3.7,,
8,2008-01-09,BASEL,1,7.0,0.87,1.0213,0.16,0.12,0.0,3.7,1.0,6.3,6.1,,
9,2008-01-10,BASEL,1,6.0,0.82,1.0203,0.67,0.0,7.5,6.1,3.7,9.3,7.3,,


In [15]:
#tscv = TimeSeriesSplit(n_splits=5)
#for train_ind, test_ind in tscv.split(weather_dataset):
#    train, test = weather_dataset.iloc[train_ind], weather_dataset.iloc[test_ind]

### Modeling/Evaluating Functions

In [10]:
def select_city(train, test, location):
    """Restrict both splits to one city and separate features from the target.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing at least a 'CITY' column and a 'target' column.
    location : str
        Value of 'CITY' whose rows are kept.

    Returns
    -------
    tuple
        (X_train, y_train, X_test, y_test), where each X holds every column
        except 'target' and each y is the 'target' column of the kept rows.
    """
    def _features_and_target(frame):
        #Keep only the rows of the requested city, then peel off the target column
        city_rows = frame[frame['CITY'] == location]
        return city_rows.drop(columns='target'), city_rows['target']

    X_train, y_train = _features_and_target(train)
    X_test, y_test = _features_and_target(test)

    return X_train, y_train, X_test, y_test

In [11]:
def select_model(model_type, params=None, y_train=None, y_test=None):
    """Build an unfitted regressor for the requested model family.

    Parameters
    ----------
    model_type : str
        One of 'Linear Regression', 'Random Forest', 'XGBoost', 'CatBoost' or
        'LightGBM'. Any other value prints a hint and falls back to a
        DummyRegressor that predicts the mean.
    params : dict, optional
        Settings for the chosen model:
        - 'Linear Regression': optional 'strategy' (imputation, default 'mean').
        - 'Random Forest': optional 'strategy', 'fill_value', 'random_state'.
        - 'XGBoost': forwarded unchanged as keyword arguments to XGBRegressor.
        - 'CatBoost': requires 'iterations', 'learning_rate', 'depth'.
        - 'LightGBM': requires 'num_leaves', 'learning_rate', 'n_estimators',
          'max_depth'.
    y_train, y_test : optional
        Returned unchanged. They are kept in the signature and in the return
        value so existing callers that unpack three values keep working.

    Returns
    -------
    tuple
        (model, y_train, y_test).
    """
    #Allow callers to omit params instead of failing on None['...'] / **None
    params = {} if params is None else params

    if model_type == 'Linear Regression':
        model = make_pipeline(
            SimpleImputer(strategy=params.get('strategy', 'mean')),
            StandardScaler(),
            LinearRegression()
        )
    elif model_type == 'Random Forest':
        model = make_pipeline(
            SimpleImputer(strategy=params.get('strategy', 'mean'), fill_value=params.get('fill_value')),
            StandardScaler(),
            RandomForestRegressor(random_state=params.get('random_state'))
        )
    elif model_type == 'XGBoost':
        #The target is a continuous value, so it is passed through untouched.
        #Label-encoding it (as was done before) replaces each value with a class
        #index, and encoding train and test separately gives two unrelated mappings.
        model = xgb.XGBRegressor(**params)
    elif model_type == 'CatBoost':
        model = CatBoostRegressor(
            iterations=params['iterations'],
            learning_rate=params['learning_rate'],
            depth=params['depth'],
            verbose=0
        )
    elif model_type == 'LightGBM':
        #Imported here so this branch works even if the notebook-level
        #`import lightgbm as lgb` cell has not been run yet
        import lightgbm as lgb
        model = lgb.LGBMRegressor(
            num_leaves=params['num_leaves'],
            learning_rate=params['learning_rate'],
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth']
        )
    else:
        print("Please input an available model type: [Linear Regression, Random Forest, XGBoost, CatBoost, LightGBM]")
        model = DummyRegressor(strategy='mean')
    return model, y_train, y_test

In [12]:
def make_prediction(model, X_train, y_train, X_test, y_test, hypertuning=False):
    """Score a model on the training and testing data.

    Parameters
    ----------
    model : estimator
        Object exposing fit(X, y) and predict(X).
    X_train, y_train, X_test, y_test
        Features and targets of the two splits.
    hypertuning : bool, default False
        When False the model is fitted on the training data first. Pass True
        for a model that is already fitted (e.g. the best estimator of a
        search), which is then only used for prediction.

    Returns
    -------
    tuple
        ((r2_train, r2_test), (rmse_train, rmse_test)).
    """
    if not hypertuning:
        model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Evaluate the model with performance metrics
    r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
    #Take the square root of the MSE explicitly: the `squared=False` argument of
    #mean_squared_error is deprecated and absent from newer scikit-learn releases
    RMSE = (
        np.sqrt(mean_squared_error(y_train, y_train_pred)),
        np.sqrt(mean_squared_error(y_test, y_test_pred)),
    )

    return r2, RMSE

In [13]:
def get_metrics(cv_score_list, r2_metric, RMSE_metric):
    """Flatten per-city results into one list per metric.

    Parameters
    ----------
    cv_score_list : iterable
        Mappings that each hold an iterable of fold scores under 'test_score'.
    r2_metric, RMSE_metric : iterable
        (train_value, test_value) pairs, one per city.

    Returns
    -------
    tuple of lists
        (cv_test_scores, r2_train_scores, r2_test_scores,
        RMSE_train_scores, RMSE_test_scores).
    """
    #Every fold score of every city ends up in a single flat list
    cv_test_scores = [fold for result in cv_score_list for fold in result['test_score']]

    #Position 0 of each pair is the training value, position 1 the testing value
    r2_train_scores = [pair[0] for pair in r2_metric]
    r2_test_scores = [pair[1] for pair in r2_metric]
    RMSE_train_scores = [pair[0] for pair in RMSE_metric]
    RMSE_test_scores = [pair[1] for pair in RMSE_metric]

    return cv_test_scores, r2_train_scores, r2_test_scores, RMSE_train_scores, RMSE_test_scores

In [14]:
def calculate_metrics(cv_test_scores, r2_train_scores, r2_test_scores, RMSE_train_scores, RMSE_test_scores):
    """Print the mean and standard deviation of each collected metric.

    Each argument is a sequence of numbers. Five lines are printed, in this
    order: cross-validation scores, training R-squared, testing R-squared,
    training RMSE and testing RMSE. Nothing is returned.
    """
    def _mean_std(values):
        return np.mean(values), np.std(values)

    #The cross-validation summary is printed as two separate values;
    #the remaining summaries are printed as (mean, std) tuples
    cv_mean, cv_std = _mean_std(cv_test_scores)
    print("Mean and standard deviation of cross validations:", cv_mean, cv_std)

    reports = (
        ("Mean and standard deviation of R-squared for training set:", r2_train_scores),
        ("Mean and standard deviation of R-squared for testing set:", r2_test_scores),
        ("Mean and standard deviation of RMSE for training set:", RMSE_train_scores),
        ("Mean and standard deviation of RMSE for testing set:", RMSE_test_scores),
    )
    for heading, values in reports:
        print(heading, _mean_std(values))

In [15]:
#Cities that get their own model; the per-city loops below iterate in this order
city_names = [
    "BASEL",
    "BUDAPEST",
    "DE_BILT",
    "DRESDEN",
    "DUSSELDORF",
    "HEATHROW",
    "KASSEL",
    "LJUBLJANA",
    "MAASTRICHT",
    "MALMO",
    "MONTELIMAR",
    "MUENCHEN",
    "OSLO",
    "PERPIGNAN",
    "ROMA",
    "SONNBLICK",
    "STOCKHOLM",
    "TOURS",
]

In [7]:
#Separate the 'target' column (y) from the remaining columns (X) of each split
X_train, y_train = train.drop(columns='target'), train['target']
X_test, y_test = test.drop(columns='target'), test['target']

In [8]:
#Set the CITY/MONTH columns aside and keep the remaining columns as model inputs
names_list = ['CITY', 'MONTH']
names_train, names_test = X_train[names_list], X_test[names_list]
X_train = X_train.drop(columns=names_list)
X_test = X_test.drop(columns=names_list)
X_train.shape, X_test.shape

((54795, 11), (10959, 11))

In [9]:
#Check the dtype of every remaining feature column
X_train.dtypes

cloud_cover         float64
humidity            float64
pressure            float64
global_radiation    float64
precipitation       float64
sunshine            float64
temp_mean           float64
temp_min            float64
temp_max            float64
wind_speed          float64
wind_gust           float64
dtype: object

### Random Forest Models

In [16]:
#Random forest with mean imputation
rf1_cv_score_list = []
rf1_r2_metric = []
rf1_RMSE_metric = []

#Settings shared by every city: model parameters and the non-feature columns
params = {'strategy': 'mean', 'random_state': 5, 'fill_value': None}
names_list = ['DATE', 'CITY', 'MONTH']

for city in city_names:
    #X, y split, then remove the non-feature columns (new frames, no in-place edit)
    X_train, y_train, X_test, y_test = select_city(train, test, city)
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Select the model
    pipe_rf_1, le_train, le_test = select_model('Random Forest', params)

    #Perform cross validation
    #NOTE(review): an integer cv means unshuffled K-fold for a regressor; the rows look
    #date-ordered, so TimeSeriesSplit (already imported) may be the better fit -- confirm
    cv_scores = cross_validate(pipe_rf_1, X_train, y_train, cv=5)
    rf1_cv_score_list.append(cv_scores)

    #Fit on the full training split and score both splits
    r2, RMSE = make_prediction(pipe_rf_1, X_train, y_train, X_test, y_test)
    rf1_r2_metric.append(r2)
    rf1_RMSE_metric.append(RMSE)

In [17]:
#Flatten the per-city results, then report the mean/std of every metric
(
    rf1_cv_test_scores,
    rf1_r2_train_scores,
    rf1_r2_test_scores,
    rf1_RMSE_train_scores,
    rf1_RMSE_test_scores,
) = get_metrics(rf1_cv_score_list, rf1_r2_metric, rf1_RMSE_metric)

print("Accuracy metrics for random forest with mean imputation")
calculate_metrics(
    rf1_cv_test_scores,
    rf1_r2_train_scores,
    rf1_r2_test_scores,
    rf1_RMSE_train_scores,
    rf1_RMSE_test_scores,
)

Accuracy metrics for random forest with mean imputation
Mean and standard deviation of cross validations: 0.9141206934989343 0.020242080933146576
Mean and standard deviation of R-squared for training set: (np.float64(0.9887740587214378), np.float64(0.0022955516249328913))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9200700074625385), np.float64(0.015847961207258864))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565545787697364), np.float64(0.0841998222116411))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9718218791209583), np.float64(0.17673121677939715))


In [18]:
#Random forest with median imputation
rf2_cv_score_list = []
rf2_r2_metric = []
rf2_RMSE_metric = []

#Settings shared by every city: model parameters and the non-feature columns
params = {'strategy': 'median', 'random_state': 5, 'fill_value': None}
names_list = ['DATE', 'CITY', 'MONTH']

for city in city_names:
    #X, y split, then remove the non-feature columns (new frames, no in-place edit)
    X_train, y_train, X_test, y_test = select_city(train, test, city)
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Select the model
    pipe_rf_2, le_train, le_test = select_model('Random Forest', params)

    #Perform cross validation
    cv_scores = cross_validate(pipe_rf_2, X_train, y_train, cv=5)
    rf2_cv_score_list.append(cv_scores)

    #Fit on the full training split and score both splits
    r2, RMSE = make_prediction(pipe_rf_2, X_train, y_train, X_test, y_test)
    rf2_r2_metric.append(r2)
    rf2_RMSE_metric.append(RMSE)

In [19]:
#Flatten the per-city results, then report the mean/std of every metric
(
    rf2_cv_test_scores,
    rf2_r2_train_scores,
    rf2_r2_test_scores,
    rf2_RMSE_train_scores,
    rf2_RMSE_test_scores,
) = get_metrics(rf2_cv_score_list, rf2_r2_metric, rf2_RMSE_metric)

print("Accuracy metrics for random forest with median imputation")
calculate_metrics(
    rf2_cv_test_scores,
    rf2_r2_train_scores,
    rf2_r2_test_scores,
    rf2_RMSE_train_scores,
    rf2_RMSE_test_scores,
)

Accuracy metrics for random forest with median imputation
Mean and standard deviation of cross validations: 0.9141206934989343 0.020242080933146576
Mean and standard deviation of R-squared for training set: (np.float64(0.9887740587214378), np.float64(0.0022955516249328913))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9200700074625385), np.float64(0.015847961207258864))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565545787697364), np.float64(0.0841998222116411))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9718218791209583), np.float64(0.17673121677939715))


In [20]:
#Random forest with constant imputation of value -64
rf3_cv_score_list = []
rf3_r2_metric = []
rf3_RMSE_metric = []

#Settings shared by every city: model parameters and the non-feature columns
params = {'strategy': 'constant', 'random_state': 5, 'fill_value': -64}
names_list = ['DATE', 'CITY', 'MONTH']

for city in city_names:
    #X, y split, then remove the non-feature columns (new frames, no in-place edit)
    X_train, y_train, X_test, y_test = select_city(train, test, city)
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Select the model
    pipe_rf_3, le_train, le_test = select_model('Random Forest', params)

    #Perform cross validation
    cv_scores = cross_validate(pipe_rf_3, X_train, y_train, cv=5)
    rf3_cv_score_list.append(cv_scores)

    #Fit on the full training split and score both splits
    r2, RMSE = make_prediction(pipe_rf_3, X_train, y_train, X_test, y_test)
    rf3_r2_metric.append(r2)
    rf3_RMSE_metric.append(RMSE)

In [21]:
#Flatten the per-city results, then report the mean/std of every metric
(
    rf3_cv_test_scores,
    rf3_r2_train_scores,
    rf3_r2_test_scores,
    rf3_RMSE_train_scores,
    rf3_RMSE_test_scores,
) = get_metrics(rf3_cv_score_list, rf3_r2_metric, rf3_RMSE_metric)

print("Accuracy metrics for random forest with constant (-64) imputation")
calculate_metrics(
    rf3_cv_test_scores,
    rf3_r2_train_scores,
    rf3_r2_test_scores,
    rf3_RMSE_train_scores,
    rf3_RMSE_test_scores,
)

Accuracy metrics for random forest with constant (-64) imputation
Mean and standard deviation of cross validations: 0.9141279949176385 0.020291856784996346
Mean and standard deviation of R-squared for training set: (np.float64(0.9887714046018952), np.float64(0.002307404027098382))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9201741763060648), np.float64(0.015865912599947376))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565909708898793), np.float64(0.0845876202686812))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9703695321196437), np.float64(0.1755729657747447))


In [None]:
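# Choose the second variation (median imputation) as the best among the random forest models.
# Note: the mean- and median-imputation runs above report identical metrics, and the
# constant (-64) run differs from them only marginally, so this choice is not driven by accuracy.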

In [26]:
#Now lets use GridSearchCV to further tune our hyperparameters and see if we can beat the scores above
#Five tree counts spaced logarithmically between 10**1 and 10**3, truncated to integers
n_est = np.logspace(start=1, stop=3, num=5).astype(int).tolist()
#Keys follow the pipeline's '<step name>__<parameter>' convention; a bare step name swaps the whole step
grid_params = dict(
    randomforestregressor__n_estimators=n_est,
    standardscaler=[StandardScaler(), None],
    simpleimputer__strategy=['median'],
)

In [27]:
#Grid search over the median-imputation random forest pipeline; it is named
#`pipe_rf_2` in this notebook (`RF_pipe_2` is never defined and raised a NameError)
rf_grid_cv = GridSearchCV(pipe_rf_2, param_grid=grid_params, cv=5, n_jobs=-1)

In [None]:
# NOTE: This cell has repeatedly caused the notebook and web browser to crash likely due to the high time complexity
# rf_grid_cv.fit(X_train, y_train)

Despite their good accuracy, random forest models have a high runtime cost and will not scale well to larger, more computationally expensive datasets. Thus the random forest model will not be chosen.

### Categorical Boosting (CatBoost)

In [18]:
cat_cv_score_list = []
cat_r2_metric = []
cat_RMSE_metric = []
cat_models = []

#Settings shared by every city: model parameters and the non-feature columns
params = {'iterations': 100, 'learning_rate': 0.1, 'depth': 6}
names_list = ['DATE', 'CITY', 'MONTH']

for city in city_names:
    #X, y split, then remove the non-feature columns (new frames, no in-place edit)
    X_train, y_train, X_test, y_test = select_city(train, test, city)
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Select the model
    cat_model, le_train, le_test = select_model('CatBoost', params)

    #Perform cross validation
    #NOTE(review): an integer cv means unshuffled K-fold for a regressor; the rows look
    #date-ordered, so TimeSeriesSplit (already imported) may be the better fit -- confirm
    cv_scores = cross_validate(cat_model, X_train, y_train, cv=5)
    cat_cv_score_list.append(cv_scores)

    #Fit on the full training split and score both splits
    r2, RMSE = make_prediction(cat_model, X_train, y_train, X_test, y_test)
    cat_r2_metric.append(r2)
    cat_RMSE_metric.append(RMSE)

    #Keep the fitted model; entries follow the order of city_names
    cat_models.append(cat_model)

In [19]:
#Flatten the per-city results, then report the mean/std of every metric
(
    cat_cv_test_scores,
    cat_r2_train_scores,
    cat_r2_test_scores,
    cat_RMSE_train_scores,
    cat_RMSE_test_scores,
) = get_metrics(cat_cv_score_list, cat_r2_metric, cat_RMSE_metric)

print("Accuracy metrics for catboost")
calculate_metrics(
    cat_cv_test_scores,
    cat_r2_train_scores,
    cat_r2_test_scores,
    cat_RMSE_train_scores,
    cat_RMSE_test_scores,
)

Accuracy metrics for catboost
Mean and standard deviation of cross validations: 0.9172513800404888 0.019413591700032158
Mean and standard deviation of R-squared for training set: (np.float64(0.9366215449711363), np.float64(0.01337444669457421))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9231878242859488), np.float64(0.014872004283516487))
Mean and standard deviation of RMSE for training set: (np.float64(1.7964137353016925), np.float64(0.20183859445711802))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9335390997356174), np.float64(0.17087140644086615))


In [28]:
#Show the collected CatBoost models (one entry per city, in city_names order)
cat_models

[<catboost.core.CatBoostRegressor at 0x1dc342e7a40>,
 <catboost.core.CatBoostRegressor at 0x1dc3493d490>,
 <catboost.core.CatBoostRegressor at 0x1dc3493ecc0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493f800>,
 <catboost.core.CatBoostRegressor at 0x1dc3493f650>,
 <catboost.core.CatBoostRegressor at 0x1dc3493f1d0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493eba0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493d6d0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493e8d0>,
 <catboost.core.CatBoostRegressor at 0x1dc7f9d67b0>,
 <catboost.core.CatBoostRegressor at 0x1dc34529fd0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493e2a0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493e3c0>,
 <catboost.core.CatBoostRegressor at 0x1dc342e75c0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493f260>,
 <catboost.core.CatBoostRegressor at 0x1dc3493f920>,
 <catboost.core.CatBoostRegressor at 0x1dc3493f4a0>,
 <catboost.core.CatBoostRegressor at 0x1dc3493d880>]

In [16]:
#Candidate CatBoost settings; a grid search evaluates every combination of these values
grid_params = dict(
    iterations=[50, 100, 200, 500],
    learning_rate=[0.1, 0.05, 0.01],
    depth=[4, 6, 8, 10],
)

In [20]:
# Perform grid search on the catboost model
cat_grid_cv_score_list = []
cat_grid_r2_metric = []
cat_grid_RMSE_metric = []
cat_grids = []

#Non-feature columns removed before fitting
names_list = ['DATE', 'CITY', 'MONTH']

#Pair every city with its baseline model instead of assuming there are exactly 18 of them
for city, base_model in zip(city_names, cat_models):
    #X, y split, then remove the non-feature columns (new frames, no in-place edit)
    X_train, y_train, X_test, y_test = select_city(train, test, city)
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Set up the gridsearch object and find the best parameters
    cat_grid_cv = GridSearchCV(base_model, param_grid=grid_params, cv=5, n_jobs=-1)
    cat_grid_cv.fit(X_train, y_train)
    print(city, "best hyperparameters:", cat_grid_cv.best_params_)

    #Perform cross validation with the best configuration
    cat_best_cv_results = cross_validate(cat_grid_cv.best_estimator_, X_train, y_train, cv=5)
    cat_grid_cv_score_list.append(cat_best_cv_results)

    #hypertuning=True: the best estimator is used as-is for prediction, without fitting it again here
    r2, RMSE = make_prediction(cat_grid_cv.best_estimator_, X_train, y_train, X_test, y_test, hypertuning=True)
    cat_grid_r2_metric.append(r2)
    cat_grid_RMSE_metric.append(RMSE)

    cat_grids.append(cat_grid_cv)

BASEL best hyperparameters: {'depth': 10, 'iterations': 200, 'learning_rate': 0.05}
BUDAPEST best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
DE_BILT best hyperparameters: {'depth': 10, 'iterations': 200, 'learning_rate': 0.05}
DRESDEN best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
DUSSELDORF best hyperparameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.05}
HEATHROW best hyperparameters: {'depth': 6, 'iterations': 500, 'learning_rate': 0.05}
KASSEL best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
LJUBLJANA best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
MAASTRICHT best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
MALMO best hyperparameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.05}
MONTELIMAR best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
MUENCHEN best hyperparameters: {'depth': 10, 'iteration

BASEL best hyperparameters: {'depth': 10, 'iterations': 200, 'learning_rate': 0.05}
BUDAPEST best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
DE_BILT best hyperparameters: {'depth': 10, 'iterations': 200, 'learning_rate': 0.05}
DRESDEN best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
DUSSELDORF best hyperparameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.05}
HEATHROW best hyperparameters: {'depth': 6, 'iterations': 500, 'learning_rate': 0.05}
KASSEL best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
LJUBLJANA best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
MAASTRICHT best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
MALMO best hyperparameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.05}
MONTELIMAR best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
MUENCHEN best hyperparameters: {'depth': 10, 'iterations': 200, 'learning_rate': 0.05}
OSLO best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
PERPIGNAN best hyperparameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.05}
ROMA best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
SONNBLICK best hyperparameters: {'depth': 8, 'iterations': 200, 'learning_rate': 0.05}
STOCKHOLM best hyperparameters: {'depth': 4, 'iterations': 500, 'learning_rate': 0.05}
TOURS best hyperparameters: {'depth': 4, 'iterations': 200, 'learning_rate': 0.1}

In [21]:
#Flatten the per-city results, then report the mean/std of every metric
(
    cat_grid_cv_test_scores,
    cat_grid_r2_train_scores,
    cat_grid_r2_test_scores,
    cat_grid_RMSE_train_scores,
    cat_grid_RMSE_test_scores,
) = get_metrics(cat_grid_cv_score_list, cat_grid_r2_metric, cat_grid_RMSE_metric)

print("Accuracy metrics for catboost after gridsearch hyperparamter tuning")
calculate_metrics(
    cat_grid_cv_test_scores,
    cat_grid_r2_train_scores,
    cat_grid_r2_test_scores,
    cat_grid_RMSE_train_scores,
    cat_grid_RMSE_test_scores,
)

Accuracy metrics for catboost after gridsearch hyperparamter tuning
Mean and standard deviation of cross validations: 0.9185991491203664 0.01944804193727662
Mean and standard deviation of R-squared for training set: (np.float64(0.9446180776416452), np.float64(0.014678802651931527))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9240833478262673), np.float64(0.014856070418597028))
Mean and standard deviation of RMSE for training set: (np.float64(1.6746603170703755), np.float64(0.2262614416507572))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9219318763821436), np.float64(0.17102352810795002))


In [30]:
#Per-city testing R-squared of the tuned CatBoost models (same order as city_names)
cat_grid_r2_test_scores

[0.9313307701113895,
 0.9452371614377152,
 0.9140234709366472,
 0.9203631522540899,
 0.9155703238011654,
 0.9334569499215015,
 0.9215522814328946,
 0.936723156263505,
 0.9020045185782191,
 0.9290286721438842,
 0.9264788071326779,
 0.9295888051961296,
 0.9436389065652373,
 0.8949041663977891,
 0.9479314016985741,
 0.9113252575252039,
 0.9285674970680999,
 0.9017749624080885]

Accuracy metrics for catboost after gridsearch hyperparamter tuning
Mean and standard deviation of cross validations: 0.9185991491203664 0.01944804193727662
Mean and standard deviation of R-squared for training set: (np.float64(0.9446180776416452), np.float64(0.014678802651931527))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9240833478262673), np.float64(0.014856070418597028))
Mean and standard deviation of RMSE for training set: (np.float64(1.6746603170703755), np.float64(0.2262614416507572))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9219318763821436), np.float64(0.17102352810795002))

The results above seem promising, and computation-wise, CatBoost is much faster than random forest.

### Light Gradient Boosting Machine (LightGBM)

In [16]:
import sys
#Install LightGBM using the same interpreter that runs this kernel, then make it available as `lgb`
!{sys.executable} -m pip install lightgbm
import lightgbm as lgb


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\tanks\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip




In [17]:
light_cv_score_list = []
light_r2_metric = []
light_RMSE_metric = []
light_models = []

#Settings shared by every city: model parameters and the non-feature columns
params = {'num_leaves': 10, 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 5}
names_list = ['DATE', 'CITY', 'MONTH']

for city in city_names:
    #X, y split, then remove the non-feature columns (new frames, no in-place edit)
    X_train, y_train, X_test, y_test = select_city(train, test, city)
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Select the model
    light_model, le_train, le_test = select_model('LightGBM', params)

    #Perform cross validation
    #NOTE(review): an integer cv means unshuffled K-fold for a regressor; the rows look
    #date-ordered, so TimeSeriesSplit (already imported) may be the better fit -- confirm
    cv_scores = cross_validate(light_model, X_train, y_train, cv=5)
    light_cv_score_list.append(cv_scores)

    #Fit on the full training split and score both splits
    r2, RMSE = make_prediction(light_model, X_train, y_train, X_test, y_test)
    light_r2_metric.append(r2)
    light_RMSE_metric.append(RMSE)

    #Keep the fitted model; entries follow the order of city_names
    light_models.append(light_model)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1599
[LightGBM] [Info] Number of data points in the train set: 2337, number of used features: 9
[LightGBM] [Info] Start training from score 10.878006
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1598
[LightGBM] [Info] Number of data points in the train set: 2337, number of used features: 9
[LightGBM] [Info] Start training from score 11.380274
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1588
[LightGBM] [Info] Number of data points in the train set: 2338, number of used features: 9
[LightGBM] [Info] Start trainin

In [18]:
#Flatten the per-city results, then report the mean/std of every metric
(
    light_cv_test_scores,
    light_r2_train_scores,
    light_r2_test_scores,
    light_RMSE_train_scores,
    light_RMSE_test_scores,
) = get_metrics(light_cv_score_list, light_r2_metric, light_RMSE_metric)

print("Accuracy metrics for lightgbm")
calculate_metrics(
    light_cv_test_scores,
    light_r2_train_scores,
    light_r2_test_scores,
    light_RMSE_train_scores,
    light_RMSE_test_scores,
)

Accuracy metrics for lightgbm
Mean and standard deviation of cross validations: 0.917369802366846 0.020247899898922776
Mean and standard deviation of R-squared for training set: (np.float64(0.9438240192208459), np.float64(0.012229443268340806))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9233835766548757), np.float64(0.015135363772859681))
Mean and standard deviation of RMSE for training set: (np.float64(1.690231650397576), np.float64(0.19079817271216706))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9307142275504126), np.float64(0.17724529168536374))


In [19]:
#Show the collected LightGBM models (one entry per city, in city_names order)
light_models

[LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10),
 LGBMRegressor(max_depth=5, num_leaves=10)]

In [20]:
# Begin hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

In [21]:
# Candidate values sampled by the randomized search for each LightGBM model.
# The full grid holds 5 * 4 * 3 * 4 = 240 combinations.
rs_params = dict(
    num_leaves=[5, 10, 30, 50, 100],
    learning_rate=[0.5, 0.1, 0.05, 0.01],
    n_estimators=[100, 200, 500],
    max_depth=[-1, 3, 5, 10],  # LightGBM treats max_depth <= 0 as "no limit"
)

In [22]:
# Perform random search on the lightgbm model, one search per city.
#
# Results collected for the later summary cells:
#   light_rs_cv_score_list - cross_validate() output of each city's best estimator
#   light_rs_r2_metric     - R-squared value(s) returned by make_prediction per city
#   light_rs_RMSE_metric   - RMSE value(s) returned by make_prediction per city
#   light_rss              - the fitted RandomizedSearchCV object of each city

# Without a fixed seed RandomizedSearchCV samples different candidates on every
# run, so the "best hyperparameters" (and every metric derived from them)
# change between executions of this cell.
RANDOM_SEED = 42

# Identifier columns that are removed before fitting.
names_list = ['DATE', 'CITY', 'MONTH']

light_rs_cv_score_list = []
light_rs_r2_metric = []
light_rs_RMSE_metric = []
light_rss = []

# light_models[i] is the baseline estimator for city_names[i]; zipping the two
# keeps that pairing without hard-coding the number of cities.
for city, base_model in zip(city_names, light_models):
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    # Build new frames instead of dropping in place, so the objects returned by
    # select_city are never mutated behind its back.
    X_train = X_train.drop(columns=names_list)
    X_test = X_test.drop(columns=names_list)

    #Set up the random search object
    # NOTE(review): cv=5 is scikit-learn's default unshuffled K-fold for
    # regressors; for time-ordered weather data TimeSeriesSplit (already
    # imported at the top of the notebook) may be the intended splitter - confirm.
    light_rs_cv = RandomizedSearchCV(
        base_model,
        rs_params,
        n_iter=60,
        cv=5,
        n_jobs=-1,
        random_state=RANDOM_SEED,
    )

    #Find the best parameters
    light_rs_cv.fit(X_train, y_train)
    print(city, "best hyperparameters:", light_rs_cv.best_params_)

    #Perform cross validation
    light_best_cv_results = cross_validate(light_rs_cv.best_estimator_, X_train, y_train, cv=5)
    light_rs_cv_score_list.append(light_best_cv_results)

    r2, RMSE = make_prediction(light_rs_cv.best_estimator_, X_train, y_train, X_test, y_test, hypertuning=True)
    light_rs_r2_metric.append(r2)
    light_rs_RMSE_metric.append(RMSE)

    light_rss.append(light_rs_cv)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1617
[LightGBM] [Info] Number of data points in the train set: 2922, number of used features: 9
[LightGBM] [Info] Start training from score 11.034326
BASEL best hyperparameters: {'num_leaves': 10, 'n_estimators': 200, 'max_depth': -1, 'learning_rate': 0.05}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1599
[LightGBM] [Info] Number of data points in the train set: 2337, number of used features: 9
[LightGBM] [Info] Start training from score 10.878006
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1598
[LightGBM] [In

BASEL best hyperparameters: {'num_leaves': 10, 'n_estimators': 500, 'max_depth': -1, 'learning_rate': 0.01}
BUDAPEST best hyperparameters: {'num_leaves': 10, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}
DE_BILT best hyperparameters: {'num_leaves': 5, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05}
DRESDEN best hyperparameters: {'num_leaves': 100, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.01}
DUSSELDORF best hyperparameters: {'num_leaves': 100, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05}
HEATHROW best hyperparameters: {'num_leaves': 5, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05}
KASSEL best hyperparameters: {'num_leaves': 5, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.1}
LJUBLJANA best hyperparameters: {'num_leaves': 10, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05}
MAASTRICHT best hyperparameters: {'num_leaves': 5, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}
MALMO best hyperparameters: {'num_leaves': 100, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}
MONTELIMAR best hyperparameters: {'num_leaves': 100, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05}
MUENCHEN best hyperparameters: {'num_leaves': 30, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05}
OSLO best hyperparameters: {'num_leaves': 10, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}
PERPIGNAN best hyperparameters: {'num_leaves': 10, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05}
ROMA best hyperparameters: {'num_leaves': 100, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}
SONNBLICK best hyperparameters: {'num_leaves': 30, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.01}
STOCKHOLM best hyperparameters: {'num_leaves': 100, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}
TOURS best hyperparameters: {'num_leaves': 50, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}

In [23]:
# Aggregate the per-city results of the randomized search into five score
# collections, then report their mean and standard deviation.
light_rs_summary = get_metrics(
    light_rs_cv_score_list,
    light_rs_r2_metric,
    light_rs_RMSE_metric,
)
(
    light_rs_cv_test_scores,
    light_rs_r2_train_scores,
    light_rs_r2_test_scores,
    light_rs_RMSE_train_scores,
    light_rs_RMSE_test_scores,
) = light_rs_summary

print("Accuracy metrics for lightgbm after randomsearch hyperparamter tuning")
calculate_metrics(
    light_rs_cv_test_scores,
    light_rs_r2_train_scores,
    light_rs_r2_test_scores,
    light_rs_RMSE_train_scores,
    light_rs_RMSE_test_scores,
)

Accuracy metrics for lightgbm after randomsearch hyperparamter tuning
Mean and standard deviation of cross validations: 0.9179420188718316 0.020094391818457878
Mean and standard deviation of R-squared for training set: (np.float64(0.9437145742367584), np.float64(0.012324052314926797))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9235249076787192), np.float64(0.01537232667287275))
Mean and standard deviation of RMSE for training set: (np.float64(1.6943685665522934), np.float64(0.20394300384435515))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9283415503763786), np.float64(0.17631756993434292))


In [27]:
# Per-city metrics after the randomized search, one line per city in the order
# the cities were tuned: train R-squared, test R-squared, train RMSE, test RMSE.
# Iterating over the score collections themselves removes the hard-coded city
# count, so the cell keeps working if the number of cities changes.
for city_scores in zip(
    light_rs_r2_train_scores,
    light_rs_r2_test_scores,
    light_rs_RMSE_train_scores,
    light_rs_RMSE_test_scores,
):
    print(*city_scores)

0.9479359410966095 0.9314329301366876 1.6949689797515204 1.9256353882723218
0.9564358844834195 0.9443753168435423 1.838448559334197 1.9905139671283454
0.946411443498179 0.9115976886551772 1.4307236621865262 1.8516184307229415
0.946070951415414 0.9167725295459257 1.8408269529695556 2.1788423447254623
0.9407301881877407 0.9156756101419317 1.6301490701636923 1.9313446451281708
0.9611764246084639 0.9328326212810363 1.1064752112009844 1.4453715048430533
0.9400531116115541 0.9194307667044769 1.7615850150303733 2.054008572213445
0.9531990344504258 0.9374716040433935 1.790684113142183 2.0302309543757935
0.9216532362127329 0.9009866346433425 1.8521506382909627 2.0585125853022883
0.9375181530061726 0.9306282979477556 1.7384182565716788 1.7541440778954087
0.9525495076762601 0.9240586664543635 1.5759670285876024 1.934700323495245
0.9537761777251137 0.9307236562212304 1.709980659610902 2.0247369980015058
0.94858661532779 0.9438800215748595 1.8226389235314475 1.844356201056823
0.9236269836795062 0.8

Accuracy metrics for lightgbm after randomsearch hyperparamter tuning
Mean and standard deviation of cross validations: 0.9180053284299741 0.01987337934091608
Mean and standard deviation of R-squared for training set: (np.float64(0.9425033680636711), np.float64(0.012803927943043482))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9235298037238778), np.float64(0.015249725231646536))
Mean and standard deviation of RMSE for training set: (np.float64(1.7093685534867753), np.float64(0.192974451921603))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9288062896766358), np.float64(0.17899232406025684))

### Final Comparison

In [50]:
# Aggregate the per-city results of the CatBoost grid search into five score
# collections, then report their mean and standard deviation.
cat_grid_summary = get_metrics(
    cat_grid_cv_score_list,
    cat_grid_r2_metric,
    cat_grid_RMSE_metric,
)
(
    cat_grid_cv_test_scores,
    cat_grid_r2_train_scores,
    cat_grid_r2_test_scores,
    cat_grid_RMSE_train_scores,
    cat_grid_RMSE_test_scores,
) = cat_grid_summary

print("Accuracy metrics for catboost after gridsearch hyperparamter tuning")
calculate_metrics(
    cat_grid_cv_test_scores,
    cat_grid_r2_train_scores,
    cat_grid_r2_test_scores,
    cat_grid_RMSE_train_scores,
    cat_grid_RMSE_test_scores,
)

Accuracy metrics for catboost after gridsearch hyperparamter tuning
Mean and standard deviation of cross validations: 0.9185991491203664 0.01944804193727662
Mean and standard deviation of R-squared for training set: (np.float64(0.9446180776416452), np.float64(0.014678802651931527))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9240833478262673), np.float64(0.014856070418597028))
Mean and standard deviation of RMSE for training set: (np.float64(1.6746603170703755), np.float64(0.2262614416507572))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9219318763821436), np.float64(0.17102352810795002))


In [51]:
# Recompute the tuned LightGBM summary so it is printed next to the CatBoost
# summary for the final comparison.
light_rs_comparison_summary = get_metrics(
    light_rs_cv_score_list,
    light_rs_r2_metric,
    light_rs_RMSE_metric,
)
(
    light_rs_cv_test_scores,
    light_rs_r2_train_scores,
    light_rs_r2_test_scores,
    light_rs_RMSE_train_scores,
    light_rs_RMSE_test_scores,
) = light_rs_comparison_summary

print("Accuracy metrics for lightgbm after randomsearch hyperparamter tuning")
calculate_metrics(
    light_rs_cv_test_scores,
    light_rs_r2_train_scores,
    light_rs_r2_test_scores,
    light_rs_RMSE_train_scores,
    light_rs_RMSE_test_scores,
)

Accuracy metrics for lightgbm after randomsearch hyperparamter tuning
Mean and standard deviation of cross validations: 0.9180053284299741 0.01987337934091608
Mean and standard deviation of R-squared for training set: (np.float64(0.9425033680636711), np.float64(0.012803927943043482))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9235298037238778), np.float64(0.015249725231646536))
Mean and standard deviation of RMSE for training set: (np.float64(1.7093685534867753), np.float64(0.192974451921603))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9288062896766358), np.float64(0.17899232406025684))


### Conclusion

CatBoost and LightGBM performed very close to each other, but the better model appears to be CatBoost, with marginally better values for R-squared, MAE, and RMSE. In addition, CatBoost models are generally easier to work with, as they require less data preprocessing and hyperparameter tuning than LightGBM; they are also optimized for speed and memory.

However, it is important to note that LightGBM should not be discarded in this case; it can serve as a backup to CatBoost. In terms of scalability, LightGBM is stronger. If the CatBoost model is well received and is later expanded to cover many more locations and to handle many more weather measurements (and therefore more features/columns), CatBoost's training time may start to become a bottleneck, and LightGBM would step in as the alternative.

### Save the Models

In [52]:
import catboost
import lightgbm
import datetime

In [53]:
from library.sb_utils import save_file

In [54]:
# CatBoost Model
#model_1 = cat_grid_cv.best_estimator_
#model_1.version = 1.0
#model_1.pandas_version = pd.__version__
#model_1.numpy_version = np.__version__
#model_1.catboost_version = catboost.__version__
#model_1.X_columns = [col for col in X_train.columns]
#model_1.build_datetime = datetime.datetime.now()

In [55]:
#modelpath = 'models'
#save_file(model_1, 'weather_forecaster_temperature_model_catboost.pkl', modelpath)

In [57]:
# Tag and save the tuned CatBoost model of every city.
#
# The original notebook repeated the same two cells eighteen times, saving
# cat_grids[0] as BASEL through cat_grids[17] as TOURS. That is the order of
# city_names, so pairing the two sequences produces the same eighteen files
# from a single loop.
modelpath = 'models/catboost_submodels'

# NOTE(review): X_train is whatever the most recently executed tuning loop left
# behind (the last city it processed). Recording its columns for every model
# assumes all cities were trained on the same feature columns - confirm, or
# capture the columns per city at fit time.
feature_columns = list(X_train.columns)

# city name -> saved estimator, kept so later cells can still reach each model.
catboost_city_models = {}

for city, cat_search in zip(city_names, cat_grids):
    city_model = cat_search.best_estimator_

    # Metadata stored on the estimator before it is written out: model version,
    # library versions, expected input columns and build time.
    city_model.version = 1.0
    city_model.pandas_version = pd.__version__
    city_model.numpy_version = np.__version__
    city_model.catboost_version = catboost.__version__
    city_model.X_columns = list(feature_columns)
    city_model.build_datetime = datetime.datetime.now()

    save_file(city_model, f'weather_forecaster_temperature_model_catboost_{city}.pkl', modelpath)
    catboost_city_models[city] = city_model


In [93]:
# LightGBM Model
#model_2 = light_rs_cv.best_estimator_
#model_2.version = 1.0
#model_2.pandas_version = pd.__version__
#model_2.numpy_version = np.__version__
#model_2.lightgbm_version = lightgbm.__version__
#model_2.X_columns = [col for col in X_train.columns]
#model_2.build_datetime = datetime.datetime.now()

In [94]:
#modelpath = 'models'
#save_file(model_2, 'weather_forecaster_temperature_model_lightgbm.pkl', modelpath)

In [95]:
# Tag and save the tuned LightGBM model of every city.
#
# light_rss[i] is the fitted randomized search for city_names[i] (both were
# filled in the same loop order), so pairing the two sequences produces the
# same eighteen files that the original copy-pasted cells wrote one by one.
modelpath = 'models/lightgbm_submodels'

# NOTE(review): X_train is whatever the most recently executed tuning loop left
# behind (the last city it processed). Recording its columns for every model
# assumes all cities were trained on the same feature columns - confirm, or
# capture the columns per city at fit time.
feature_columns = list(X_train.columns)

# city name -> saved estimator, kept so later cells can still reach each model.
lightgbm_city_models = {}

for city, light_search in zip(city_names, light_rss):
    city_model = light_search.best_estimator_

    # Metadata stored on the estimator before it is written out: model version,
    # library versions, expected input columns and build time.
    city_model.version = 1.0
    city_model.pandas_version = pd.__version__
    city_model.numpy_version = np.__version__
    city_model.lightgbm_version = lightgbm.__version__
    city_model.X_columns = list(feature_columns)
    city_model.build_datetime = datetime.datetime.now()

    save_file(city_model, f'weather_forecaster_temperature_model_lightgbm_{city}.pkl', modelpath)
    lightgbm_city_models[city] = city_model
