# Data Science Capstone 2

## Weather Forecaster

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import TimeSeriesSplit, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [3]:
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [4]:
#Load the datasets
weather_dataset = pd.read_csv("modified_data/weather_prediction_dataset_finalized.csv")

#This dataset is optional and provides a template showing all the times the weather is appropriate for a picnic
weather_for_picnic = pd.read_csv("raw_data/weather_prediction_bbq_labels.csv")

In [5]:
#Initial details of the dataset
print("Shape of dataset:", weather_dataset.shape)
weather_dataset.head(30)

Shape of dataset: (65754, 14)


Unnamed: 0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,BASEL,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,DUSSELDORF,1,8.0,0.92,1.024,0.12,0.22,0.0,4.2,2.5,6.9,6.5,2.5,5.9
2,HEATHROW,1,7.0,0.94,1.0245,0.18,0.0,0.4,7.0,4.9,10.8,7.9,,
3,KASSEL,1,,0.93,1.0237,0.06,0.13,0.0,3.5,1.5,5.0,2.3,2.5,8.2
4,LJUBLJANA,1,6.0,0.83,1.0294,0.57,0.0,5.2,-4.8,-9.1,-1.3,-0.9,0.4,
5,DE_BILT,1,7.0,0.97,1.024,0.11,0.1,0.0,6.1,3.5,8.1,7.3,2.5,8.0
6,MAASTRICHT,1,8.0,0.98,1.0251,0.06,0.17,0.0,5.6,4.1,6.9,6.2,3.1,7.0
7,MALMO,1,,,,,0.27,,2.9,0.9,3.6,3.7,2.5,
8,TOURS,1,,0.97,1.0275,0.25,0.04,,8.5,7.2,9.8,7.9,1.6,
9,MUENCHEN,1,8.0,0.91,1.0273,0.2,0.2,0.0,1.7,-0.5,2.6,1.9,2.6,9.4


In [6]:
tscv = TimeSeriesSplit(n_splits=5)
for train_ind, test_ind in tscv.split(weather_dataset):
    train, test = weather_dataset.iloc[train_ind], weather_dataset.iloc[test_ind]

In [7]:
X_train = train.drop(columns='target')
y_train = train.target
X_test = test.drop(columns='target')
y_test = test.target

In [8]:
names_list = ['CITY', 'MONTH']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((54795, 11), (10959, 11))

In [9]:
X_train.dtypes

cloud_cover         float64
humidity            float64
pressure            float64
global_radiation    float64
precipitation       float64
sunshine            float64
temp_mean           float64
temp_min            float64
temp_max            float64
wind_speed          float64
wind_gust           float64
dtype: object

### Random Forest Models

In [10]:
RF_pipe_1 = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestRegressor(random_state=5)
)

RF_pipe_2 = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    RandomForestRegressor(random_state=5)
)

RF_pipe_3 = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-64),
    StandardScaler(),
    RandomForestRegressor(random_state=5)
)

In [11]:
cv_scores_rf_1 = cross_validate(RF_pipe_1, X_train, y_train, cv=5)
cv_scores_rf_2 = cross_validate(RF_pipe_2, X_train, y_train, cv=5)
cv_scores_rf_3 = cross_validate(RF_pipe_3, X_train, y_train, cv=5)

In [12]:
print(cv_scores_rf_1['test_score'])
print(cv_scores_rf_2['test_score'])
print(cv_scores_rf_3['test_score'])

[0.93721296 0.93302379 0.94225063 0.9523483  0.92566994]
[0.93775133 0.93313105 0.94263014 0.95246923 0.92563338]
[0.93771271 0.93337238 0.94294831 0.95241291 0.92542599]


In [13]:
RF_pipe_1.fit(X_train, y_train)

In [14]:
RF_pipe_2.fit(X_train, y_train)

In [15]:
RF_pipe_3.fit(X_train, y_train)

In [16]:
y_train_RF_pred_1 = RF_pipe_1.predict(X_train)
y_test_RF_pred_1 = RF_pipe_1.predict(X_test)

In [17]:
y_train_RF_pred_2 = RF_pipe_2.predict(X_train)
y_test_RF_pred_2 = RF_pipe_2.predict(X_test)

In [18]:
y_train_RF_pred_3 = RF_pipe_3.predict(X_train)
y_test_RF_pred_3 = RF_pipe_3.predict(X_test)

In [19]:
r2 = r2_score(y_train, y_train_RF_pred_1), r2_score(y_test, y_test_RF_pred_1)
RMSE = mean_squared_error(y_train, y_train_RF_pred_1, squared=False), mean_squared_error(y_test, y_test_RF_pred_1, squared=False)
print("R-squared for random forest model with mean imputation:", r2)
print("RMSE for random forest model with mean imputation:", RMSE)

R-squared for random forest model with mean imputation: (0.9918980196643992, 0.9452430548554733)
RMSE for random forest model with mean imputation: (np.float64(0.7550069341702312), np.float64(1.9756860341416411))




In [20]:
r2 = r2_score(y_train, y_train_RF_pred_2), r2_score(y_test, y_test_RF_pred_2)
RMSE = mean_squared_error(y_train, y_train_RF_pred_2, squared=False), mean_squared_error(y_test, y_test_RF_pred_2, squared=False)
print("R-squared for random forest model with median imputation:", r2)
print("RMSE for random forest model with median imputation:", RMSE)

R-squared for random forest model with median imputation: (0.9919250737834931, 0.9453795834362846)
RMSE for random forest model with median imputation: (np.float64(0.753745321153916), np.float64(1.9732214521822253))




In [21]:
r2 = r2_score(y_train, y_train_RF_pred_3), r2_score(y_test, y_test_RF_pred_3)
RMSE = mean_squared_error(y_train, y_train_RF_pred_3, squared=False), mean_squared_error(y_test, y_test_RF_pred_3, squared=False)
print("R-squared for random forest model with constant (-64) imputation:", r2)
print("RMSE for random forest model with mean constant (-64) imputation:", RMSE)

R-squared for random forest model with constant (-64) imputation: (0.9919195361057389, 0.945112961396495)
RMSE for random forest model with mean constant (-64) imputation: (np.float64(0.7540037311512573), np.float64(1.9780315936765587))




In [22]:
rf1_neg_mae = cross_validate(RF_pipe_1, X_train, y_train, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
rf1_mae_mean = np.mean(-1 * rf1_neg_mae['test_score'])
rf1_mae_std = np.std(-1 * rf1_neg_mae['test_score'])
mean_absolute_error(y_test, RF_pipe_1.predict(X_test))

np.float64(1.5225099917875717)

In [23]:
rf2_neg_mae = cross_validate(RF_pipe_2, X_train, y_train, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
rf2_mae_mean = np.mean(-1 * rf2_neg_mae['test_score'])
rf2_mae_std = np.std(-1 * rf2_neg_mae['test_score'])
mean_absolute_error(y_test, RF_pipe_2.predict(X_test))

np.float64(1.5205956747878455)

In [24]:
rf3_neg_mae = cross_validate(RF_pipe_3, X_train, y_train, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
rf3_mae_mean = np.mean(-1 * rf3_neg_mae['test_score'])
rf3_mae_std = np.std(-1 * rf3_neg_mae['test_score'])
mean_absolute_error(y_test, RF_pipe_3.predict(X_test))

np.float64(1.5263660005474953)

In [None]:
# Choose the second variation with the median imputation as the best among the random forest models

In [26]:
#Now lets use GridSearchCV to further tune our hyperparameters and see if we can beat the scores above
n_est = [int(n) for n in np.logspace(start=1, stop=3, num=5)]
grid_params = {
        'randomforestregressor__n_estimators': n_est,
        'standardscaler': [StandardScaler(), None],
        'simpleimputer__strategy': ['median']
}

In [27]:
rf_grid_cv = GridSearchCV(RF_pipe_2, param_grid=grid_params, cv=5, n_jobs=-1)

In [None]:
# NOTE: This cell has repeatedly caused the notebook and web browser to crash likely due to the high time complexity
# rf_grid_cv.fit(X_train, y_train)

In [None]:
rf_grid_cv.best_params_

In [None]:
rf_best_cv_results = cross_validate(rf_grid_cv.best_estimator_, X_train, y_train, cv=5)
rf_best_cv_mean = np.mean(rf_best_cv_results['test_score'])
rf_best_cv_std = np.std(rf_best_cv_results['test_score'])
print(rf_best_cv_mean, rf_best_cv_std)
mean_absolute_error(y_test, rf_grid_cv.best_estimator_.predict(X_test))

Due to the high runtime cost of random forest models, despite having good accuracy values, they will not scale well on larger and more computationaly expensive datasets. Thus the random forest model will not be chosen.

### Categorical Boosting (CatBoost)

In [28]:
cat_model = CatBoostRegressor(
    iterations=100,      
    learning_rate=0.1,   
    depth=6,              
    verbose=0
)

In [29]:
cat_model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x25c70264830>

In [30]:
y_train_cat_pred = cat_model.predict(X_train)
y_test_cat_pred = cat_model.predict(X_test)

In [31]:
r2 = r2_score(y_train, y_train_cat_pred), r2_score(y_test, y_test_cat_pred)
RMSE = mean_squared_error(y_train, y_train_cat_pred, squared=False), mean_squared_error(y_test, y_test_cat_pred, squared=False)
print("R-squared for catboost:", r2)
print("RMSE for catboost:", RMSE)

R-squared for catboost: (0.9421375016537908, 0.9454468143248129)
RMSE for catboost: (np.float64(2.0176876633538887), np.float64(1.9720066840846475))




In [32]:
cv_scores_cat = cross_validate(cat_model, X_train, y_train, cv=5)

In [33]:
print(cv_scores_cat['test_score'])

[0.93810386 0.93358567 0.94365287 0.95231267 0.9257328 ]


In [34]:
grid_params = {
    'iterations': [50, 100, 200, 500],
    'learning_rate': [0.1, 0.05, 0.01],
    'depth': [4, 6, 8, 10]
}

In [35]:
cat_grid_cv = GridSearchCV(cat_model, param_grid=grid_params, cv=5, n_jobs=-1)

In [36]:
cat_grid_cv.fit(X_train, y_train)

In [37]:
cat_grid_cv.best_params_

{'depth': 10, 'iterations': 500, 'learning_rate': 0.05}

In [38]:
cat_best_cv_results = cross_validate(cat_grid_cv.best_estimator_, X_train, y_train, cv=5)
cat_best_cv_mean = np.mean(cat_best_cv_results['test_score'])
cat_best_cv_std = np.std(cat_best_cv_results['test_score'])
print(cat_best_cv_mean, cat_best_cv_std)
mean_absolute_error(y_test, cat_grid_cv.best_estimator_.predict(X_test))

0.9419834636490719 0.008410206152297347


np.float64(1.471007974830146)

In [39]:
print(cat_best_cv_results['test_score'])

[0.94096289 0.93735785 0.94710464 0.95459213 0.9298998 ]


Results above seem promising, and computation wise, it is much faster than random forest.

### Light Gradient Boosting Machine (LightGBM)

In [40]:
import sys
!{sys.executable} -m pip install lightgbm
import lightgbm as lgb


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\tanks\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 20.8 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [55]:
light_model = lgb.LGBMRegressor()

In [56]:
params = {
    'num_leaves': 10,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 5
}

In [57]:
light_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2176
[LightGBM] [Info] Number of data points in the train set: 54795, number of used features: 11
[LightGBM] [Info] Start training from score 10.207930


In [58]:
y_train_light_pred = light_model.predict(X_train)
y_test_light_pred = light_model.predict(X_test)

In [59]:
r2 = r2_score(y_train, y_train_light_pred), r2_score(y_test, y_test_light_pred)
RMSE = mean_squared_error(y_train, y_train_cat_pred, squared=False), mean_squared_error(y_test, y_test_cat_pred, squared=False)
print("R-squared for LightGBM:", r2)
print("RMSE for LightGBM:", RMSE)

R-squared for LightGBM: (0.9486090965502151, 0.9474510198704363)
RMSE for LightGBM: (np.float64(2.0176876633538887), np.float64(1.9720066840846475))




In [60]:
cv_scores_light = cross_validate(light_model, X_train, y_train, cv=5)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2168
[LightGBM] [Info] Number of data points in the train set: 43836, number of used features: 11
[LightGBM] [Info] Start training from score 10.033940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2165
[LightGBM] [Info] Number of data points in the train set: 43836, number of used features: 11
[LightGBM] [Info] Start training from score 10.569797
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2173
[LightGBM] [Info] Number of data points in the train set: 43836, number of used features: 11
[LightGBM] [Info] Start t

In [61]:
print(cv_scores_light['test_score'])

[0.94027629 0.93575938 0.94537752 0.95400454 0.92854058]


In [62]:
# Begin hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

In [63]:
rs_params = {
    'num_leaves': [5, 10, 30, 50, 100],
    'learning_rate': [0.5, 0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 3, 5, 10]
}

In [65]:
light_rs_cv = RandomizedSearchCV(light_model, rs_params, n_iter=60, cv=5, n_jobs=-1)

In [66]:
light_rs_cv.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003697 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2176
[LightGBM] [Info] Number of data points in the train set: 54795, number of used features: 11
[LightGBM] [Info] Start training from score 10.207930


In [67]:
light_rs_cv.best_params_

{'num_leaves': 50, 'n_estimators': 500, 'max_depth': -1, 'learning_rate': 0.05}

In [68]:
light_best_cv_results = cross_validate(light_rs_cv.best_estimator_, X_train, y_train, cv=5)
light_best_cv_mean = np.mean(light_best_cv_results['test_score'])
light_best_cv_std = np.std(light_best_cv_results['test_score'])
print(light_best_cv_mean, light_best_cv_std)
mean_absolute_error(y_test, light_rs_cv.best_estimator_.predict(X_test))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2168
[LightGBM] [Info] Number of data points in the train set: 43836, number of used features: 11
[LightGBM] [Info] Start training from score 10.033940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2165
[LightGBM] [Info] Number of data points in the train set: 43836, number of used features: 11
[LightGBM] [Info] Start training from score 10.569797
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2173
[LightGBM] [Info] Number of data points in the train set: 43836, number of used features: 11
[LightGBM] [Info] Start t

np.float64(1.481699419533026)

In [70]:
cat_best_cv_results['test_score']

array([0.94096289, 0.93735785, 0.94710464, 0.95459213, 0.9298998 ])

In [71]:
light_best_cv_results['test_score']

array([0.94116684, 0.93629048, 0.94665048, 0.95415027, 0.92961633])

### Final Comparision

In [73]:
best_cat_model = cat_grid_cv.best_estimator_
y_best_cat_pred = best_cat_model.predict(X_test)
print("CatBoost mean and standard deviation:", cat_best_cv_mean, cat_best_cv_std)
print("R-squared value:", r2_score(y_test, y_best_cat_pred))
print("Mean absolute error:", mean_absolute_error(y_test, y_best_cat_pred))
print("Root mean squared error:", mean_squared_error(y_test, y_best_cat_pred, squared=False))

CatBoost mean and standard deviation: 0.9419834636490719 0.008410206152297347
R-squared value: 0.949058117603792
Mean absolute error: 1.471007974830146
Root mean squared error: 1.9056178788253324




In [74]:
best_light_model = light_rs_cv.best_estimator_
y_best_light_pred = best_light_model.predict(X_test)
print("LightGBM mean and standard deviation:", light_best_cv_mean, light_best_cv_std)
print("R-squared value:", r2_score(y_test, y_best_light_pred))
print("Mean absolute error:", mean_absolute_error(y_test, y_best_light_pred))
print("Root mean squared error:", mean_squared_error(y_test, y_best_light_pred, squared=False))

LightGBM mean and standard deviation: 0.9415748809995007 0.008426157487296559
R-squared value: 0.947947851706846
Mean absolute error: 1.481699419533026
Root mean squared error: 1.9262721848800406




### Conclusion

Though between CatBoost and LightGBM, each model performed well close of each other, the better model seems to be the CatBoost model. Marginal improved values on R-squared, MAE, and RMSE. In addition, CatBoost models are generally easier to handle as they require less data preprocessing and hyperparamter tuning compared to LightGBM, they are also optimized for speed and memory.

However, it is important to note that LightGBM should not be discarded in this case, but can be seen as a backup to CatBoost. In terms of scalibility, LightGBM is stronger. If the (CatBoost) model is well received and is planned upon to be expanded to cover many more locations as well as handle much more weather measurements (thus more features/columns), then CatBoost might begin to drag on training time, and LightGBM would step in as the alternative.

### Save the Models

In [79]:
import catboost
import lightgbm
import datetime

In [82]:
from library.sb_utils import save_file

In [80]:
# CatBoost Model
model_1 = cat_grid_cv.best_estimator_
model_1.version = 1.0
model_1.pandas_version = pd.__version__
model_1.numpy_version = np.__version__
model_1.catboost_version = catboost.__version__
model_1.X_columns = [col for col in X_train.columns]
model_1.build_datetime = datetime.datetime.now()

In [84]:
modelpath = 'models'
save_file(model_1, 'weather_forecaster_temperature_model_catboost.pkl', modelpath)

Writing file.  "models\weather_forecaster_temperature_model_catboost.pkl"


In [85]:
# LightGBM Model
model_2 = light_rs_cv.best_estimator_
model_2.version = 1.0
model_2.pandas_version = pd.__version__
model_2.numpy_version = np.__version__
model_2.lightgbm_version = lightgbm.__version__
model_2.X_columns = [col for col in X_train.columns]
model_2.build_datetime = datetime.datetime.now()

In [86]:
modelpath = 'models'
save_file(model_2, 'weather_forecaster_temperature_model_lightgbm.pkl', modelpath)

Writing file.  "models\weather_forecaster_temperature_model_lightgbm.pkl"
