# Data Science Capstone 2

## Weather Forecaster

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import os

In [3]:
#Load the datasets
weather_dataset = pd.read_csv("modified_data/weather_prediction_dataset_cleaned.csv")

#This dataset is optional and provides a template showing all the times the weather is appropriate for a picnic
weather_for_picnic = pd.read_csv("raw_data/weather_prediction_bbq_labels.csv")

In [4]:
#Initial details of the dataset
print("Shape of dataset:", weather_dataset.shape)
weather_dataset.head()

Shape of dataset: (3654, 165)


Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,STOCKHOLM_temp_min,STOCKHOLM_temp_max,TOURS_wind_speed,TOURS_humidity,TOURS_pressure,TOURS_global_radiation,TOURS_precipitation,TOURS_temp_mean,TOURS_temp_min,TOURS_temp_max
0,20000101,1,8,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,...,-9.3,0.7,1.6,0.97,1.0275,0.25,0.04,8.5,7.2,9.8
1,20000102,1,8,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,...,0.5,2.0,2.0,0.99,1.0293,0.17,0.16,7.9,6.6,9.2
2,20000103,1,5,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,...,-1.0,2.8,3.4,0.91,1.0267,0.27,0.0,8.1,6.6,9.6
3,20000104,1,7,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,...,2.5,4.6,4.9,0.95,1.0222,0.11,0.44,8.6,6.4,10.8
4,20000105,1,5,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,...,-1.8,2.9,3.6,0.95,1.0209,0.39,0.04,8.0,6.4,9.5


## Features to be used for mean temperature prediction
- MONTH
- temp_mean
- temp_max
- temp_min
- global_radiation
- sunshine
- humidity (negative correlation)

In [5]:
city_names = ["BASEL", "BUDAPEST", "DE_BILT", "DRESDEN", "DUSSELDORF", "HEATHROW", "KASSEL", "LJUBLJANA", "MAASTRICHT",
              "MALMO", "MONTELIMAR", "MUENCHEN", "OSLO", "PERPIGNAN", "ROMA", "SONNBLICK", "STOCKHOLM", "TOURS"]

In [6]:
# For loop to construct a list of sub dataframes, each specific to a city
cities = {}
for city in city_names:
    pattern = f'DATE|MONTH|{city}'
    mask = weather_dataset.columns.str.contains(pattern, regex=True)
    sub_df = weather_dataset.loc[:, mask]
    cities[city] = sub_df

In [7]:
variables = ['temp_mean', 'temp_max', 'temp_min', 'cloud_cover', 'wind_speed', 'wind_gust',
            'humidity', 'pressure', 'global_radiation', 'precipitation', 'sunshine']

In [8]:
# For loop to construct a list of sub dataframes, each specific to a measurement
measurements = {}
for var in variables:
    pattern = f'DATE|MONTH|{var}'
    mask = weather_dataset.columns.str.contains(pattern, regex=True)
    sub_df = weather_dataset.loc[:, mask]
    measurements[var] = sub_df

In [9]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor

In [10]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [11]:
df_basel = cities['BASEL']
print(df_basel.shape)
df_basel.head()

(3654, 11)


Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max
0,20000101,1,8,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9
1,20000102,1,8,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8
2,20000103,1,5,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8
3,20000104,1,7,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5
4,20000105,1,5,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6


### Remove city/location names from each column and create the target column

In [12]:
for city in city_names:
    df_city = cities[city]
    cols = df_city.columns

    # Rename the columns without the city name
    df_city.columns = ['DATE','MONTH'] + [col.replace(f"{city}_", "") for col in df_city.columns if col not in ['DATE','MONTH']]

    # Create the target column
    df_city['target'] = df_city['temp_mean'].shift(-1)
    df_city.dropna(inplace=True)

    # Create the city column and update the city dataframe dict
    df_city.insert(loc=0, column='CITY', value=city)
    cities[city] = df_city

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city['target'] = df_city['temp_mean'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city['target'] = df_city['temp_mean'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/

In [13]:
cities['DE_BILT'].tail()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,wind_speed,wind_gust,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target
3648,DE_BILT,20091227,12,7,5.3,14.0,0.86,0.999,0.21,0.39,1.4,4.2,2.6,5.1,2.6
3649,DE_BILT,20091228,12,4,2.3,7.0,0.87,1.0044,0.4,0.0,5.2,2.6,-2.5,7.3,0.6
3650,DE_BILT,20091229,12,6,4.3,9.0,0.91,0.9992,0.15,0.55,0.4,0.6,-1.4,1.7,0.3
3651,DE_BILT,20091230,12,8,4.1,9.0,0.94,0.9939,0.11,0.17,0.0,0.3,-0.3,0.9,0.0
3652,DE_BILT,20091231,12,8,4.9,11.0,0.86,0.9971,0.11,0.0,0.0,0.0,-0.5,0.5,-1.6


In [14]:
cities['HEATHROW'].head()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target
0,HEATHROW,20000101,1,7,0.94,1.0245,0.18,0.0,0.4,7.0,4.9,10.8,7.9
1,HEATHROW,20000102,1,7,0.89,1.0253,0.2,0.02,0.7,7.9,5.0,11.5,9.4
2,HEATHROW,20000103,1,8,0.91,1.0186,0.13,0.6,0.0,9.4,7.2,9.5,7.0
3,HEATHROW,20000104,1,5,0.89,1.0148,0.34,0.02,2.9,7.0,4.4,11.0,6.4
4,HEATHROW,20000105,1,5,0.85,1.0142,0.25,0.08,1.3,6.4,1.9,10.8,8.9


In [15]:
cities['MALMO'].head()

Unnamed: 0,CITY,DATE,MONTH,wind_speed,precipitation,temp_mean,temp_min,temp_max,target
0,MALMO,20000101,1,2.5,0.27,2.9,0.9,3.6,3.7
1,MALMO,20000102,1,3.8,0.0,3.7,1.0,5.4,5.6
2,MALMO,20000103,1,4.3,0.06,5.6,4.0,6.9,4.5
3,MALMO,20000104,1,3.9,0.75,4.5,3.0,6.4,3.8
4,MALMO,20000105,1,3.2,0.03,3.8,2.5,5.5,4.1


### Now combine the city dataframes into a new weather dataset

In [16]:
generalized_weather_df = pd.concat(cities, ignore_index=True)
generalized_weather_df.head()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,BASEL,20000101,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,BASEL,20000102,1,8.0,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8,2.2,,
2,BASEL,20000103,1,5.0,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8,3.9,,
3,BASEL,20000104,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
4,BASEL,20000105,1,5.0,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,


In [17]:
print(generalized_weather_df.shape)

(65754, 15)


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [19]:
#from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
#from sklearn.dummy import DummyRegressor
#from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
#from sklearn.pipeline import make_pipeline
#from sklearn.impute import SimpleImputer
#from sklearn.feature_selection import SelectKBest, f_regression

In [20]:
generalized_weather_df['DATE'] = pd.to_datetime(generalized_weather_df.DATE, format="%Y%m%d")
generalized_weather_df.head()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,BASEL,2000-01-01,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,BASEL,2000-01-02,1,8.0,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8,2.2,,
2,BASEL,2000-01-03,1,5.0,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8,3.9,,
3,BASEL,2000-01-04,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
4,BASEL,2000-01-05,1,5.0,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,


In [21]:
generalized_weather_df.tail()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
65749,TOURS,2009-12-27,12,,0.84,1.0091,0.58,0.08,,4.6,-0.5,9.7,6.2,4.6,
65750,TOURS,2009-12-28,12,,0.95,1.0011,0.22,1.5,,6.2,1.8,10.6,10.4,3.7,
65751,TOURS,2009-12-29,12,,0.89,0.9966,0.24,0.4,,10.4,6.2,14.5,10.0,5.3,
65752,TOURS,2009-12-30,12,,0.88,0.9939,0.24,1.0,,10.0,8.7,11.3,8.5,3.8,
65753,TOURS,2009-12-31,12,,0.88,0.9933,0.58,0.02,,8.5,6.2,10.9,0.5,4.2,


In [22]:
generalized_weather_df = generalized_weather_df.set_index('DATE').sort_index()

In [23]:
#Train test split
#X_train, X_test, y_train, y_test = train_test_split(generalized_weather_df.drop(columns='target'), 
#                                                   generalized_weather_df.target, test_size=0.2)
train = generalized_weather_df.loc['2000-01-01':'2007-12-31']
test = generalized_weather_df.loc['2008-01-01':'2009-12-31']

In [24]:
test

Unnamed: 0_level_0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2008-01-01,BUDAPEST,1,8.0,0.79,1.0250,0.18,0.50,0.0,-2.1,,-0.6,0.4,,
2008-01-01,PERPIGNAN,1,,0.81,1.0198,0.90,0.00,,3.0,-2.3,8.3,5.0,1.8,
2008-01-01,MONTELIMAR,1,,0.74,1.0219,0.78,0.00,,3.4,1.3,5.5,4.4,4.8,
2008-01-01,OSLO,1,6.0,0.75,1.0349,0.04,0.11,0.0,-0.7,-2.9,0.4,0.4,5.0,10.6
2008-01-01,MUENCHEN,1,8.0,0.90,1.0273,0.20,0.00,0.0,-2.3,-3.5,-0.8,-3.0,1.7,6.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009-12-31,OSLO,12,5.0,0.88,1.0062,0.11,0.02,1.7,-10.8,-16.8,-8.9,-7.7,0.6,2.4
2009-12-31,PERPIGNAN,12,,0.90,0.9951,0.35,0.06,,8.8,5.3,12.2,11.0,2.2,
2009-12-31,ROMA,12,6.0,0.76,1.0009,0.62,,4.3,14.8,12.0,17.6,16.0,,
2009-12-31,LJUBLJANA,12,6.0,0.95,0.9976,0.12,1.38,0.0,7.8,6.8,9.0,6.9,0.8,


In [25]:
train = train.sort_values(['CITY', 'DATE'])
test = test.sort_values(['CITY', 'DATE'])

In [26]:
# The splitting between the data X and the labels y will happen later
train

Unnamed: 0_level_0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2000-01-01,BASEL,1,8.0,0.89,1.0286,0.20,0.03,0.0,2.9,1.6,3.9,3.6,,
2000-01-02,BASEL,1,8.0,0.87,1.0318,0.25,0.00,0.0,3.6,2.7,4.8,2.2,,
2000-01-03,BASEL,1,5.0,0.81,1.0314,0.50,0.00,3.7,2.2,0.1,4.8,3.9,,
2000-01-04,BASEL,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
2000-01-05,BASEL,1,5.0,0.90,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007-12-27,TOURS,12,,0.97,1.0334,0.20,0.00,,6.2,3.7,8.8,5.2,1.6,
2007-12-28,TOURS,12,,0.97,1.0286,0.21,0.02,,5.2,3.5,6.8,6.1,3.2,
2007-12-29,TOURS,12,,0.92,1.0227,0.18,0.24,,6.1,2.9,9.3,2.8,4.3,
2007-12-30,TOURS,12,,0.97,1.0297,0.25,0.00,,2.8,1.1,4.5,4.2,1.5,


In [30]:
#tscv = TimeSeriesSplit(n_splits=5)
#for train_ind, test_ind in tscv.split(generalized_weather_df):
#    train, test = generalized_weather_df.iloc[train_ind], generalized_weather_df.iloc[test_ind]

In [27]:
train.shape, test.shape

((52596, 14), (13158, 14))

### Construction of modeling functions

Begin constructing the functions that will streamline the model building and evaluating phase of the project. Begin by first testing the functions on a sub dataset consisting of only data points from the Basel location.

In [28]:
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [29]:
def select_city(train, test, location):
    #Given a location, split the data into X and y
    train = train.loc[train['CITY'] == location]
    test = test.loc[test['CITY'] == location]

    X_train = train.drop(columns='target')
    y_train = train.target
    X_test = test.drop(columns='target')
    y_test = test.target

    return (X_train, y_train, X_test, y_test)

In [30]:
basel_X_train, basel_y_train, basel_X_test, basel_y_test = select_city(train, test, 'BASEL')

In [31]:
basel_X_train

Unnamed: 0_level_0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,wind_speed,wind_gust
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2000-01-01,BASEL,1,8.0,0.89,1.0286,0.20,0.03,0.0,2.9,1.6,3.9,,
2000-01-02,BASEL,1,8.0,0.87,1.0318,0.25,0.00,0.0,3.6,2.7,4.8,,
2000-01-03,BASEL,1,5.0,0.81,1.0314,0.50,0.00,3.7,2.2,0.1,4.8,,
2000-01-04,BASEL,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,,
2000-01-05,BASEL,1,5.0,0.90,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007-12-27,BASEL,12,7.0,0.89,1.0370,0.41,0.00,1.5,0.4,-0.6,2.3,,
2007-12-28,BASEL,12,5.0,0.81,1.0336,0.53,0.00,5.2,-0.9,-2.8,1.4,,
2007-12-29,BASEL,12,4.0,0.80,1.0266,0.58,0.47,5.8,0.0,-3.8,3.5,,
2007-12-30,BASEL,12,6.0,0.85,1.0275,0.36,0.00,1.5,3.8,1.3,6.2,,


In [32]:
def select_model(model_type, params=None, y_train=None, y_test=None):
    if model_type == 'Linear Regression':
        model = make_pipeline(
            SimpleImputer(strategy=params['strategy']), 
            StandardScaler(),
            LinearRegression()
        )
    elif model_type == 'Random Forest':
        model = make_pipeline(
            SimpleImputer(strategy=params['strategy'], fill_value=params['fill_value']),
            StandardScaler(),
            RandomForestRegressor(random_state=params['random_state'])
        )
    elif model_type == 'XGBoost':
        model = xgb.XGBRegressor(**params)
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.fit_transform(y_test)
    elif model_type == 'CatBoost':
        model = CatBoostRegressor(
            iterations=params['iterations'],      
            learning_rate=params['learning_rate'],   
            depth=params['depth'],              
            verbose=0
        )
    else:
        print("Please input an available model type: [Linear Regression, Random Forest, XGBoost, CatBoost]")
        model = DummyRegressor(strategy='mean')
    return model, y_train, y_test

In [33]:
params = {'strategy': 'median'}
pipe_lr_1, le_train, le_test = select_model('Linear Regression', params)

In [34]:
names_list = ['CITY', 'MONTH']
names_train = basel_X_train[names_list]
names_test = basel_X_test[names_list]
basel_X_train.drop(columns=names_list, inplace=True)
basel_X_test.drop(columns=names_list, inplace=True)
basel_X_train.shape, basel_X_test.shape

((2922, 11), (731, 11))

In [35]:
pipe_lr_1.fit(basel_X_train, basel_y_train)



In [36]:
y_train_pred = pipe_lr_1.predict(basel_X_train)
y_test_pred = pipe_lr_1.predict(basel_X_test)



In [37]:
r2 = r2_score(basel_y_train, y_train_pred), r2_score(basel_y_test, y_test_pred)
RMSE = mean_squared_error(basel_y_train, y_train_pred, squared=False), mean_squared_error(basel_y_test, y_test_pred, squared=False)
print("R-squared for linear regression with median imputation:", r2)
print("RMSE for linear regression with median imputation:", RMSE)

R-squared for linear regression with median imputation: (0.9220064687651719, 0.9261368632729735)
RMSE for linear regression with median imputation: (np.float64(2.074541060619559), np.float64(1.9986195744473763))




In [41]:
cv_scores_basel = cross_validate(pipe_lr_1, basel_X_train, basel_y_train, cv=5)



In [42]:
print(cv_scores_basel['test_score'])

[0.90305075 0.90525375 0.92420296 0.92383776 0.92284765]


In [38]:
# Test with invalid input, should return a dummy regressor as default
dummy, le_train, le_test = select_model('Logistic Regression', params)

Please input an available model type: [Linear Regression, Random Forest, XGBoost, CatBoost]


In [39]:
dummy.fit(basel_X_train, basel_y_train)
print(dummy.constant_)

[[11.0343258]]


In [40]:
dummy_y_pred = dummy.predict(basel_X_test)
print("Dummy R-squared:", r2_score(basel_y_test, dummy_y_pred))
print("Dummy RMSE:", mean_squared_error(basel_y_test, dummy_y_pred, squared=False))

Dummy R-squared: -3.9984375605373046e-05
Dummy RMSE: 7.35402235135091




In [43]:
def make_prediction(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Evaluate the model with performance metrics
    r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
    RMSE = mean_squared_error(y_train, y_train_pred, squared=False), mean_squared_error(y_test, y_test_pred, squared=False)

    return r2, RMSE

In [73]:
def get_metrics(cv_score_list, r2_metric, RMSE_metric):
    cv_test_scores = []
    r2_train_scores = []
    r2_test_scores = []
    RMSE_train_scores = []
    RMSE_test_scores = []
    for score in cv_score_list:
        for cv in score['test_score']:
            cv_test_scores.append(cv)
    for r2 in r2_metric:
        r2_train_scores.append(r2[0])
        r2_test_scores.append(r2[1])
    for rmse in RMSE_metric:
        RMSE_train_scores.append(rmse[0])
        RMSE_test_scores.append(rmse[1])
    return cv_test_scores, r2_train_scores, r2_test_scores, RMSE_train_scores, RMSE_test_scores

In [84]:
def calculate_metrics(cv_test_scores, r2_train_scores, r2_test_scores, RMSE_train_scores, RMSE_test_scores):
    cv_mean = np.mean(cv_test_scores)
    cv_std = np.std(cv_test_scores)
    print("Mean and standard deviation of cross validations:", cv_mean, cv_std)

    r2_train = np.mean(r2_train_scores), np.std(r2_train_scores)
    print("Mean and standard deviation of R-squared for training set:", r2_train)
    r2_test = np.mean(r2_test_scores), np.std(r2_test_scores) 
    print("Mean and standard deviation of R-squared for testing set:", r2_test)

    rmse_train = np.mean(RMSE_train_scores), np.std(RMSE_train_scores)
    print("Mean and standard deviation of RMSE for training set:", rmse_train)
    rmse_test = np.mean(RMSE_test_scores), np.std(RMSE_test_scores)
    print("Mean and standard deviation of RMSE for testing set:", rmse_test)

### Linear Regression

In [113]:
# Create the for loop to evaluate all the cities on linear regression with mean imputation
lr1_cv_score_list = []
lr1_r2_metric = []
lr1_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {'strategy': 'mean'}
    pipe_lr_1, le_train, le_test = select_model('Linear Regression', params)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(pipe_lr_1, X_train, y_train, cv=5)
    lr1_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(pipe_lr_1, X_train, y_train, X_test, y_test)
    lr1_r2_metric.append(r2)
    lr1_RMSE_metric.append(RMSE)

 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation

In [81]:
lr1_cv_test_scores, lr1_r2_train_scores, lr1_r2_test_scores, lr1_RMSE_train_scores, lr1_RMSE_test_scores = get_metrics(lr1_cv_score_list, lr1_r2_metric, lr1_RMSE_metric)
print("Accuracy metrics for linear regression with mean imputation")
calculate_metrics(lr1_cv_test_scores, lr1_r2_train_scores, lr1_r2_test_scores, lr1_RMSE_train_scores, lr1_RMSE_test_scores)

Accuracy metrics for linear regression with mean imputation
Mean and standard deviation of cross validations: 0.9158015927997724 0.021068883721244425
Mean and standard deviation of R-squared for training set: (np.float64(0.9211639009810914), np.float64(0.016729091130693763))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9198305464459859), np.float64(0.016262445864020293))
Mean and standard deviation of RMSE for training set: (np.float64(2.0048044649017043), np.float64(0.23688036778730057))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9758151370777943), np.float64(0.1934826116783981))


In [82]:
# Linear regression but now with median imputation
lr2_cv_score_list = []
lr2_r2_metric = []
lr2_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {'strategy': 'median'}
    pipe_lr_2, le_train, le_test = select_model('Linear Regression', params)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(pipe_lr_2, X_train, y_train, cv=5)
    lr2_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(pipe_lr_2, X_train, y_train, X_test, y_test)
    lr2_r2_metric.append(r2)
    lr2_RMSE_metric.append(RMSE)

 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is n

In [85]:
lr2_cv_test_scores, lr2_r2_train_scores, lr2_r2_test_scores, lr2_RMSE_train_scores, lr2_RMSE_test_scores = get_metrics(lr2_cv_score_list, lr2_r2_metric, lr2_RMSE_metric)
print("Accuracy metrics for linear regression with median imputation")
calculate_metrics(lr2_cv_test_scores, lr2_r2_train_scores, lr2_r2_test_scores, lr2_RMSE_train_scores, lr2_RMSE_test_scores)

Accuracy metrics for linear regression with median imputation
Mean and standard deviation of cross validations: 0.9158015927997724 0.021068883721244425
Mean and standard deviation of R-squared for training set: (np.float64(0.9211639009810914), np.float64(0.016729091130693763))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9198305464459859), np.float64(0.016262445864020293))
Mean and standard deviation of RMSE for training set: (np.float64(2.0048044649017043), np.float64(0.23688036778730057))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9758151370777943), np.float64(0.1934826116783981))


In [None]:
# So far seems like in linear regression, there is no significant difference between mean and median imputation

### Random Forest Model

In [87]:
#Random forest with mean imputation
rf1_cv_score_list = []
rf1_r2_metric = []
rf1_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {'strategy': 'mean', 'random_state': 5, 'fill_value': None}
    pipe_rf_1, le_train, le_test = select_model('Random Forest', params)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(pipe_rf_1, X_train, y_train, cv=5)
    rf1_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(pipe_rf_1, X_train, y_train, X_test, y_test)
    rf1_r2_metric.append(r2)
    rf1_RMSE_metric.append(RMSE)

 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='mean'.
 'wind_gust']. At least one non-missing value is needed for imputation

In [88]:
rf1_cv_test_scores, rf1_r2_train_scores, rf1_r2_test_scores, rf1_RMSE_train_scores, rf1_RMSE_test_scores = get_metrics(rf1_cv_score_list, rf1_r2_metric, rf1_RMSE_metric)
print("Accuracy metrics for random forest with mean imputation")
calculate_metrics(rf1_cv_test_scores, rf1_r2_train_scores, rf1_r2_test_scores, rf1_RMSE_train_scores, rf1_RMSE_test_scores)

Accuracy metrics for random forest with mean imputation
Mean and standard deviation of cross validations: 0.9141206934989343 0.020242080933146576
Mean and standard deviation of R-squared for training set: (np.float64(0.9887740587214378), np.float64(0.0022955516249328913))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9200700074625385), np.float64(0.015847961207258864))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565545787697364), np.float64(0.0841998222116411))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9718218791209583), np.float64(0.17673121677939715))


In [89]:
#Random forest with median imputation
rf2_cv_score_list = []
rf2_r2_metric = []
rf2_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {'strategy': 'median', 'random_state': 5, 'fill_value': None}
    pipe_rf_2, le_train, le_test = select_model('Random Forest', params)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(pipe_rf_2, X_train, y_train, cv=5)
    rf2_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(pipe_rf_2, X_train, y_train, X_test, y_test)
    rf2_r2_metric.append(r2)
    rf2_RMSE_metric.append(RMSE)

 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is needed for imputation with strategy='median'.
 'wind_gust']. At least one non-missing value is n

In [90]:
rf2_cv_test_scores, rf2_r2_train_scores, rf2_r2_test_scores, rf2_RMSE_train_scores, rf2_RMSE_test_scores = get_metrics(rf2_cv_score_list, rf2_r2_metric, rf2_RMSE_metric)
print("Accuracy metrics for random forest with median imputation")
calculate_metrics(rf2_cv_test_scores, rf2_r2_train_scores, rf2_r2_test_scores, rf2_RMSE_train_scores, rf2_RMSE_test_scores)

Accuracy metrics for random forest with median imputation
Mean and standard deviation of cross validations: 0.9141206934989343 0.020242080933146576
Mean and standard deviation of R-squared for training set: (np.float64(0.9887740587214378), np.float64(0.0022955516249328913))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9200700074625385), np.float64(0.015847961207258864))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565545787697364), np.float64(0.0841998222116411))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9718218791209583), np.float64(0.17673121677939715))


In [91]:
#Random forest with constant imputation of value -64
rf3_cv_score_list = []
rf3_r2_metric = []
rf3_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {'strategy': 'constant', 'random_state': 5, 'fill_value': -64}
    pipe_rf_3, le_train, le_test = select_model('Random Forest', params)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(pipe_rf_3, X_train, y_train, cv=5)
    rf3_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(pipe_rf_3, X_train, y_train, X_test, y_test)
    rf3_r2_metric.append(r2)
    rf3_RMSE_metric.append(RMSE)



In [92]:
rf3_cv_test_scores, rf3_r2_train_scores, rf3_r2_test_scores, rf3_RMSE_train_scores, rf3_RMSE_test_scores = get_metrics(rf3_cv_score_list, rf3_r2_metric, rf3_RMSE_metric)
print("Accuracy metrics for random forest with constant (-64) imputation")
calculate_metrics(rf3_cv_test_scores, rf3_r2_train_scores, rf3_r2_test_scores, rf3_RMSE_train_scores, rf3_RMSE_test_scores)

Accuracy metrics for random forest with constant (-64) imputation
Mean and standard deviation of cross validations: 0.9141279949176385 0.020291856784996346
Mean and standard deviation of R-squared for training set: (np.float64(0.9887714046018952), np.float64(0.002307404027098382))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9201741763060648), np.float64(0.015865912599947376))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565909708898793), np.float64(0.0845876202686812))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9703695321196437), np.float64(0.1755729657747447))


In [105]:
# For random forest, seems like the best performing model has a constant imputation, though there seems to be some overfitting, the runtime also is the longest

### Extreme Gradient Boosting (XGBoost)

In [95]:
import sys
!{sys.executable} -m pip install xgboost
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\tanks\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip




In [97]:
#Random forest with mean imputation
xgb_cv_score_list = []
xgb_r2_metric = []
xgb_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 3,
        'learning_rate': 0.01,
        'n_estimators': 100,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 5
    }
    xgb_model, le_train, le_test = select_model('XGBoost', params, y_train, y_test)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(xgb_model, X_train, le_train, cv=5)
    xgb_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(xgb_model, X_train, le_train, X_test, le_test)
    xgb_r2_metric.append(r2)
    xgb_RMSE_metric.append(RMSE)



In [98]:
xgb_cv_test_scores, xgb_r2_train_scores, xgb_r2_test_scores, xgb_RMSE_train_scores, xgb_RMSE_test_scores = get_metrics(xgb_cv_score_list, xgb_r2_metric, xgb_RMSE_metric)
print("Accuracy metrics for xgboost against encoded labels")
calculate_metrics(xgb_cv_test_scores, xgb_r2_train_scores, xgb_r2_test_scores, xgb_RMSE_train_scores, xgb_RMSE_test_scores)

Accuracy metrics for xgboost against encoded labels
Mean and standard deviation of cross validations: 0.7652387983269162 0.01967937515746202
Mean and standard deviation of R-squared for training set: (np.float64(0.7852597733338674), np.float64(0.014689667543397017))
Mean and standard deviation of R-squared for testing set: (np.float64(0.4054016735818651), np.float64(0.20549523261102237))
Mean and standard deviation of RMSE for training set: (np.float64(33.14586720931615), np.float64(3.338090538229187))
Mean and standard deviation of RMSE for testing set: (np.float64(49.12073930097171), np.float64(10.68092065345258))


In [99]:
xgb_r2_metric

[(0.7896053791046143, 0.6043912172317505),
 (0.8036642074584961, 0.35514795780181885),
 (0.7676788568496704, 0.6315122842788696),
 (0.7857470512390137, 0.20817607641220093),
 (0.7740885019302368, 0.6906671524047852),
 (0.7983508706092834, 0.7253567576408386),
 (0.7822771668434143, 0.5027811527252197),
 (0.7933459281921387, 0.47702568769454956),
 (0.7732987403869629, 0.5619446039199829),
 (0.794952392578125, 0.29533588886260986),
 (0.7915902137756348, 0.15933740139007568),
 (0.7831767797470093, 0.311668336391449),
 (0.793583333492279, 0.2314102053642273),
 (0.7625179290771484, 0.35739243030548096),
 (0.8138988614082336, 0.6532739400863647),
 (0.755996584892273, 0.19860875606536865),
 (0.7977450489997864, -0.024206876754760742),
 (0.773158073425293, 0.3574071526527405)]

In [None]:
# There appears to be critical overfitting as the test accuracy scores are much worse than the training counterparts
# R-squared value also suffers from serious inconsistencies, anywhere between 0.72 to 0.00

### Categorical Boosting (CatBoost)

In [100]:
!{sys.executable} -m pip install catboost
from catboost import CatBoostRegressor




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\tanks\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [102]:
cat_cv_score_list = []
cat_r2_metric = []
cat_RMSE_metric = []

for city in city_names:
    #X, y split
    X_train, y_train, X_test, y_test = select_city(train, test, city)

    #Set up parameters and select the model
    params = {'iterations': 100, 'learning_rate': 0.1, 'depth': 6}
    cat_model, le_train, le_test = select_model('CatBoost', params)

    #Perform cross validation
    names_list = ['CITY', 'MONTH']
    X_train.drop(columns=names_list, inplace=True)
    X_test.drop(columns=names_list, inplace=True)
    cv_scores = cross_validate(cat_model, X_train, y_train, cv=5)
    cat_cv_score_list.append(cv_scores)

    r2, RMSE = make_prediction(cat_model, X_train, y_train, X_test, y_test)
    cat_r2_metric.append(r2)
    cat_RMSE_metric.append(RMSE)



In [103]:
cat_cv_test_scores, cat_r2_train_scores, cat_r2_test_scores, cat_RMSE_train_scores, cat_RMSE_test_scores = get_metrics(cat_cv_score_list, cat_r2_metric, cat_RMSE_metric)
print("Accuracy metrics for catboost")
calculate_metrics(cat_cv_test_scores, cat_r2_train_scores, cat_r2_test_scores, cat_RMSE_train_scores, cat_RMSE_test_scores)

Accuracy metrics for catboost
Mean and standard deviation of cross validations: 0.9172513800404888 0.019413591700032158
Mean and standard deviation of R-squared for training set: (np.float64(0.9366215449711363), np.float64(0.01337444669457421))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9231878242859488), np.float64(0.014872004283516487))
Mean and standard deviation of RMSE for training set: (np.float64(1.7964137353016925), np.float64(0.20183859445711802))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9335390997356174), np.float64(0.17087140644086615))


In [None]:
# Seems CatBoosting manages to avoid the overfitting problem from XGBoosting, scoreing very high in both R-squared and RMSE values
# An alternative to be considered would be LightGBM

### Model Evaluations Summary

In [104]:
lr2_cv_test_scores, lr2_r2_train_scores, lr2_r2_test_scores, lr2_RMSE_train_scores, lr2_RMSE_test_scores = get_metrics(lr2_cv_score_list, lr2_r2_metric, lr2_RMSE_metric)
print("Accuracy metrics for linear regression with median imputation")
calculate_metrics(lr2_cv_test_scores, lr2_r2_train_scores, lr2_r2_test_scores, lr2_RMSE_train_scores, lr2_RMSE_test_scores)

Accuracy metrics for linear regression with median imputation
Mean and standard deviation of cross validations: 0.9158015927997724 0.021068883721244425
Mean and standard deviation of R-squared for training set: (np.float64(0.9211639009810914), np.float64(0.016729091130693763))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9198305464459859), np.float64(0.016262445864020293))
Mean and standard deviation of RMSE for training set: (np.float64(2.0048044649017043), np.float64(0.23688036778730057))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9758151370777943), np.float64(0.1934826116783981))


In [106]:
rf3_cv_test_scores, rf3_r2_train_scores, rf3_r2_test_scores, rf3_RMSE_train_scores, rf3_RMSE_test_scores = get_metrics(rf3_cv_score_list, rf3_r2_metric, rf3_RMSE_metric)
print("Accuracy metrics for random forest with constant (-64) imputation")
calculate_metrics(rf3_cv_test_scores, rf3_r2_train_scores, rf3_r2_test_scores, rf3_RMSE_train_scores, rf3_RMSE_test_scores)

Accuracy metrics for random forest with constant (-64) imputation
Mean and standard deviation of cross validations: 0.9141279949176385 0.020291856784996346
Mean and standard deviation of R-squared for training set: (np.float64(0.9887714046018952), np.float64(0.002307404027098382))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9201741763060648), np.float64(0.015865912599947376))
Mean and standard deviation of RMSE for training set: (np.float64(0.7565909708898793), np.float64(0.0845876202686812))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9703695321196437), np.float64(0.1755729657747447))


In [107]:
xgb_cv_test_scores, xgb_r2_train_scores, xgb_r2_test_scores, xgb_RMSE_train_scores, xgb_RMSE_test_scores = get_metrics(xgb_cv_score_list, xgb_r2_metric, xgb_RMSE_metric)
print("Accuracy metrics for xgboost against encoded labels")
calculate_metrics(xgb_cv_test_scores, xgb_r2_train_scores, xgb_r2_test_scores, xgb_RMSE_train_scores, xgb_RMSE_test_scores)

Accuracy metrics for xgboost against encoded labels
Mean and standard deviation of cross validations: 0.7652387983269162 0.01967937515746202
Mean and standard deviation of R-squared for training set: (np.float64(0.7852597733338674), np.float64(0.014689667543397017))
Mean and standard deviation of R-squared for testing set: (np.float64(0.4054016735818651), np.float64(0.20549523261102237))
Mean and standard deviation of RMSE for training set: (np.float64(33.14586720931615), np.float64(3.338090538229187))
Mean and standard deviation of RMSE for testing set: (np.float64(49.12073930097171), np.float64(10.68092065345258))


In [108]:
cat_cv_test_scores, cat_r2_train_scores, cat_r2_test_scores, cat_RMSE_train_scores, cat_RMSE_test_scores = get_metrics(cat_cv_score_list, cat_r2_metric, cat_RMSE_metric)
print("Accuracy metrics for catboost")
calculate_metrics(cat_cv_test_scores, cat_r2_train_scores, cat_r2_test_scores, cat_RMSE_train_scores, cat_RMSE_test_scores)

Accuracy metrics for catboost
Mean and standard deviation of cross validations: 0.9172513800404888 0.019413591700032158
Mean and standard deviation of R-squared for training set: (np.float64(0.9366215449711363), np.float64(0.01337444669457421))
Mean and standard deviation of R-squared for testing set: (np.float64(0.9231878242859488), np.float64(0.014872004283516487))
Mean and standard deviation of RMSE for training set: (np.float64(1.7964137353016925), np.float64(0.20183859445711802))
Mean and standard deviation of RMSE for testing set: (np.float64(1.9335390997356174), np.float64(0.17087140644086615))


### Conclusion

Based on the evaluation metrics of R-squared, RMSE and cross validations, the 2 best models were the Random Forest Regressor and the Categorical Boosting Model. These 2 models will proceed to hyperparameter tuning before the final model is selected.

In [109]:
generalized_weather_df.head(30)

Unnamed: 0_level_0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2000-01-01,BASEL,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
2000-01-01,DUSSELDORF,1,8.0,0.92,1.024,0.12,0.22,0.0,4.2,2.5,6.9,6.5,2.5,5.9
2000-01-01,HEATHROW,1,7.0,0.94,1.0245,0.18,0.0,0.4,7.0,4.9,10.8,7.9,,
2000-01-01,KASSEL,1,,0.93,1.0237,0.06,0.13,0.0,3.5,1.5,5.0,2.3,2.5,8.2
2000-01-01,LJUBLJANA,1,6.0,0.83,1.0294,0.57,0.0,5.2,-4.8,-9.1,-1.3,-0.9,0.4,
2000-01-01,DE_BILT,1,7.0,0.97,1.024,0.11,0.1,0.0,6.1,3.5,8.1,7.3,2.5,8.0
2000-01-01,MAASTRICHT,1,8.0,0.98,1.0251,0.06,0.17,0.0,5.6,4.1,6.9,6.2,3.1,7.0
2000-01-01,MALMO,1,,,,,0.27,,2.9,0.9,3.6,3.7,2.5,
2000-01-01,TOURS,1,,0.97,1.0275,0.25,0.04,,8.5,7.2,9.8,7.9,1.6,
2000-01-01,MUENCHEN,1,8.0,0.91,1.0273,0.2,0.2,0.0,1.7,-0.5,2.6,1.9,2.6,9.4


In [121]:
from library.sb_utils import save_file

In [147]:
# save the data to a new csv file
datapath = 'modified_data'
save_file(generalized_weather_df, 'weather_prediction_dataset_finalized.csv', datapath)

Writing file.  "modified_data\weather_prediction_dataset_finalized.csv"


In [116]:
train.reset_index(inplace=True)
test.reset_index(inplace=True)

In [117]:
train

Unnamed: 0,DATE,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,2000-01-01,BASEL,1,8.0,0.89,1.0286,0.20,0.03,0.0,2.9,1.6,3.9,3.6,,
1,2000-01-02,BASEL,1,8.0,0.87,1.0318,0.25,0.00,0.0,3.6,2.7,4.8,2.2,,
2,2000-01-03,BASEL,1,5.0,0.81,1.0314,0.50,0.00,3.7,2.2,0.1,4.8,3.9,,
3,2000-01-04,BASEL,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
4,2000-01-05,BASEL,1,5.0,0.90,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52591,2007-12-27,TOURS,12,,0.97,1.0334,0.20,0.00,,6.2,3.7,8.8,5.2,1.6,
52592,2007-12-28,TOURS,12,,0.97,1.0286,0.21,0.02,,5.2,3.5,6.8,6.1,3.2,
52593,2007-12-29,TOURS,12,,0.92,1.0227,0.18,0.24,,6.1,2.9,9.3,2.8,4.3,
52594,2007-12-30,TOURS,12,,0.97,1.0297,0.25,0.00,,2.8,1.1,4.5,4.2,1.5,


In [118]:
test

Unnamed: 0,DATE,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,2008-01-01,BASEL,1,8.0,0.86,1.0257,0.22,0.00,0.0,-0.5,-2.2,0.9,-1.4,,
1,2008-01-02,BASEL,1,8.0,0.67,1.0181,0.25,0.00,0.0,-1.4,-2.1,-0.8,-0.5,,
2,2008-01-03,BASEL,1,7.0,0.68,1.0076,0.28,0.00,0.1,-0.5,-3.1,3.2,-0.3,,
3,2008-01-04,BASEL,1,8.0,0.80,1.0131,0.14,0.00,0.0,-0.3,-3.6,2.3,6.3,,
4,2008-01-05,BASEL,1,8.0,0.74,1.0126,0.10,1.14,0.0,6.3,1.7,11.6,6.4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13153,2009-12-27,TOURS,12,,0.84,1.0091,0.58,0.08,,4.6,-0.5,9.7,6.2,4.6,
13154,2009-12-28,TOURS,12,,0.95,1.0011,0.22,1.50,,6.2,1.8,10.6,10.4,3.7,
13155,2009-12-29,TOURS,12,,0.89,0.9966,0.24,0.40,,10.4,6.2,14.5,10.0,5.3,
13156,2009-12-30,TOURS,12,,0.88,0.9939,0.24,1.00,,10.0,8.7,11.3,8.5,3.8,


In [123]:
datapath = 'modified_data'
save_file(train, 'weather_prediction_training_dataset.csv', datapath)

Writing file.  "modified_data\weather_prediction_training_dataset.csv"


In [124]:
datapath = 'modified_data'
save_file(test, 'weather_prediction_testing_dataset.csv', datapath)

Writing file.  "modified_data\weather_prediction_testing_dataset.csv"
