# Data Science Capstone 2

## Weather Forecaster

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import os

In [3]:
#Load the datasets
weather_dataset = pd.read_csv("modified_data/weather_prediction_dataset_cleaned.csv")

#This dataset is optional and provides a template showing all the times the weather is appropriate for a picnic
weather_for_picnic = pd.read_csv("raw_data/weather_prediction_bbq_labels.csv")

In [4]:
#Initial details of the dataset
print("Shape of dataset:", weather_dataset.shape)
weather_dataset.head()

Shape of dataset: (3654, 165)


Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,STOCKHOLM_temp_min,STOCKHOLM_temp_max,TOURS_wind_speed,TOURS_humidity,TOURS_pressure,TOURS_global_radiation,TOURS_precipitation,TOURS_temp_mean,TOURS_temp_min,TOURS_temp_max
0,20000101,1,8,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,...,-9.3,0.7,1.6,0.97,1.0275,0.25,0.04,8.5,7.2,9.8
1,20000102,1,8,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,...,0.5,2.0,2.0,0.99,1.0293,0.17,0.16,7.9,6.6,9.2
2,20000103,1,5,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,...,-1.0,2.8,3.4,0.91,1.0267,0.27,0.0,8.1,6.6,9.6
3,20000104,1,7,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,...,2.5,4.6,4.9,0.95,1.0222,0.11,0.44,8.6,6.4,10.8
4,20000105,1,5,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,...,-1.8,2.9,3.6,0.95,1.0209,0.39,0.04,8.0,6.4,9.5


## Features to be used for mean temperature prediction
- MONTH
- temp_mean
- temp_max
- temp_min
- global_radiation
- sunshine
- humidity (negative correlation)

In [5]:
city_names = ["BASEL", "BUDAPEST", "DE_BILT", "DRESDEN", "DUSSELDORF", "HEATHROW", "KASSEL", "LJUBLJANA", "MAASTRICHT",
              "MALMO", "MONTELIMAR", "MUENCHEN", "OSLO", "PERPIGNAN", "ROMA", "SONNBLICK", "STOCKHOLM", "TOURS"]

In [6]:
# For loop to construct a list of sub dataframes, each specific to a city
cities = {}
for city in city_names:
    pattern = f'DATE|MONTH|{city}'
    mask = weather_dataset.columns.str.contains(pattern, regex=True)
    sub_df = weather_dataset.loc[:, mask]
    cities[city] = sub_df

In [7]:
variables = ['temp_mean', 'temp_max', 'temp_min', 'cloud_cover', 'wind_speed', 'wind_gust',
            'humidity', 'pressure', 'global_radiation', 'precipitation', 'sunshine']

In [8]:
# For loop to construct a list of sub dataframes, each specific to a measurement
measurements = {}
for var in variables:
    pattern = f'DATE|MONTH|{var}'
    mask = weather_dataset.columns.str.contains(pattern, regex=True)
    sub_df = weather_dataset.loc[:, mask]
    measurements[var] = sub_df

In [9]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor

In [10]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

In [11]:
df_basel = cities['BASEL']
print(df_basel.shape)
df_basel.head()

(3654, 11)


Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max
0,20000101,1,8,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9
1,20000102,1,8,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8
2,20000103,1,5,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8
3,20000104,1,7,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5
4,20000105,1,5,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6


### Remove city/location names from each column and create the target column

In [12]:
for city in city_names:
    df_city = cities[city]
    cols = df_city.columns

    # Rename the columns without the city name
    df_city.columns = ['DATE','MONTH'] + [col.replace(f"{city}_", "") for col in df_city.columns if col not in ['DATE','MONTH']]

    # Create the target column
    df_city['target'] = df_city['temp_mean'].shift(-1)
    df_city.dropna(inplace=True)

    # Create the city column and update the city dataframe dict
    df_city.insert(loc=0, column='CITY', value=city)
    cities[city] = df_city

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city['target'] = df_city['temp_mean'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_city['target'] = df_city['temp_mean'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/

In [13]:
cities['DE_BILT'].tail()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,wind_speed,wind_gust,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target
3648,DE_BILT,20091227,12,7,5.3,14.0,0.86,0.999,0.21,0.39,1.4,4.2,2.6,5.1,2.6
3649,DE_BILT,20091228,12,4,2.3,7.0,0.87,1.0044,0.4,0.0,5.2,2.6,-2.5,7.3,0.6
3650,DE_BILT,20091229,12,6,4.3,9.0,0.91,0.9992,0.15,0.55,0.4,0.6,-1.4,1.7,0.3
3651,DE_BILT,20091230,12,8,4.1,9.0,0.94,0.9939,0.11,0.17,0.0,0.3,-0.3,0.9,0.0
3652,DE_BILT,20091231,12,8,4.9,11.0,0.86,0.9971,0.11,0.0,0.0,0.0,-0.5,0.5,-1.6


In [14]:
cities['HEATHROW'].head()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target
0,HEATHROW,20000101,1,7,0.94,1.0245,0.18,0.0,0.4,7.0,4.9,10.8,7.9
1,HEATHROW,20000102,1,7,0.89,1.0253,0.2,0.02,0.7,7.9,5.0,11.5,9.4
2,HEATHROW,20000103,1,8,0.91,1.0186,0.13,0.6,0.0,9.4,7.2,9.5,7.0
3,HEATHROW,20000104,1,5,0.89,1.0148,0.34,0.02,2.9,7.0,4.4,11.0,6.4
4,HEATHROW,20000105,1,5,0.85,1.0142,0.25,0.08,1.3,6.4,1.9,10.8,8.9


In [15]:
cities['MALMO'].head()

Unnamed: 0,CITY,DATE,MONTH,wind_speed,precipitation,temp_mean,temp_min,temp_max,target
0,MALMO,20000101,1,2.5,0.27,2.9,0.9,3.6,3.7
1,MALMO,20000102,1,3.8,0.0,3.7,1.0,5.4,5.6
2,MALMO,20000103,1,4.3,0.06,5.6,4.0,6.9,4.5
3,MALMO,20000104,1,3.9,0.75,4.5,3.0,6.4,3.8
4,MALMO,20000105,1,3.2,0.03,3.8,2.5,5.5,4.1


### Now combine the city dataframes into a new weather dataset

In [16]:
generalized_weather_df = pd.concat(cities, ignore_index=True)
generalized_weather_df.head()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,BASEL,20000101,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,BASEL,20000102,1,8.0,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8,2.2,,
2,BASEL,20000103,1,5.0,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8,3.9,,
3,BASEL,20000104,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
4,BASEL,20000105,1,5.0,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,


In [17]:
print(generalized_weather_df.shape)

(65754, 15)


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
#from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
#from sklearn.dummy import DummyRegressor
#from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
#from sklearn.pipeline import make_pipeline
#from sklearn.impute import SimpleImputer
#from sklearn.feature_selection import SelectKBest, f_regression

In [19]:
generalized_weather_df['DATE'] = pd.to_datetime(generalized_weather_df.DATE, format="%Y%m%d")
generalized_weather_df.head()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
0,BASEL,2000-01-01,1,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,3.6,,
1,BASEL,2000-01-02,1,8.0,0.87,1.0318,0.25,0.0,0.0,3.6,2.7,4.8,2.2,,
2,BASEL,2000-01-03,1,5.0,0.81,1.0314,0.5,0.0,3.7,2.2,0.1,4.8,3.9,,
3,BASEL,2000-01-04,1,7.0,0.79,1.0262,0.63,0.35,6.9,3.9,0.5,7.5,6.0,,
4,BASEL,2000-01-05,1,5.0,0.9,1.0246,0.51,0.07,3.7,6.0,3.8,8.6,4.2,,


In [20]:
generalized_weather_df.tail()

Unnamed: 0,CITY,DATE,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
65749,TOURS,2009-12-27,12,,0.84,1.0091,0.58,0.08,,4.6,-0.5,9.7,6.2,4.6,
65750,TOURS,2009-12-28,12,,0.95,1.0011,0.22,1.5,,6.2,1.8,10.6,10.4,3.7,
65751,TOURS,2009-12-29,12,,0.89,0.9966,0.24,0.4,,10.4,6.2,14.5,10.0,5.3,
65752,TOURS,2009-12-30,12,,0.88,0.9939,0.24,1.0,,10.0,8.7,11.3,8.5,3.8,
65753,TOURS,2009-12-31,12,,0.88,0.9933,0.58,0.02,,8.5,6.2,10.9,0.5,4.2,


In [21]:
generalized_weather_df = generalized_weather_df.set_index('DATE').sort_index()

In [22]:
#Train test split
#X_train, X_test, y_train, y_test = train_test_split(generalized_weather_df.drop(columns='target'), 
#                                                   generalized_weather_df.target, test_size=0.2)
train = generalized_weather_df.loc['2000-01-01':'2007-12-31']
test = generalized_weather_df.loc['2008-01-01':'2009-12-31']
X_train = train.drop(columns='target')
y_train = train.target
X_test = test.drop(columns='target')
y_test = test.target

In [26]:
y_test

DATE
2008-01-01     0.4
2008-01-01     5.0
2008-01-01     4.4
2008-01-01     0.4
2008-01-01    -3.0
              ... 
2009-12-31    -7.7
2009-12-31    11.0
2009-12-31    16.0
2009-12-31     6.9
2009-12-31     0.5
Name: target, Length: 13158, dtype: float64

In [28]:
tscv = TimeSeriesSplit(n_splits=5)
for train_ind, test_ind in tscv.split(generalized_weather_df):
    train, test = generalized_weather_df.iloc[train_ind], generalized_weather_df.iloc[test_ind]

In [31]:
train

Unnamed: 0_level_0,CITY,MONTH,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,target,wind_speed,wind_gust
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2000-01-01,BASEL,1,8.0,0.89,1.0286,0.20,0.03,0.0,2.9,1.6,3.9,3.6,,
2000-01-01,DUSSELDORF,1,8.0,0.92,1.0240,0.12,0.22,0.0,4.2,2.5,6.9,6.5,2.5,5.9
2000-01-01,HEATHROW,1,7.0,0.94,1.0245,0.18,0.00,0.4,7.0,4.9,10.8,7.9,,
2000-01-01,KASSEL,1,,0.93,1.0237,0.06,0.13,0.0,3.5,1.5,5.0,2.3,2.5,8.2
2000-01-01,LJUBLJANA,1,6.0,0.83,1.0294,0.57,0.00,5.2,-4.8,-9.1,-1.3,-0.9,0.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008-05-01,HEATHROW,5,6.0,0.79,1.0065,1.75,0.40,4.6,11.0,6.9,15.2,10.2,,
2008-05-01,STOCKHOLM,5,8.0,,1.0169,,0.00,5.6,11.2,8.4,15.2,10.1,,
2008-05-02,OSLO,5,8.0,0.81,1.0182,1.24,0.60,1.8,10.7,9.0,14.1,11.9,1.4,6.9
2008-05-02,DUSSELDORF,5,5.0,0.78,1.0224,1.50,0.48,3.5,10.8,6.1,16.5,13.0,2.3,6.6


In [32]:
X_train.shape, X_test.shape

((52596, 13), (13158, 13))

In [33]:
y_train.shape, y_test.shape

((52596,), (13158,))

In [34]:
names_list = ['CITY', 'MONTH']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((52596, 11), (13158, 11))

In [35]:
X_train.dtypes

cloud_cover         float64
humidity            float64
pressure            float64
global_radiation    float64
precipitation       float64
sunshine            float64
temp_mean           float64
temp_min            float64
temp_max            float64
wind_speed          float64
wind_gust           float64
dtype: object

In [36]:
X_train_mean = X_train.mean()
X_train_mean

cloud_cover          5.126415
humidity             0.750675
pressure             1.016122
global_radiation     1.366542
precipitation        0.229776
sunshine             5.027946
temp_mean           10.390655
temp_min             6.350199
temp_max            14.498023
wind_speed           3.353354
wind_gust           10.113371
dtype: float64

In [37]:
X_train_median = X_train.median()
X_train_median

cloud_cover          6.0000
humidity             0.7700
pressure             1.0164
global_radiation     1.1800
precipitation        0.0000
sunshine             4.3000
temp_mean           10.9000
temp_min             6.9000
temp_max            15.0000
wind_speed           2.9000
wind_gust            9.5000
dtype: float64

In [38]:
X_train.head()

Unnamed: 0_level_0,cloud_cover,humidity,pressure,global_radiation,precipitation,sunshine,temp_mean,temp_min,temp_max,wind_speed,wind_gust
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-01-01,8.0,0.89,1.0286,0.2,0.03,0.0,2.9,1.6,3.9,,
2000-01-01,8.0,0.92,1.024,0.12,0.22,0.0,4.2,2.5,6.9,2.5,5.9
2000-01-01,7.0,0.94,1.0245,0.18,0.0,0.4,7.0,4.9,10.8,,
2000-01-01,,0.93,1.0237,0.06,0.13,0.0,3.5,1.5,5.0,2.5,8.2
2000-01-01,6.0,0.83,1.0294,0.57,0.0,5.2,-4.8,-9.1,-1.3,0.4,


In [40]:
# Dummy Regressor
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
print(dummy.constant_)

[[10.39006198]]


In [41]:
y_pred = dummy.predict(X_test)
print("Dummy RMSE:", mean_squared_error(y_test, y_pred, squared=False))

Dummy RMSE: 8.298337311373482




In [42]:
#Impute missing values in the training and testing dataset via the training mean, training median, and an arbitrary value
X_train_fill_mean = X_train.fillna(X_train_mean)
X_test_fill_mean = X_test.fillna(X_train_mean)

X_train_fill_median = X_train.fillna(X_train_median)
X_test_fill_median = X_test.fillna(X_train_median)

X_train_fill_arbitrary = X_train.fillna(-64)
X_test_fill_arbitrary = X_test.fillna(-64)

### Scaling the data

In [43]:
# Begin with MinMaxScaler
# Apply it to the mean and median values
mm_scaler = MinMaxScaler()
mm_scaler.fit(X_train_fill_mean)
X_train_mean_scaled = mm_scaler.transform(X_train_fill_mean)
X_test_mean_scaled = mm_scaler.transform(X_test_fill_mean)

mm_scaler_2 = MinMaxScaler()
mm_scaler_2.fit(X_train_fill_median)
X_train_median_scaled = mm_scaler_2.transform(X_train_fill_median)
X_test_median_scaled = mm_scaler_2.transform(X_test_fill_median)

In [44]:
lm_1 = LinearRegression().fit(X_train_mean_scaled, y_train)
lm_2 = LinearRegression().fit(X_train_median_scaled, y_train)

In [45]:
y_train_mean_pred = lm_1.predict(X_train_mean_scaled)
y_test_mean_pred = lm_1.predict(X_test_mean_scaled)

y_train_median_pred = lm_2.predict(X_train_median_scaled)
y_test_median_pred = lm_2.predict(X_test_median_scaled)

In [46]:
r2 = r2_score(y_train, y_train_mean_pred), r2_score(y_test, y_test_mean_pred)
RMSE = mean_squared_error(y_train, y_train_mean_pred, squared=False), mean_squared_error(y_test, y_test_mean_pred, squared=False)
print("R-squared for linear regression with mean imputation:", r2)
print("RMSE for linear regression with mean imputation:", RMSE)

R-squared for linear regression with mean imputation: (0.9379320302544223, 0.938249747482214)
RMSE for linear regression with mean imputation: (np.float64(2.101198651730178), np.float64(2.0621034686427038))




In [None]:
# R-squared for linear regression with mean imputation: (0.9377911716578992, 0.9388541264454779)
# RMSE for linear regression with mean imputation: (np.float64(2.0945957861250304), np.float64(2.0877860072521153))

In [47]:
r2 = r2_score(y_train, y_train_median_pred), r2_score(y_test, y_test_median_pred)
RMSE = mean_squared_error(y_train, y_train_median_pred, squared=False), mean_squared_error(y_test, y_test_median_pred, squared=False)
print("R-squared for linear regression with median imputation:", r2)
print("RMSE for linear regression with median imputation:", RMSE)

R-squared for linear regression with median imputation: (0.937954475710293, 0.9383191009161178)
RMSE for linear regression with median imputation: (np.float64(2.1008186922569285), np.float64(2.0609451403032146))




In [48]:
# Lets now try the same with StandardScaler instead, and also with a pipeline
# If there seems to be overfitting, we will use SelectKBest with k=5
pipe_lr_1 = make_pipeline(
    SimpleImputer(strategy='mean'), 
    StandardScaler(),
    #SelectKBest(f_regression),
    LinearRegression()
)

pipe_lr_2 = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    #SelectKBest(f_regression),
    LinearRegression()
)

In [49]:
pipe_lr_1.fit(X_train, y_train)

In [50]:
pipe_lr_2.fit(X_train, y_train)

In [51]:
y_train_mean_pred = pipe_lr_1.predict(X_train)
y_test_mean_pred = pipe_lr_1.predict(X_test)

In [52]:
y_train_median_pred = pipe_lr_2.predict(X_train)
y_test_median_pred = pipe_lr_2.predict(X_test)

In [55]:
r2 = r2_score(y_train, y_train_mean_pred), r2_score(y_test, y_test_mean_pred)
RMSE = mean_squared_error(y_train, y_train_mean_pred, squared=False), mean_squared_error(y_test, y_test_mean_pred, squared=False)
print("R-squared for linear regression with mean imputation:", r2)
print("RMSE for linear regression with mean imputation:", RMSE)

R-squared for linear regression with mean imputation: (0.9379320302544223, 0.938249747482214)
RMSE for linear regression with mean imputation: (np.float64(2.101198651730178), np.float64(2.0621034686427033))




In [54]:
r2 = r2_score(y_train, y_train_median_pred), r2_score(y_test, y_test_median_pred)
RMSE = mean_squared_error(y_train, y_train_median_pred, squared=False), mean_squared_error(y_test, y_test_median_pred, squared=False)
print("R-squared for linear regression with median imputation:", r2)
print("RMSE for linear regression with median imputation:", RMSE)

R-squared for linear regression with median imputation: (0.937954475710293, 0.9383191009161178)
RMSE for linear regression with median imputation: (np.float64(2.1008186922569285), np.float64(2.0609451403032146))




In [57]:
cv_scores_1 = cross_validate(pipe_lr_1, X_train, y_train, cv=5)
cv_scores_2 = cross_validate(pipe_lr_2, X_train, y_train, cv=5)

In [58]:
print(cv_scores_1['test_score'])
print(cv_scores_2['test_score'])

[0.93244214 0.93504656 0.93922795 0.93654076 0.93495466]
[0.93242279 0.9350811  0.9392639  0.93657097 0.93495836]


In [None]:
# (0.9377911716578992, 0.9388541264454779), (2.0945957861250304, 2.0877860072521153)
# (0.937954475710293, 0.9383191009161178), (2.1008186922569285, 2.0609451403032146)
# (0.9379320302544223, 0.938249747482214), (2.101198651730178, 2.0621034686427033)
# (0.937954475710293, 0.9383191009161178), (2.1008186922569285, 2.0609451403032146)
# So far seems like in linear regression, best model is StandardScaler with mean imputation

### Random Forest Model

In [56]:
RF_pipe_1 = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    RandomForestRegressor(random_state=5)
)

RF_pipe_2 = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    RandomForestRegressor(random_state=5)
)

RF_pipe_3 = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-64),
    StandardScaler(),
    RandomForestRegressor(random_state=5)
)

In [59]:
cv_scores_rf_1 = cross_validate(RF_pipe_1, X_train, y_train, cv=5)
cv_scores_rf_2 = cross_validate(RF_pipe_2, X_train, y_train, cv=5)
cv_scores_rf_3 = cross_validate(RF_pipe_3, X_train, y_train, cv=5)

In [60]:
print(cv_scores_rf_1['test_score'])
print(cv_scores_rf_2['test_score'])
print(cv_scores_rf_3['test_score'])

[0.93618561 0.93837974 0.94220633 0.93964348 0.93734552]
[0.93618561 0.93837974 0.94220633 0.93964348 0.93734552]
[0.93636888 0.9383451  0.94257789 0.93984348 0.93687426]


In [61]:
RF_pipe_1.fit(X_train, y_train)

In [62]:
RF_pipe_2.fit(X_train, y_train)

In [63]:
RF_pipe_3.fit(X_train, y_train)

In [64]:
y_train_RF_pred_1 = RF_pipe_1.predict(X_train)
y_test_RF_pred_1 = RF_pipe_1.predict(X_test)

In [65]:
y_train_RF_pred_2 = RF_pipe_2.predict(X_train)
y_test_RF_pred_2 = RF_pipe_2.predict(X_test)

In [66]:
y_train_RF_pred_3 = RF_pipe_3.predict(X_train)
y_test_RF_pred_3 = RF_pipe_3.predict(X_test)

In [67]:
r2 = r2_score(y_train, y_train_RF_pred_1), r2_score(y_test, y_test_RF_pred_1)
RMSE = mean_squared_error(y_train, y_train_RF_pred_1, squared=False), mean_squared_error(y_test, y_test_RF_pred_1, squared=False)
print("R-squared for random forest model with mean imputation:", r2)
print("RMSE for random forest model with mean imputation:", RMSE)

R-squared for random forest model with mean imputation: (0.9919946576126288, 0.9427285917216491)
RMSE for random forest model with mean imputation: (np.float64(0.7546112912425765), np.float64(1.9859120682109916))




In [68]:
r2 = r2_score(y_train, y_train_RF_pred_2), r2_score(y_test, y_test_RF_pred_2)
RMSE = mean_squared_error(y_train, y_train_RF_pred_2, squared=False), mean_squared_error(y_test, y_test_RF_pred_2, squared=False)
print("R-squared for random forest model with median imputation:", r2)
print("RMSE for random forest model with median imputation:", RMSE)

R-squared for random forest model with median imputation: (0.9919946576126288, 0.9427285917216491)
RMSE for random forest model with median imputation: (np.float64(0.7546112912425765), np.float64(1.9859120682109916))




In [69]:
r2 = r2_score(y_train, y_train_RF_pred_3), r2_score(y_test, y_test_RF_pred_3)
RMSE = mean_squared_error(y_train, y_train_RF_pred_3, squared=False), mean_squared_error(y_test, y_test_RF_pred_3, squared=False)
print("R-squared for random forest model with constant (-64) imputation:", r2)
print("RMSE for random forest model with mean constant (-64) imputation:", RMSE)

R-squared for random forest model with constant (-64) imputation: (0.9920197156471836, 0.9428392851025436)
RMSE for random forest model with mean constant (-64) imputation: (np.float64(0.7534293370408122), np.float64(1.9839919683725928))




In [None]:
# For random forest, seems like the best performing model has a constant imputation, though there seems to be some overfitting

### Extreme Gradient Boosting (XGBoost)

In [76]:
import sys
!{sys.executable} -m pip install xgboost
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\tanks\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip





In [122]:
params = {
    'objective': 'reg:squarederror',
    'max_depth': 3,
    'learning_rate': 0.01,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 5
}
#xgb_model = xgb.train(params=params,dtrain=xgb_train,num_boost_round=50)
xgb_model = xgb.XGBRegressor(**params)
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)

In [108]:
y_test_le = le.fit_transform(y_test)

In [126]:
xgb_model.fit(X_train, y_train_le)

In [127]:
#y_xgb_pred = xgb_model.predict(xgb_test)
y_train_xgb_pred = xgb_model.predict(X_train)
y_test_xgb_pred = xgb_model.predict(X_test)

In [128]:
r2 = r2_score(y_train_le, y_train_xgb_pred), r2_score(y_test_le, y_test_xgb_pred)
RMSE = mean_squared_error(y_train_le, y_train_xgb_pred, squared=False), mean_squared_error(y_test_le, y_test_xgb_pred, squared=False)
print("R-squared for xgboost against encoded labels:", r2)
print("RMSE for xgboost against encoded labels:", RMSE)

R-squared for xgboost against encoded labels: (0.7977137565612793, 0.4649772047996521)
RMSE for xgboost against encoded labels: (np.float64(37.92201431473759), np.float64(60.49368719036834))




In [None]:
# There appears to be critical overfitting as the test accuracy scores are much worse than the training counterparts
# R-squared value for testing set never exceeded 0.62

### Categorical Boosting (CatBoost)

In [130]:
!{sys.executable} -m pip install catboost
from catboost import CatBoostRegressor




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: C:\Users\tanks\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [131]:
cat_model = CatBoostRegressor(
    iterations=100,      
    learning_rate=0.1,   
    depth=6,              
    verbose=0
)

In [132]:
cat_model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x254cf21cad0>

In [133]:
y_train_cat_pred = cat_model.predict(X_train)
y_test_cat_pred = cat_model.predict(X_test)

In [134]:
r2 = r2_score(y_train, y_train_cat_pred), r2_score(y_test, y_test_cat_pred)
RMSE = mean_squared_error(y_train, y_train_cat_pred, squared=False), mean_squared_error(y_test, y_test_cat_pred, squared=False)
print("R-squared for catboost:", r2)
print("RMSE for catboost:", RMSE)

R-squared for catboost: (0.9428958464779363, 0.9426320989360327)
RMSE for catboost: (np.float64(2.0154275583253476), np.float64(1.9875843296986964))




In [135]:
cv_scores_cat = cross_validate(cat_model, X_train, y_train, cv=5)

In [136]:
print(cv_scores_cat['test_score'])

[0.93648992 0.93871993 0.94303025 0.93927871 0.93796765]


In [None]:
# Seems CatBoosting manages to avoid the overfitting problem from XGBoosting, scoreing very high in both R-squared and RMSE values
# An alternative to be considered would be LightGBM

### Model Evaluations

In [137]:
lr2_neg_mae = cross_validate(pipe_lr_2, X_train, y_train, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
lr2_mae_mean = np.mean(-1 * lr2_neg_mae['test_score'])
lr2_mae_std = np.std(-1 * lr2_neg_mae['test_score'])
mean_absolute_error(y_test, pipe_lr_2.predict(X_test))

np.float64(1.5959120986038628)

In [138]:
rf1_neg_mae = cross_validate(RF_pipe_1, X_train, y_train, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
rf1_mae_mean = np.mean(-1 * rf1_neg_mae['test_score'])
rf1_mae_std = np.std(-1 * rf1_neg_mae['test_score'])
mean_absolute_error(y_test, RF_pipe_1.predict(X_test))

np.float64(1.5307400820793433)

In [140]:
xgb_neg_mae = cross_validate(xgb_model, X_train, y_train_le, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
xgb_mae_mean = np.mean(-1 * lr2_neg_mae['test_score'])
xgb_mae_std = np.std(-1 * lr2_neg_mae['test_score'])
mean_absolute_error(y_test_le, xgb_model.predict(X_test))

np.float64(50.647765284125285)

In [141]:
cat_neg_mae = cross_validate(cat_model, X_train, y_train, 
                            scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
cat_mae_mean = np.mean(-1 * cat_neg_mae['test_score'])
cat_mae_std = np.std(-1 * cat_neg_mae['test_score'])
mean_absolute_error(y_test, cat_model.predict(X_test))

np.float64(1.541301931061991)

### Conclusion

Based on the evaluation metrics of R-squared, RMSE, backed up by cross validation MAE, the 2 best models were the Random Forest Regressor and the Categorical Boosting Model. These 2 models will proceed to hyperparameter tuning before the final model is selected.

In [142]:
from library.sb_utils import save_file

In [147]:
# save the data to a new csv file
datapath = 'modified_data'
save_file(generalized_weather_df, 'weather_prediction_dataset_finalized.csv', datapath)

Writing file.  "modified_data\weather_prediction_dataset_finalized.csv"
