# Random Forest

## Data Preparation

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [28]:
# import the prepared data set
dat = pd.read_csv('../../results/df_agg_hourly.csv')

# Check the data
dat.head()

Unnamed: 0,Standort,Date,Time,Datetime,VELO_IN,VELO_OUT,FUSS_IN,FUSS_OUT,Ost,Nord,...,p [hPa],Year,AnzBestWir,bezeichnung,fk_zaehler,id1,richtung_in,richtung_out,korrekturfaktor,geometry
0,20,2023-01-01,00:00,2023-01-01 00:00,0.0,0.0,46.0,31.0,2682689,1247735,...,971.62,2023.0,447082.0,Militärbrücke,U15G3063864,20,Löwenplatz,Langstrasse,0.58,POINT (2682689 1247734.9)
1,20,2023-01-01,01:00,2023-01-01 01:00,0.0,0.0,43.0,94.0,2682689,1247735,...,971.86,2023.0,447082.0,Militärbrücke,U15G3063864,20,Löwenplatz,Langstrasse,0.58,POINT (2682689 1247734.9)
2,20,2023-01-01,02:00,2023-01-01 02:00,0.0,0.0,36.0,27.0,2682689,1247735,...,971.76,2023.0,447082.0,Militärbrücke,U15G3063864,20,Löwenplatz,Langstrasse,0.58,POINT (2682689 1247734.9)
3,20,2023-01-01,03:00,2023-01-01 03:00,0.0,0.0,22.0,27.0,2682689,1247735,...,972.01,2023.0,447082.0,Militärbrücke,U15G3063864,20,Löwenplatz,Langstrasse,0.58,POINT (2682689 1247734.9)
4,20,2023-01-01,04:00,2023-01-01 04:00,0.0,0.0,11.0,33.0,2682689,1247735,...,972.1,2023.0,447082.0,Militärbrücke,U15G3063864,20,Löwenplatz,Langstrasse,0.58,POINT (2682689 1247734.9)


In [29]:
# What are the different variables
dat.columns

Index(['Standort', 'Date', 'Time', 'Datetime', 'VELO_IN', 'VELO_OUT',
       'FUSS_IN', 'FUSS_OUT', 'Ost', 'Nord', 'Hr [%Hr]', 'RainDur [min]',
       'StrGlo [W/m2]', 'T [°C]', 'WD [°]', 'WVs [m/s]', 'WVv [m/s]',
       'p [hPa]', 'Year', 'AnzBestWir', 'bezeichnung', 'fk_zaehler', 'id1',
       'richtung_in', 'richtung_out', 'korrekturfaktor', 'geometry'],
      dtype='object')

### Extract Hours and Months from the Data

In [30]:
# Extract the hours from the time column
dat['hour'] = pd.to_datetime(dat['Time']).dt.hour

# Check the data
dat['hour'].head()

  dat['hour'] = pd.to_datetime(dat['Time']).dt.hour


0    0
1    1
2    2
3    3
4    4
Name: hour, dtype: int32

In [31]:
# Get the month from the Date column
dat['month'] = pd.to_datetime(dat['Date']).dt.month

# Check the data
dat['month'].head()

0    1
1    1
2    1
3    1
4    1
Name: month, dtype: int32

### Total Counts for the Bike and Foot Traffic

In [32]:
# Add up the bycicle and pedestrian counts
dat['bike_tot'] = dat['VELO_IN'] + dat['VELO_OUT']
dat['ped_tot'] = dat['FUSS_IN'] + dat['FUSS_OUT']

### NA handling

In [33]:
# Are there NAs in the data?
print(dat.isna().sum())

Standort              0
Date                  0
Time                  0
Datetime              0
VELO_IN               0
VELO_OUT              0
FUSS_IN               0
FUSS_OUT              0
Ost                   0
Nord                  0
Hr [%Hr]            173
RainDur [min]       137
StrGlo [W/m2]       137
T [°C]              173
WD [°]               90
WVs [m/s]           222
WVv [m/s]            90
p [hPa]             113
Year                 27
AnzBestWir           27
bezeichnung           0
fk_zaehler            0
id1                   0
richtung_in           0
richtung_out       3792
korrekturfaktor       0
geometry              0
hour                  0
month                 0
bike_tot              0
ped_tot               0
dtype: int64


Before dropping the NAs, let's select the columns which we will actually be using for the RF.

In [34]:
# Let's have a look at the data types
dat.dtypes

Standort             int64
Date                object
Time                object
Datetime            object
VELO_IN            float64
VELO_OUT           float64
FUSS_IN            float64
FUSS_OUT           float64
Ost                  int64
Nord                 int64
Hr [%Hr]           float64
RainDur [min]      float64
StrGlo [W/m2]      float64
T [°C]             float64
WD [°]             float64
WVs [m/s]          float64
WVv [m/s]          float64
p [hPa]            float64
Year               float64
AnzBestWir         float64
bezeichnung         object
fk_zaehler          object
id1                  int64
richtung_in         object
richtung_out        object
korrekturfaktor    float64
geometry            object
hour                 int32
month                int32
bike_tot           float64
ped_tot            float64
dtype: object

In [39]:
# Select only numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
dd = dat.select_dtypes(include=numerics)

# Let's have a look at the data types
dd.dtypes

Standort             int64
VELO_IN            float64
VELO_OUT           float64
FUSS_IN            float64
FUSS_OUT           float64
Ost                  int64
Nord                 int64
Hr [%Hr]           float64
RainDur [min]      float64
StrGlo [W/m2]      float64
T [°C]             float64
WD [°]             float64
WVs [m/s]          float64
WVv [m/s]          float64
p [hPa]            float64
Year               float64
AnzBestWir         float64
id1                  int64
korrekturfaktor    float64
hour                 int32
month                int32
bike_tot           float64
ped_tot            float64
dtype: object

In [40]:
# Let's exclude the columns that are not needed anymore
dd = dd.drop(['VELO_IN', 'VELO_OUT', 'FUSS_IN', 'FUSS_OUT'], axis=1)

In [41]:
# Which columns are left?
dd.columns

Index(['Standort', 'Ost', 'Nord', 'Hr [%Hr]', 'RainDur [min]', 'StrGlo [W/m2]',
       'T [°C]', 'WD [°]', 'WVs [m/s]', 'WVv [m/s]', 'p [hPa]', 'Year',
       'AnzBestWir', 'id1', 'korrekturfaktor', 'hour', 'month', 'bike_tot',
       'ped_tot'],
      dtype='object')

In [None]:
# Select essential variables


In [None]:
# Drop NAs


## Model for Bike Traffic

First we have to define the target variable and the features.

In [20]:
# Define the target variable
y = dat['bike_tot']

# Define the features
X = dat.drop(['ped_tot'], axis=1)

# Check data types
X.dtypes

Standort             int64
Ost                  int64
Nord                 int64
Hr [%Hr]           float64
RainDur [min]      float64
StrGlo [W/m2]      float64
T [°C]             float64
WD [°]             float64
WVs [m/s]          float64
WVv [m/s]          float64
p [hPa]            float64
Year               float64
AnzBestWir         float64
id1                  int64
korrekturfaktor    float64
hour                 int32
month                int32
dtype: object

In [21]:
# Split into test and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check the shapes of the data
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(154772, 17) (66331, 17) (154772,) (66331,)


In [22]:
# Fit the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values