In [2]:
import pandas as pd
from datetime import datetime

### Baseline: add KNMI weather data to the CMSA baseline data

In [25]:
# Load the KNMI export prepared for the baseline; the CSV's 'Unnamed: 0' column is used as the index.
knmi = pd.read_csv("../data/knmi/knmi_for_baseline.csv", index_col='Unnamed: 0')

In [9]:
# Load the CMSA baseline table (the CSV's 'Unnamed: 0' column becomes the index)
# and preview its last rows.
cmsa_baseline = pd.read_csv("../data/cmsa_baseline.csv", index_col="Unnamed: 0")
cmsa_baseline.tail()

Unnamed: 0,datetime,GAWW-11,GAWW-12,GAWW-14,vacation_dummy,holiday_dummy,stringency_index
46843,2022-01-01 22:45:00,48.0,22.0,21.0,1,1,63.89
46844,2022-01-01 23:00:00,28.0,16.0,21.0,1,1,63.89
46845,2022-01-01 23:15:00,27.0,26.0,21.0,1,1,63.89
46846,2022-01-01 23:30:00,29.0,19.0,15.0,1,1,63.89
46847,2022-01-01 23:45:00,25.0,14.0,19.0,1,1,63.89


In [10]:
# Row/column count of the KNMI frame before it is restricted to the date range below.
knmi.shape

(47805, 10)

In [15]:
# Inspect column dtypes; in the recorded output 'datetime' is object (plain strings), not datetime64.
knmi.dtypes

datetime              object
global_radiation       int64
pressure             float64
precipitation_h      float64
relative_humidity      int64
temperature          float64
cloud_cover            int64
sight                  int64
wind_direction         int64
wind_speed           float64
dtype: object

In [26]:
# Inclusive bounds used below to filter the KNMI frames on their 'datetime' column.
# They are kept as strings and compared against the string-typed 'datetime' values,
# so the comparison is lexicographic.
# NOTE(review): that only orders correctly if every value uses this exact
# 'YYYY-MM-DD HH:MM:SS' layout — confirm.
start_date = '2020-09-01 00:00:00'
end_date = '2022-01-01 23:45:00'

In [27]:
# Keep only the rows whose 'datetime' lies between start_date and end_date (both inclusive).
in_range = knmi['datetime'].between(start_date, end_date)
knmi = knmi.loc[in_range]
knmi.shape

(46848, 10)

In [None]:
# Index both frames by 'datetime' so they can be aligned on it.
# Each set_index is skipped when the frame is already indexed by 'datetime', which makes
# the cell safe to re-run (a second unconditional set_index raises KeyError because the
# column has already been moved into the index). If the frame is not yet indexed by
# 'datetime' and the column is missing, set_index still raises as before.
if knmi.index.name != 'datetime':
    knmi = knmi.set_index('datetime')
if cmsa_baseline.index.name != 'datetime':
    cmsa_baseline = cmsa_baseline.set_index('datetime')

In [33]:
# Column-wise concat; rows are aligned on the index of the two frames.
# verify_integrity=True raises if a column name would occur twice, so re-running this
# cell fails loudly instead of silently appending a second copy of the weather columns.
baseline_with_weather = pd.concat([cmsa_baseline, knmi], axis=1, verify_integrity=True)
# concat keeps index values that occur in only one input and pads the other side with
# NaN; equal row counts before and after rule that out.
assert len(baseline_with_weather) == len(cmsa_baseline) == len(knmi), \
    "row indexes of cmsa_baseline and knmi do not line up"
cmsa_baseline = baseline_with_weather

In [35]:
# Check the merged frame's size: the row count should match the filtered KNMI frame.
cmsa_baseline.shape

(46848, 15)

In [36]:
# Write the merged frame to disk; its index is written as the first CSV column.
# NOTE(review): this is the same path the notebook loads cmsa_baseline from using
# index_col="Unnamed: 0". Once overwritten, the file's first column is the index written
# here, so a fresh Restart & Run All would presumably fail (or load a different layout)
# at that read — consider writing to a separate output file.
cmsa_baseline.to_csv("../data/cmsa_baseline.csv")

### Our model: add KNMI weather data to the combined crowd data

In [39]:
# Load the KNMI export prepared for our model; the CSV's 'Unnamed: 0' column is used as the index.
knmi2 = pd.read_csv("../data/knmi/knmi_for_our_model.csv", index_col='Unnamed: 0')

In [40]:
# List the columns available in this KNMI export.
knmi2.columns

Index(['datetime', 'wind_direction', 'wind_speed', 'wind_speed_10m',
       'wind_gust', 'temperature', 'temperature_min', 'dew_point_temperature',
       'radiation_duration', 'global_radiation', 'precipitation_duration',
       'precipitation_h', 'pressure', 'sight', 'cloud_cover',
       'relative_humidity', 'weather_code', 'weather_index', 'fog', 'rain',
       'snow', 'thunder', 'ice'],
      dtype='object')

In [44]:
# Load the combined table that does not yet contain KNMI columns. No index_col is given,
# so 'datetime' is read as a regular column. Then list its columns.
cmsa_without_knmi = pd.read_csv("../data/cmsa_combined_without_knmi.csv")
cmsa_without_knmi.columns

Index(['datetime', 'GAWW-11', 'GAWW-12', 'GAWW-14', 'vacation_dummy',
       'holiday_dummy', 'stringency_index', 'checkin_dam', 'checkout_dam',
       'checkin_nieuwmarkt', 'checkout_nieuwmarkt', 'hotel_gasten',
       'hotel_overnachtingen', 'airport_tot_passengers',
       'airport_arrived_passengers', 'airport_departed_passengers',
       'covid_cases', 'covid_hospital', 'covid_deaths'],
      dtype='object')

In [45]:
# Keep only the rows whose 'datetime' lies between start_date and end_date (both inclusive).
knmi2 = knmi2.loc[knmi2['datetime'].between(start_date, end_date)]
knmi2.shape

(46848, 23)

In [46]:
# Index both frames by 'datetime' so they can be aligned on it.
# Each set_index is skipped when the frame is already indexed by 'datetime', which makes
# the cell safe to re-run (a second unconditional set_index raises KeyError because the
# column has already been moved into the index). If the frame is not yet indexed by
# 'datetime' and the column is missing, set_index still raises as before.
if knmi2.index.name != 'datetime':
    knmi2 = knmi2.set_index('datetime')
if cmsa_without_knmi.index.name != 'datetime':
    cmsa_without_knmi = cmsa_without_knmi.set_index('datetime')

In [47]:
# Column-wise concat; rows are aligned on the index of the two frames.
# verify_integrity=True raises if the two sources share a column name, instead of
# producing a frame with duplicate column labels.
cmsa_combined = pd.concat([cmsa_without_knmi, knmi2], axis=1, verify_integrity=True)
# concat keeps index values that occur in only one input and pads the other side with
# NaN; equal row counts before and after rule that out.
assert len(cmsa_combined) == len(cmsa_without_knmi) == len(knmi2), \
    "row indexes of cmsa_without_knmi and knmi2 do not line up"

In [48]:
# List the columns of the merged frame ('datetime' is the index here, so it is not listed).
cmsa_combined.columns

Index(['GAWW-11', 'GAWW-12', 'GAWW-14', 'vacation_dummy', 'holiday_dummy',
       'stringency_index', 'checkin_dam', 'checkout_dam', 'checkin_nieuwmarkt',
       'checkout_nieuwmarkt', 'hotel_gasten', 'hotel_overnachtingen',
       'airport_tot_passengers', 'airport_arrived_passengers',
       'airport_departed_passengers', 'covid_cases', 'covid_hospital',
       'covid_deaths', 'wind_direction', 'wind_speed', 'wind_speed_10m',
       'wind_gust', 'temperature', 'temperature_min', 'dew_point_temperature',
       'radiation_duration', 'global_radiation', 'precipitation_duration',
       'precipitation_h', 'pressure', 'sight', 'cloud_cover',
       'relative_humidity', 'weather_code', 'weather_index', 'fog', 'rain',
       'snow', 'thunder', 'ice'],
      dtype='object')

In [49]:
# Check the merged frame's size: the row count should match the filtered KNMI frame.
cmsa_combined.shape

(46848, 40)

In [51]:
# Write the merged frame to disk; its index is written as the first CSV column.
cmsa_combined.to_csv("../data/cmsa_combined.csv")

In [52]:
# Read the file just written back in, rebinding cmsa_combined. No index_col is given,
# so 'datetime' comes back as a regular column with a default integer index.
cmsa_combined = pd.read_csv("../data/cmsa_combined.csv")
cmsa_combined.columns

Index(['datetime', 'GAWW-11', 'GAWW-12', 'GAWW-14', 'vacation_dummy',
       'holiday_dummy', 'stringency_index', 'checkin_dam', 'checkout_dam',
       'checkin_nieuwmarkt', 'checkout_nieuwmarkt', 'hotel_gasten',
       'hotel_overnachtingen', 'airport_tot_passengers',
       'airport_arrived_passengers', 'airport_departed_passengers',
       'covid_cases', 'covid_hospital', 'covid_deaths', 'wind_direction',
       'wind_speed', 'wind_speed_10m', 'wind_gust', 'temperature',
       'temperature_min', 'dew_point_temperature', 'radiation_duration',
       'global_radiation', 'precipitation_duration', 'precipitation_h',
       'pressure', 'sight', 'cloud_cover', 'relative_humidity', 'weather_code',
       'weather_index', 'fog', 'rain', 'snow', 'thunder', 'ice'],
      dtype='object')

In [53]:
def missingValuesInfo(df):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row for each column of ``df`` that contains at least one null,
        indexed by column name and ordered by descending null count, with
        columns ``'Total'`` (number of nulls) and ``'Percent'`` (nulls as a
        percentage of ``len(df)``, rounded to two decimals).
    """
    # Count and sort once; the percentage is derived from that same Series, so
    # both columns share one index and line up without a second sort.
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (total / len(df) * 100).round(2)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary.loc[summary['Total'] > 0]

# Report which columns of the merged frame contain nulls, and how many.
missingValuesInfo(cmsa_combined)

Unnamed: 0,Total,Percent
temperature_min,39144,83.56
weather_code,28040,59.85


In [57]:
# Summary statistics for 'weather_index'.
# NOTE(review): presumably checked here to confirm it is fully populated before the
# sparse 'weather_code' column is dropped below — confirm intent.
cmsa_combined[['weather_index']].describe()

Unnamed: 0,weather_index
count,46848.0
mean,5.803449
std,0.980504
min,5.0
25%,5.0
50%,5.0
75%,7.0
max,7.0


In [58]:
# Drop the two columns the missing-value report flagged as mostly empty.
# The result is rebound to the name instead of mutating the frame with inplace=True,
# so any other reference to the loaded frame is left untouched.
cmsa_combined = cmsa_combined.drop(columns=['temperature_min', 'weather_code'])

In [59]:
# Re-run the missing-value report after dropping the sparse columns. The helper is
# already defined earlier in this notebook, so the identical second definition that
# used to sit in this cell is removed and the existing function is reused.
missingValuesInfo(cmsa_combined)

Unnamed: 0,Total,Percent
