By C4

In [1]:
%pylab inline
import pandas as pd
import workalendar.europe.belgium as belgium

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the raw records; the "time" column is parsed into datetimes while reading.
raw_data = pd.read_csv(
    filepath_or_buffer="data/dataraw.csv",
    parse_dates=["time"],
)

In [3]:
# Keep only the timestamp and the open/closed flag; .copy() leaves raw_data untouched.
cleaned_data = raw_data.loc[:, ["time", "is_open"]].copy()

# Force the flag to False for the night hours: 23:00 onwards and anything before 07:00.
cleaned_data.loc[
    (cleaned_data["time"].dt.hour < 7) | (cleaned_data["time"].dt.hour >= 23),
    "is_open"
] = False

# Calendar date of every record, derived from its timestamp.
cleaned_data["date"] = cleaned_data["time"].dt.date

cleaned_data.sample(10)

Unnamed: 0,time,is_open,date
11984,2014-04-20 22:00:00,False,2014-04-20
11500,2014-03-31 18:00:00,False,2014-03-31
6148,2013-08-20 18:00:00,False,2013-08-20
35164,2016-12-11 18:00:00,False,2016-12-11
38950,2017-05-18 12:00:00,False,2017-05-18
20065,2015-03-23 15:00:00,False,2015-03-23
15526,2014-09-15 12:00:00,False,2014-09-15
10657,2014-02-24 15:00:00,True,2014-02-24
30845,2016-06-14 19:00:00,False,2016-06-14
10035,2014-01-29 17:00:00,True,2014-01-29


In [4]:
# One entry per distinct calendar date present in the cleaned data, as a Series named "day".
data_dates = pd.Series(pd.unique(cleaned_data["date"]), name="day")
data_dates.sample(10)

251     2013-08-15
241     2013-08-05
1359    2016-08-27
1625    2017-05-20
1100    2015-12-12
216     2013-07-11
1613    2017-05-08
747     2014-12-24
331     2013-11-03
1103    2015-12-15
Name: day, dtype: object

In [5]:
# First day of each academic year (used by get_week_no), oldest first.
# Every start falls in September, so only (year, day-of-month) is listed.
# NOTE(review): `datetime` is not imported explicitly in this notebook; it is
# presumably injected into the namespace by `%pylab inline` -- confirm.
starting_days = [
    datetime.date(*year_and_day[:1] + (9,) + year_and_day[1:])
    for year_and_day in (
        (2009, 14),
        (2010, 20),
        (2011, 19),
        (2012, 17),
        (2013, 16),
        (2014, 15),
        (2015, 14),
        (2016, 19),
        (2017, 18),
    )
]

# Sanity check: every academic year must start on a Monday (weekday() == 0).
assert all(start.weekday() == 0 for start in starting_days)

In [6]:
def get_week_no(date, starts=None):
    """Return the zero-based academic week number of ``date``.

    The academic year containing ``date`` is the one whose start is the latest
    start day that is on or before ``date``; the result is the number of whole
    weeks elapsed since that start (0 for the first seven days, 1 for the next
    seven, ...).

    Parameters
    ----------
    date : datetime.date
        The day to classify.
    starts : iterable of datetime.date, optional
        Candidate academic-year start days, in any order. Defaults to the
        notebook-level ``starting_days`` list.

    Returns
    -------
    int
        Whole weeks between the academic-year start and ``date``.

    Raises
    ------
    ValueError
        If ``date`` is earlier than every known start day.
    """
    if starts is None:
        starts = starting_days

    earlier_starts = [d for d in starts if d <= date]
    if not earlier_starts:
        raise ValueError(
            "no academic year start on or before {}".format(date)
        )

    # max() rather than "last element": the answer must not depend on the
    # order in which the start days happen to be listed.
    aca_start = max(earlier_starts)

    # Plain integer day arithmetic; (date - aca_start) is never negative here,
    # so floor division gives the same value as flooring the fractional weeks.
    return (date - aca_start).days // 7

# Per-day table: one row per distinct date with its academic week number;
# the "day" column is converted to pandas datetimes after the week lookup.
week_df = (
    data_dates
    .to_frame()
    .assign(week_no=lambda frame: frame["day"].apply(get_week_no))
    .assign(day=lambda frame: pd.to_datetime(frame["day"]))
)

# Disabled inspection snippet, kept for reference:
#week_df[week_df.day.dt.date > datetime64("2013-09-10")].head(15)

In [7]:
# Build the feature table from the cleaned records (copy: cleaned_data is not modified).
featurized = cleaned_data.copy()

# day of the week (Monday = 0 ... Sunday = 6)
featurized["weekday"] = featurized["time"].dt.weekday

# academic week number (whole weeks since the start of the academic year)
featurized["aca_week_no"] = featurized["date"].apply(get_week_no)

# civil (ISO 8601) week number
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0. Reading the
# week from each timestamp's isocalendar() yields the same ISO week number and
# works on every pandas version.
featurized["week_no"] = featurized["time"].apply(lambda ts: ts.isocalendar()[1])

# date
featurized["year"] = featurized["time"].dt.year
featurized["month"] = featurized["time"].dt.month
featurized["day"] = featurized["time"].dt.day
featurized["hour"] = featurized["time"].dt.hour

# holiday: working-day flag according to the workalendar Belgium calendar
calendar = belgium.Belgium()
featurized["working"] = featurized["date"].apply(calendar.is_working_day)
### St V: 20 November is always treated as a non-working day
### (presumably the Saint-Verhaegen university holiday -- confirm)
featurized.loc[(featurized["time"].dt.month == 11) & (featurized["time"].dt.day == 20), "working"] = False

# tampon ("buffer" weeks, identified by fixed academic week numbers)
featurized["tampon"] = False
featurized.loc[featurized.aca_week_no.isin([6, 13, 26, 34]), "tampon"] = True

# exams (two sessions, identified by fixed academic week numbers)
featurized["exams"] = False
featurized.loc[featurized.aca_week_no.isin([14, 15, 16, 17, 18]), "exams"] = True
featurized.loc[featurized.aca_week_no.isin([36, 37, 38, 39, 40]), "exams"] = True

# vacances (holidays): week 19 plus everything from week 41 until the next
# academic year starts. Some academic years span 53 weeks (e.g. 2015-09-14 to
# 2016-09-19), so week 52 exists; the previous upper bound range(41, 52)
# stopped at week 51 and left that last week unflagged.
featurized["vacances"] = False
featurized.loc[(featurized.aca_week_no == 19) | (featurized.aca_week_no >= 41), "vacances"] = True

# weather: no features added yet

# sort_values returns a new frame; the result has to be assigned back,
# otherwise the sort has no effect on `featurized`.
featurized = featurized.sort_values("time")
featurized.sample(5)

Unnamed: 0,time,is_open,date,weekday,aca_week_no,week_no,year,month,day,hour,working,tampon,exams,vacances
15650,2014-09-20 16:00:00,False,2014-09-20,5,0.0,38,2014,9,20,16,False,False,False,False
27499,2016-01-27 09:00:00,False,2016-01-27,2,19.0,4,2016,1,27,9,True,False,False,True
27906,2016-02-13 08:00:00,False,2016-02-13,5,21.0,6,2016,2,13,8,False,False,False,False
1546,2013-02-10 00:00:00,False,2013-02-10,6,20.0,6,2013,2,10,0,False,False,False,False
9186,2013-12-25 08:00:00,False,2013-12-25,2,14.0,52,2013,12,25,8,False,False,True,False


In [8]:
# Write the feature table to CSV (the row index is written too, as no index option is given).
featurized.to_csv(path_or_buf="data/data_featurized.csv")