# Date and time features with Feature-engine

In [1]:
import pandas as pd
from feature_engine.datetime import DatetimeFeatures

In [2]:
# Build a small toy dataframe that holds a single datetime variable.

# 20 consecutive calendar days, starting on 2019-03-05
rng_ = pd.date_range(start="2019-03-05", periods=20, freq="D")

# turn the dates into a one-column dataframe; the column is named "date"
data = rng_.to_frame(index=False, name="date")

# preview the first 5 rows
data.head(n=5)

Unnamed: 0,date
0,2019-03-05
1,2019-03-06
2,2019-03-07
3,2019-03-08
4,2019-03-09


In [3]:
# Set up the transformer so that it extracts all possible features.

dtfs = DatetimeFeatures(
    features_to_extract="all",
    variables=None,  # None: the datetime variable is identified automatically.
)

In [4]:
# Fit the transformer and derive the datetime features in one step.
dft = dtfs.fit_transform(data)

# Collect the names of the columns we just created. Feature-engine
# names each one after the original variable followed by the
# extracted feature, so every new column name contains "date".
vars_ = list(filter(lambda col: "date" in col, dft.columns))

# Display the first rows of the new features.
dft[vars_].head()

Unnamed: 0,date_month,date_quarter,date_semester,date_year,date_week,date_day_of_week,date_day_of_month,date_day_of_year,date_weekend,date_month_start,date_month_end,date_quarter_start,date_quarter_end,date_year_start,date_year_end,date_leap_year,date_days_in_month,date_hour,date_minute,date_second
0,3,1,1,2019,10,1,5,64,0,0,0,0,0,0,0,0,31,0,0,0
1,3,1,1,2019,10,2,6,65,0,0,0,0,0,0,0,0,31,0,0,0
2,3,1,1,2019,10,3,7,66,0,0,0,0,0,0,0,0,31,0,0,0
3,3,1,1,2019,10,4,8,67,0,0,0,0,0,0,0,0,31,0,0,0
4,3,1,1,2019,10,5,9,68,1,0,0,0,0,0,0,0,31,0,0,0


In [5]:
# After fitting, the datetime variable that the transformer
# identified automatically is available in the `variables_` attribute.

dtfs.variables_

['date']

In [6]:
# Set up the transformer to extract only the most common features
# (the selection used when features_to_extract is None).

dtfs = DatetimeFeatures(
    features_to_extract=None,
    variables=None,  # None: the datetime variable is identified automatically
)

In [7]:
# Fit the transformer and derive the datetime features in one step.
dft = dtfs.fit_transform(data)

# Collect the names of the columns we just created. Feature-engine
# names each one after the original variable followed by the
# extracted feature, so every new column name contains "date".
vars_ = list(filter(lambda col: "date" in col, dft.columns))

# Display the first rows of the new features.
dft[vars_].head()

Unnamed: 0,date_month,date_year,date_day_of_week,date_day_of_month,date_hour,date_minute,date_second
0,3,2019,1,5,0,0,0
1,3,2019,2,6,0,0,0
2,3,2019,3,7,0,0,0
3,3,2019,4,8,0,0,0
4,3,2019,5,9,0,0,0


In [8]:
# Set up the transformer to extract only the features we list ourselves.

dtfs = DatetimeFeatures(
    features_to_extract=["week", "year", "day_of_month", "day_of_week"],
    variables=None,  # None: the datetime variable is identified automatically
)

In [9]:
# Fit the transformer and derive the datetime features in one step.
dft = dtfs.fit_transform(data)

# Collect the names of the columns we just created. Feature-engine
# names each one after the original variable followed by the
# extracted feature, so every new column name contains "date".
vars_ = list(filter(lambda col: "date" in col, dft.columns))

# Display the first rows of the new features.
dft[vars_].head()

Unnamed: 0,date_week,date_year,date_day_of_month,date_day_of_week
0,10,2019,5,1
1,10,2019,6,2
2,10,2019,7,3
3,10,2019,8,4
4,10,2019,9,5


In [10]:
# First, let's create a toy dataframe with timestamps
# expressed in two different time zones.

df = pd.DataFrame()

# Three hourly timestamps per time zone, starting at 09:00 local time.
# The frequency alias is the lowercase "h": the uppercase "H" is
# deprecated since pandas 2.2.
# Stacking the two series keeps their original labels, so the index
# of the result is 0, 1, 2, 0, 1, 2.
df["time"] = pd.concat(
    [
        pd.Series(
            pd.date_range(start="2014-08-01 09:00", freq="h", periods=3, tz=tz)
        )
        for tz in ("Europe/Berlin", "US/Central")
    ],
    axis=0,
)

df

Unnamed: 0,time
0,2014-08-01 09:00:00+02:00
1,2014-08-01 10:00:00+02:00
2,2014-08-01 11:00:00+02:00
0,2014-08-01 09:00:00-05:00
1,2014-08-01 10:00:00-05:00
2,2014-08-01 11:00:00-05:00


We can see the different time zones, indicated by the UTC offsets +02:00 and -05:00, that is, the number of hours ahead of or behind the time at the prime meridian.

In [11]:
# With utc=True, DatetimeFeatures will take all timestamps to UTC
# before deriving the features; this is how we handle the time zones.

dfts = DatetimeFeatures(
    utc=True,
    drop_original=False,
    features_to_extract=["day_of_week", "hour", "minute"],
)

In [12]:
# Fit the transformer and derive the features from the timestamps.
dft = dfts.fit_transform(df)

# Show the first 5 rows.
dft.head(n=5)

Unnamed: 0,time,time_day_of_week,time_hour,time_minute
0,2014-08-01 09:00:00+02:00,4,7,0
1,2014-08-01 10:00:00+02:00,4,8,0
2,2014-08-01 11:00:00+02:00,4,9,0
0,2014-08-01 09:00:00-05:00,4,14,0
1,2014-08-01 10:00:00-05:00,4,15,0
