### Mit Interaktionsvariablen arbeiten (Kapitel 3.5.2)
#### 1) Daten laden und Datumsindex einstellen

In [3]:
import pandas as pd
from math import ceil

# Render up to 20 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 20)

# Location of the hourly trip/weather CSV.
data_url = r'https://github.com/tplusone/hanser_ml_zeitreihen/blob/master/Daten/trip_weather_hourly.csv?raw=true'

# Read the CSV, convert start_date to datetimes and make it the index.
df = (
    pd.read_csv(data_url)
    .assign(start_date=lambda frame: pd.to_datetime(frame['start_date']))
    .set_index('start_date')
)
df.head()

Unnamed: 0_level_0,number_trips,duration,max_temperature_f,mean_temperature_f,min_temperature_f,max_dew_point_f,mean_dew_point_f,min_dew_point_f,max_humidity,mean_humidity,...,mean_visibility_miles,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees,zip_code
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-08-29 09:00:00,12,1022.5,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
2013-08-29 10:00:00,14,1677.857143,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
2013-08-29 11:00:00,42,2203.809524,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
2013-08-29 12:00:00,120,934.833333,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
2013-08-29 13:00:00,87,3934.103448,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107


#### 2) Features aus Datum extrahieren

In [4]:
# Derive calendar features from the datetime index; each new column takes
# the value of the index attribute with the same name.
date_parts = ['weekday', 'month', 'hour']
for part in date_parts:
    df[part] = getattr(df.index, part)
df[date_parts].head()

Unnamed: 0_level_0,weekday,month,hour
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-08-29 09:00:00,3,8,9
2013-08-29 10:00:00,3,8,10
2013-08-29 11:00:00,3,8,11
2013-08-29 12:00:00,3,8,12
2013-08-29 13:00:00,3,8,13


#### 2) Train-Test-Split durchführen

In [5]:
from sklearn.model_selection import train_test_split

# Features are the three calendar columns, the target is the trip count.
X = df[['month', 'weekday', 'hour']]
y = df['number_trips']

# Hold out 20 % of the rows as test set; the fixed random_state keeps the
# split reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=11)
X_train, X_test, y_train, y_test = splits

# Show the shapes of all four parts.
tuple(part.shape for part in splits)

((14066, 3), (3517, 3), (14066,), (3517,))

#### 3) Interaktionsvariable aus weekday und hour bilden
- Zunächst hour und weekday in One-Hot bzw. Dummy-Format bringen

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Extract the hour as a one-column DataFrame (double brackets keep it 2D,
# which is the input shape the encoder is fitted on).
X_train_hour = X_train[['hour']]
X_test_hour = X_test[['hour']]

# One-hot encoding of the hour, returned as a dense array.
# The keyword for dense output was renamed from `sparse` to `sparse_output`
# and the old name was removed in newer scikit-learn releases, so try the
# new spelling first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
# Fit on the training data only, then apply the same encoding to both sets.
ohe.fit(X_train_hour)
X_train_hour = ohe.transform(X_train_hour)
X_test_hour = ohe.transform(X_test_hour)

# Weekend dummy derived from weekday: values below 5 -> 0 (weekday),
# values 5 and above -> 1 (weekend).
X_train_wd = (X_train['weekday'] >= 5).astype('int64')
X_test_wd = (X_test['weekday'] >= 5).astype('int64')

# Reshape into a 2D column vector, e.g. [[0], [1], [1]], so it can be
# multiplied and concatenated column-wise with the one-hot matrix.
X_train_wd = X_train_wd.values.reshape(-1, 1)
X_test_wd = X_test_wd.values.reshape(-1, 1)
X_train_wd


array([[0],
       [0],
       [0],
       ...,
       [0],
       [1],
       [1]], dtype=int64)

- Interaktionsvariable erzeugen und mit einfachen Variablen zusammenführen

In [14]:
import numpy as np

# Build the interaction terms. The model then has the general form
#   y = c1*x1 + c2*x2 + c3*x3 + c4*x3*x2 + ...
# i.e. in addition to the plain variables it contains products of two
# variables. (This formula is an explanation only, not executable code.)
# Broadcasting multiplies the single weekend column with every one-hot
# hour column, so an interaction column is 1 only where the row is a
# weekend row AND falls into that hour.
X_train_int = X_train_wd * X_train_hour
X_test_int = X_test_wd * X_test_hour

# Join the plain variables and the interaction terms column-wise:
# [weekend dummy | hour dummies | weekend x hour interactions].
X_train_compl = np.concatenate(
    [X_train_wd, X_train_hour, X_train_int],
    axis=1)
X_test_compl = np.concatenate(
    [X_test_wd, X_test_hour, X_test_int],
    axis=1)
X_train_compl.shape, X_test_compl.shape

((14066, 49), (3517, 49))

#### 4) Regression anlernen

In [15]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the combined training features
# (plain variables plus interaction terms).
linear = LinearRegression().fit(X_train_compl, y_train)

# Evaluate on the held-out test set; score() returns R^2.
r2 = linear.score(X_test_compl, y_test)
print('r2 {:.3f}'.format(r2))

r2 0.829
