# Importing the packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error

set_config(transform_output="pandas")

# Loading the data

In [2]:
FINAL_DATA_PATH = "../data/interim/final_time_series_data.csv"

df = pd.read_csv(FINAL_DATA_PATH, parse_dates=["tpep_pickup_datetime"])
df

Unnamed: 0,tpep_pickup_datetime,region,total_pickups,avg_pickups
0,2016-01-01 00:00:00,0,58,58.0
1,2016-01-01 00:15:00,0,120,97.0
2,2016-01-01 00:30:00,0,149,123.0
3,2016-01-01 00:45:00,0,160,140.0
4,2016-01-01 01:00:00,0,187,161.0
...,...,...,...,...
262075,2016-03-31 22:45:00,29,14,16.0
262076,2016-03-31 23:00:00,29,17,16.0
262077,2016-03-31 23:15:00,29,18,17.0
262078,2016-03-31 23:30:00,29,13,15.0


In [3]:
df.shape

(262080, 4)

In [4]:
df.dtypes

tpep_pickup_datetime    datetime64[ns]
region                           int64
total_pickups                    int64
avg_pickups                    float64
dtype: object

In [5]:
# Checking for missing values

df.isna().sum()

tpep_pickup_datetime    0
region                  0
total_pickups           0
avg_pickups             0
dtype: int64

# Extracting features

In [6]:
df["day_of_week"] = df["tpep_pickup_datetime"].dt.day_of_week
df["month"] = df["tpep_pickup_datetime"].dt.month
# As we have data for only 3 months, this month feature may not be very useful for prediction.
# However, I will use this feature for train-test split

# Setting the datetime column as the index

df.set_index("tpep_pickup_datetime", inplace=True)
df

Unnamed: 0_level_0,region,total_pickups,avg_pickups,day_of_week,month
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01 00:00:00,0,58,58.0,4,1
2016-01-01 00:15:00,0,120,97.0,4,1
2016-01-01 00:30:00,0,149,123.0,4,1
2016-01-01 00:45:00,0,160,140.0,4,1
2016-01-01 01:00:00,0,187,161.0,4,1
...,...,...,...,...,...
2016-03-31 22:45:00,29,14,16.0,3,3
2016-03-31 23:00:00,29,17,16.0,3,3
2016-03-31 23:15:00,29,18,17.0,3,3
2016-03-31 23:30:00,29,13,15.0,3,3


# Creating lag features

In [7]:
region_grp = df.groupby("region")
region_grp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017B7EA7B5D0>

In [8]:
# Shifting periods

periods = list(range(1, 5))
periods

[1, 2, 3, 4]

In [9]:
# Generating lag features

lag_features = region_grp["total_pickups"].shift(periods)
lag_features

Unnamed: 0_level_0,total_pickups_1,total_pickups_2,total_pickups_3,total_pickups_4
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01 00:00:00,,,,
2016-01-01 00:15:00,58.0,,,
2016-01-01 00:30:00,120.0,58.0,,
2016-01-01 00:45:00,149.0,120.0,58.0,
2016-01-01 01:00:00,160.0,149.0,120.0,58.0
...,...,...,...,...
2016-03-31 22:45:00,22.0,14.0,15.0,13.0
2016-03-31 23:00:00,14.0,22.0,14.0,15.0
2016-03-31 23:15:00,17.0,14.0,22.0,14.0
2016-03-31 23:30:00,18.0,17.0,14.0,22.0


In [10]:
# Merging the lag features with the original df

data = pd.concat([lag_features, df], axis=1)
data

Unnamed: 0_level_0,total_pickups_1,total_pickups_2,total_pickups_3,total_pickups_4,region,total_pickups,avg_pickups,day_of_week,month
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-01-01 00:00:00,,,,,0,58,58.0,4,1
2016-01-01 00:15:00,58.0,,,,0,120,97.0,4,1
2016-01-01 00:30:00,120.0,58.0,,,0,149,123.0,4,1
2016-01-01 00:45:00,149.0,120.0,58.0,,0,160,140.0,4,1
2016-01-01 01:00:00,160.0,149.0,120.0,58.0,0,187,161.0,4,1
...,...,...,...,...,...,...,...,...,...
2016-03-31 22:45:00,22.0,14.0,15.0,13.0,29,14,16.0,3,3
2016-03-31 23:00:00,14.0,22.0,14.0,15.0,29,17,16.0,3,3
2016-03-31 23:15:00,17.0,14.0,22.0,14.0,29,18,17.0,3,3
2016-03-31 23:30:00,18.0,17.0,14.0,22.0,29,13,15.0,3,3


In [11]:
print(f"Shape of the data before concatenation: {df.shape}.")
print(f"Shape of the data after concatenation: {data.shape}.")

Shape of the data before concatenation: (262080, 5).
Shape of the data after concatenation: (262080, 9).


In [12]:
# Number of rows having missing values

data.isna().any(axis=1).sum()

np.int64(120)

In [13]:
# Dropping the rows with missing values

data.dropna(inplace=True)
data.isna().any(axis=1).sum()

np.int64(0)

In [14]:
mapper = {
    name: f"lag_{idx + 1}" for idx, name in enumerate(data.columns[0:4])
}
mapper

{'total_pickups_1': 'lag_1',
 'total_pickups_2': 'lag_2',
 'total_pickups_3': 'lag_3',
 'total_pickups_4': 'lag_4'}

In [15]:
# Replace column names

data = data.rename(columns=mapper)
data

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,total_pickups,avg_pickups,day_of_week,month
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-01-01 01:00:00,160.0,149.0,120.0,58.0,0,187,161.0,4,1
2016-01-01 01:15:00,187.0,160.0,149.0,120.0,0,194,175.0,4,1
2016-01-01 01:30:00,194.0,187.0,160.0,149.0,0,180,177.0,4,1
2016-01-01 01:45:00,180.0,194.0,187.0,160.0,0,197,185.0,4,1
2016-01-01 02:00:00,197.0,180.0,194.0,187.0,0,185,185.0,4,1
...,...,...,...,...,...,...,...,...,...
2016-03-31 22:45:00,22.0,14.0,15.0,13.0,29,14,16.0,3,3
2016-03-31 23:00:00,14.0,22.0,14.0,15.0,29,17,16.0,3,3
2016-03-31 23:15:00,17.0,14.0,22.0,14.0,29,18,17.0,3,3
2016-03-31 23:30:00,18.0,17.0,14.0,22.0,29,13,15.0,3,3


In [16]:
# Number of rows in each month

data["month"].value_counts()

month
3    89280
1    89160
2    83520
Name: count, dtype: int64

In [17]:
data["month"].value_counts(normalize=True)

month
3    0.340815
1    0.340357
2    0.318827
Name: proportion, dtype: float64

# Train-test split

In [18]:
train_df = data.loc[data["month"].isin([1, 2]), "lag_1": "day_of_week"]
test_df = data.loc[data["month"].isin([3]), "lag_1": "day_of_week"]

In [19]:
train_df

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,total_pickups,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-01 01:00:00,160.0,149.0,120.0,58.0,0,187,161.0,4
2016-01-01 01:15:00,187.0,160.0,149.0,120.0,0,194,175.0,4
2016-01-01 01:30:00,194.0,187.0,160.0,149.0,0,180,177.0,4
2016-01-01 01:45:00,180.0,194.0,187.0,160.0,0,197,185.0,4
2016-01-01 02:00:00,197.0,180.0,194.0,187.0,0,185,185.0,4
...,...,...,...,...,...,...,...,...
2016-02-29 22:45:00,15.0,9.0,11.0,11.0,29,12,12.0,0
2016-02-29 23:00:00,12.0,15.0,9.0,11.0,29,17,14.0,0
2016-02-29 23:15:00,17.0,12.0,15.0,9.0,29,15,14.0,0
2016-02-29 23:30:00,15.0,17.0,12.0,15.0,29,15,15.0,0


In [20]:
test_df

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,total_pickups,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-03-01 00:00:00,36.0,44.0,31.0,29.0,0,41,39.0,1
2016-03-01 00:15:00,41.0,36.0,44.0,31.0,0,35,37.0,1
2016-03-01 00:30:00,35.0,41.0,36.0,44.0,0,47,41.0,1
2016-03-01 00:45:00,47.0,35.0,41.0,36.0,0,34,38.0,1
2016-03-01 01:00:00,34.0,47.0,35.0,41.0,0,30,35.0,1
...,...,...,...,...,...,...,...,...
2016-03-31 22:45:00,22.0,14.0,15.0,13.0,29,14,16.0,3
2016-03-31 23:00:00,14.0,22.0,14.0,15.0,29,17,16.0,3
2016-03-31 23:15:00,17.0,14.0,22.0,14.0,29,18,17.0,3
2016-03-31 23:30:00,18.0,17.0,14.0,22.0,29,13,15.0,3


In [21]:
# Saving the training and test sets

TRAIN_DATA_PATH = "../data/processed/train.csv"
TEST_DATA_PATH = "../data/processed/test.csv"

# train_df.to_csv(TRAIN_DATA_PATH, index=True)
# test_df.to_csv(TEST_DATA_PATH, index=True)

# Input-output split

In [22]:
X_train = train_df.drop(columns=["total_pickups"])
y_train = train_df["total_pickups"]

In [23]:
X_train

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01 01:00:00,160.0,149.0,120.0,58.0,0,161.0,4
2016-01-01 01:15:00,187.0,160.0,149.0,120.0,0,175.0,4
2016-01-01 01:30:00,194.0,187.0,160.0,149.0,0,177.0,4
2016-01-01 01:45:00,180.0,194.0,187.0,160.0,0,185.0,4
2016-01-01 02:00:00,197.0,180.0,194.0,187.0,0,185.0,4
...,...,...,...,...,...,...,...
2016-02-29 22:45:00,15.0,9.0,11.0,11.0,29,12.0,0
2016-02-29 23:00:00,12.0,15.0,9.0,11.0,29,14.0,0
2016-02-29 23:15:00,17.0,12.0,15.0,9.0,29,14.0,0
2016-02-29 23:30:00,15.0,17.0,12.0,15.0,29,15.0,0


In [24]:
y_train

tpep_pickup_datetime
2016-01-01 01:00:00    187
2016-01-01 01:15:00    194
2016-01-01 01:30:00    180
2016-01-01 01:45:00    197
2016-01-01 02:00:00    185
                      ... 
2016-02-29 22:45:00     12
2016-02-29 23:00:00     17
2016-02-29 23:15:00     15
2016-02-29 23:30:00     15
2016-02-29 23:45:00     12
Name: total_pickups, Length: 172680, dtype: int64

In [25]:
X_test = test_df.drop(columns=["total_pickups"])
y_test = test_df["total_pickups"]

In [26]:
X_test

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,region,avg_pickups,day_of_week
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-03-01 00:00:00,36.0,44.0,31.0,29.0,0,39.0,1
2016-03-01 00:15:00,41.0,36.0,44.0,31.0,0,37.0,1
2016-03-01 00:30:00,35.0,41.0,36.0,44.0,0,41.0,1
2016-03-01 00:45:00,47.0,35.0,41.0,36.0,0,38.0,1
2016-03-01 01:00:00,34.0,47.0,35.0,41.0,0,35.0,1
...,...,...,...,...,...,...,...
2016-03-31 22:45:00,22.0,14.0,15.0,13.0,29,16.0,3
2016-03-31 23:00:00,14.0,22.0,14.0,15.0,29,16.0,3
2016-03-31 23:15:00,17.0,14.0,22.0,14.0,29,17.0,3
2016-03-31 23:30:00,18.0,17.0,14.0,22.0,29,15.0,3


In [27]:
y_test

tpep_pickup_datetime
2016-03-01 00:00:00    41
2016-03-01 00:15:00    35
2016-03-01 00:30:00    47
2016-03-01 00:45:00    34
2016-03-01 01:00:00    30
                       ..
2016-03-31 22:45:00    14
2016-03-31 23:00:00    17
2016-03-31 23:15:00    18
2016-03-31 23:30:00    13
2016-03-31 23:45:00    14
Name: total_pickups, Length: 89280, dtype: int64

# Regression pipeline

In [28]:
encoder = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(drop="first", sparse_output=False), ["region", "day_of_week"]),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
)
encoder

In [29]:
# Encoding training and test sets

X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [30]:
# Linear regression

regressor = LinearRegression()

regressor.fit(X_train_encoded, y_train)

In [31]:
# Predictions

y_pred_train = regressor.predict(X_train_encoded)
y_pred_test = regressor.predict(X_test_encoded)

In [32]:
# Error metrics

train_mape = mean_absolute_percentage_error(y_true=y_train, y_pred=y_pred_train)
test_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_test)

In [33]:
print(f"Train MAPE: {train_mape * 100:.2f}%.")
print(f"Test MAPE: {test_mape * 100:.2f}%.")

Train MAPE: 8.78%.
Test MAPE: 7.93%.
