In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Read the preprocessed train/test splits and convert 'timestamp' to datetime
# so the .dt accessor can be used for the time-based features further down.
df_train = pd.read_csv('dataset/train_preprocessed.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

df_test = pd.read_csv('dataset/test_preprocessed.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [3]:
# Print a summary of the training frame: columns, non-null counts, dtypes and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492157 entries, 0 to 492156
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   item_id           492157 non-null  object        
 1   timestamp         492157 non-null  datetime64[ns]
 2   rerata_kecepatan  492157 non-null  float64       
 3   lanes             492157 non-null  float64       
 4   maxspeed          492157 non-null  float64       
 5   highway           492157 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 22.5+ MB


# Maxspeed Feature Engineering

### Unit Conversion

Data maxspeed yang diimport dari external data bersatuan mph, sedangkan data yang ada pada dataset bersatuan km/h. Maka dari itu perlu dilakukan konversi.

In [4]:
# Conversion factor applied to the mph speed limits to express them in km/h.
MPH_TO_KMH = 1.60934

# Add the converted speed limit to both splits, leaving the original column untouched.
for frame in (df_train, df_test):
    frame['maxspeed_kmh'] = frame['maxspeed'] * MPH_TO_KMH

### Average from Max?

Menurut intuisi, maxspeed tentu sangat jarang tercapai; bukankah rata-rata kecepatan umumnya berada di bawah maxspeed? Maka dari itu, akan dicoba dibuat fitur-fitur baru berupa half_maxspeed, 3_per_4_maxspeed, dan 2_per_3_maxspeed.

In [5]:
# Fractions of the speed limit used as rough proxies for the typical average speed.
# They are derived from 'maxspeed_kmh' (not the raw mph 'maxspeed') so that they are
# expressed in the same unit as the observed speeds, and two thirds is written
# exactly instead of the truncated 0.666.
MAXSPEED_FRACTIONS = {
    'half_maxspeed': 1 / 2,
    '3_per_4_maxspeed': 3 / 4,
    '2_per_3_maxspeed': 2 / 3,
}

# Same columns, in the same order, are added to both splits.
for frame in (df_train, df_test):
    for column, fraction in MAXSPEED_FRACTIONS.items():
        frame[column] = frame['maxspeed_kmh'] * fraction

# Timeseries Feature Extraction

### day_of_week

In [6]:
# Integer day of the week taken from the timestamp (pandas convention: Monday=0 ... Sunday=6).
for frame in (df_train, df_test):
    frame['day_of_week'] = frame['timestamp'].dt.dayofweek

### holiday

In [7]:
# Encode the "libur" (day-off) flag: Saturday (5) and Sunday (6) map to 1,
# Monday-Friday (0-4) map to 0. Only weekends are covered by this flag.
ordinal_map = {day: int(day >= 5) for day in range(7)}

# The flag is written once, directly from the mapping; the earlier placeholder
# copy of 'day_of_week' into 'holiday' was immediately overwritten and is gone.
df_train['holiday'] = df_train['day_of_week'].map(ordinal_map)
df_test['holiday'] = df_test['day_of_week'].map(ordinal_map)

### hour

In [8]:
# Hour of day extracted from the timestamp, added to both splits.
for frame in (df_train, df_test):
    frame['hour'] = frame['timestamp'].dt.hour

### mean_rerata_kecepatan_mingguan

In [9]:
# Group-level aggregation helper used with DataFrame.groupby(...).apply(...)
def calculate_mean(group):
    """Return the mean of the 'rerata_kecepatan' column of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Any frame (typically one groupby partition) that contains a
        'rerata_kecepatan' column.

    Returns
    -------
    The value of ``Series.mean()`` for that column (missing values are
    skipped by pandas' default).
    """
    speeds = group['rerata_kecepatan']
    return speeds.mean()

# Mean observed speed per road segment for every (day_of_week, hour) slot,
# computed from the training data only. Selecting the column and calling
# .mean() is the vectorized equivalent of groupby(...).apply(<mean of column>)
# and avoids pandas' deprecation warning about apply() touching grouping columns.
weekly_keys = ['item_id', 'day_of_week', 'hour']
weekly_col = 'mean_rerata_kecepatan_mingguan'
mean_values = (
    df_train.groupby(weekly_keys)['rerata_kecepatan']
    .mean()
    .reset_index(name=weekly_col)
)

# Attach the aggregate to both splits. Any existing copy of the feature is
# dropped first so re-running this cell does not create suffixed duplicates
# (..._x / ..._y). The left join keeps every row; test rows whose key
# combination never occurs in train receive NaN. `validate` guards against
# accidental row duplication.
df_train = df_train.drop(columns=[weekly_col], errors='ignore').merge(
    mean_values, on=weekly_keys, how='left', validate='many_to_one'
)
df_test = df_test.drop(columns=[weekly_col], errors='ignore').merge(
    mean_values, on=weekly_keys, how='left', validate='many_to_one'
)

### mean_rerata_kecepatan_harian

In [10]:
# Mean observed speed per road segment for every hour of the day (all weekdays
# pooled), computed from the training data only. The built-in column mean
# replaces groupby(...).apply(...) with a custom function: same values,
# vectorized, and free of the pandas apply-on-grouping-columns deprecation warning.
daily_keys = ['item_id', 'hour']
daily_col = 'mean_rerata_kecepatan_harian'
mean_values = (
    df_train.groupby(daily_keys)['rerata_kecepatan']
    .mean()
    .reset_index(name=daily_col)
)

# Attach the aggregate to both splits. Any existing copy of the feature is
# dropped first so re-running this cell does not create suffixed duplicates
# (..._x / ..._y). The left join keeps every row; test rows whose key
# combination never occurs in train receive NaN.
df_train = df_train.drop(columns=[daily_col], errors='ignore').merge(
    mean_values, on=daily_keys, how='left', validate='many_to_one'
)
df_test = df_test.drop(columns=[daily_col], errors='ignore').merge(
    mean_values, on=daily_keys, how='left', validate='many_to_one'
)

### Save File

In [11]:
# Persist both feature-engineered splits as CSV, without the index column.
for frame, csv_path in (
    (df_train, 'dataset/train_feature_engineered.csv'),
    (df_test, 'dataset/test_feature_engineered.csv'),
):
    frame.to_csv(csv_path, index=False)