# 6.1. Time Series Feature Extraction

Target di notebook ini kita ekstrak banyak feature dulu, baru habis itu diuji kualitas featurenya.

Walaupun banyak feature yg ga masuk akal, kayak misal masak tiap hari jumat lebih sering hujan? kan gamasuk ya, tp coba aja :v

In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv('./dataset/train_cleaned.csv')
df_train['datetime_iso'] = pd.to_datetime(df_train['datetime_iso'])

df_test = pd.read_csv('./dataset/test_cleaned.csv')
df_test['datetime_iso'] = pd.to_datetime(df_test['datetime_iso'])

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341880 entries, 0 to 341879
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   datetime      341880 non-null  int64              
 1   datetime_iso  341880 non-null  datetime64[ns, UTC]
 2   time-zone     341880 non-null  int64              
 3   temp          341880 non-null  float64            
 4   visibility    51112 non-null   object             
 5   d_point       341880 non-null  float64            
 6   feels         341880 non-null  float64            
 7   min_temp      341880 non-null  float64            
 8   max_temp      341880 non-null  float64            
 9   prssr         341880 non-null  float64            
 10  sea_level     192964 non-null  object             
 11  grnd_level    192919 non-null  object             
 12  hum           341880 non-null  float64            
 13  wind_spd      339654 non-null  float64      

In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49368 entries, 0 to 49367
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   datetime      49368 non-null  int64              
 1   datetime_iso  49368 non-null  datetime64[ns, UTC]
 2   time-zone     49368 non-null  int64              
 3   temp          49368 non-null  float64            
 4   visibility    7533 non-null   object             
 5   d_point       49367 non-null  float64            
 6   feels         49368 non-null  float64            
 7   min_temp      49368 non-null  float64            
 8   max_temp      49368 non-null  float64            
 9   prssr         49368 non-null  float64            
 10  sea_level     27694 non-null  object             
 11  grnd_level    27866 non-null  object             
 12  hum           49368 non-null  float64            
 13  wind_spd      49034 non-null  float64            
 14  wind_d

In [6]:
df = pd.concat([df_train, df_test], axis=0)

# Feature Extraction: Time Series

In [7]:
# Kita ambil datetime_iso saja dan label nya

df_time_series = df[['datetime_iso', 'rain_1h']]

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341880 entries, 0 to 341879
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   datetime_iso  341880 non-null  datetime64[ns, UTC]
 1   rain_1h       341880 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1)
memory usage: 5.2 MB


## Create Feature: Day

In [6]:
# Extract the month and create a new column
df_train['day_of_the_week'] = df_train['datetime_iso'].dt.dayofweek
df_test['day_of_the_week'] = df_test['datetime_iso'].dt.dayofweek

In [7]:
df_train.day_of_the_week.value_counts()

0    48840
1    48840
2    48840
3    48840
4    48840
5    48840
6    48840
Name: day_of_the_week, dtype: int64

## Create Feature: Month

In [8]:
# Extract the month and create a new column
df_train['month'] = df_train['datetime_iso'].dt.month
df_test['month'] = df_test['datetime_iso'].dt.month

In [9]:
df_train.month.value_counts()

1     29016
3     29016
5     29016
7     29016
8     29016
10    29016
12    29016
4     28080
6     28080
9     28080
11    28080
2     26448
Name: month, dtype: int64

## Create Feature: Season

Pembagian musim:
- Hujan: Oktober - Maret
- Kemarau: April - September

In [10]:
ordinal_map = {1:1,
               2:1,
               3:1,
               4:0,
               5:0,
               6:0,
               7:0,
               8:0,
               9:0,
               10:1,
               11:1,
               12:1,
               
}

df_train['season'] = df_train.month.map(ordinal_map)
df_test['season'] = df_test.month.map(ordinal_map)

In [11]:
df_train.head()

Unnamed: 0,datetime_iso,rain_1h,day_of_the_week,month,season
0,1979-01-01 00:00:00+00:00,0.0,0,1,1
1,1979-01-01 01:00:00+00:00,0.0,0,1,1
2,1979-01-01 02:00:00+00:00,0.0,0,1,1
3,1979-01-01 03:00:00+00:00,0.13,0,1,1
4,1979-01-01 04:00:00+00:00,0.34,0,1,1


### Create Feature: Lag 6 years ago

In [12]:
# Specify the number of years to lag
lag_years = 6

# Calculate the lagged date
df_train['lag_rain_1h_6_years_date'] = df_train['datetime_iso'] - pd.DateOffset(years=lag_years)
df_test['lag_rain_1h_6_years_date'] = df_test['datetime_iso'] - pd.DateOffset(years=lag_years)

df_train['lag_rain_1h_6_years_date'] = pd.to_datetime(df_train['lag_rain_1h_6_years_date'])
df_test['lag_rain_1h_6_years_date'] = pd.to_datetime(df_test['lag_rain_1h_6_years_date'])

# Merge the DataFrame with itself to get lag_rain_1h_6_years_value
result_df =df_train.merge(df_train[['datetime_iso', 'rain_1h']], left_on='lag_rain_1h_6_years_date', right_on='datetime_iso', suffixes=('', '_lag'))

# Rename the 'rain_1h_lag' column to 'lag_rain_1h_6_years_value'
result_df.rename(columns={'rain_1h_lag': 'lag_rain_1h_6_years_value'}, inplace=True)

# Drop the duplicated datetime_iso column
result_df.drop(columns='datetime_iso_lag', inplace=True)

In [15]:
result_df.head()
result_df.drop(['lag_rain_1h_6_years_date'])

Unnamed: 0,datetime_iso,rain_1h,day_of_the_week,month,season,lag_rain_1h_6_years_date,lag_rain_1h_6_years_value
0,1985-01-01 00:00:00+00:00,0.0,1,1,1,1979-01-01 00:00:00+00:00,0.0
1,1985-01-01 01:00:00+00:00,0.11,1,1,1,1979-01-01 01:00:00+00:00,0.0
2,1985-01-01 02:00:00+00:00,0.1,1,1,1,1979-01-01 02:00:00+00:00,0.0
3,1985-01-01 03:00:00+00:00,0.13,1,1,1,1979-01-01 03:00:00+00:00,0.13
4,1985-01-01 04:00:00+00:00,0.33,1,1,1,1979-01-01 04:00:00+00:00,0.34


## Kita uji dengan model sederhana saja: random forest

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
X = df_train

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'X' is not defined