# 6.1. Time Series Feature Extraction

Target notebook ini: ekstrak sebanyak mungkin feature terlebih dahulu, lalu uji kualitas masing-masing feature.

Sebagian feature mungkin tidak masuk akal (misalnya, masa setiap hari Jumat lebih sering hujan?), tetapi tetap kita coba dan uji saja.

In [1]:
import pandas as pd

In [2]:
def _read_split(csv_path, source_label):
    """Load one cleaned CSV split.

    Parses the 'datetime_iso' column into pandas datetimes and appends a
    'source' column holding ``source_label`` so the rows can be told apart
    after the splits are combined.
    """
    frame = pd.read_csv(csv_path)
    frame['datetime_iso'] = pd.to_datetime(frame['datetime_iso'])
    frame['source'] = source_label
    return frame


df_train = _read_split('./dataset/train_cleaned.csv', 'df_train')
df_test = _read_split('./dataset/test_cleaned.csv', 'df_test')

In [3]:
# Inspect column names, non-null counts and dtypes of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341880 entries, 0 to 341879
Data columns (total 21 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   datetime      341880 non-null  int64              
 1   datetime_iso  341880 non-null  datetime64[ns, UTC]
 2   time-zone     341880 non-null  int64              
 3   temp          341880 non-null  float64            
 4   visibility    51112 non-null   object             
 5   d_point       341880 non-null  float64            
 6   feels         341880 non-null  float64            
 7   min_temp      341880 non-null  float64            
 8   max_temp      341880 non-null  float64            
 9   prssr         341880 non-null  float64            
 10  sea_level     192964 non-null  object             
 11  grnd_level    192919 non-null  object             
 12  hum           341880 non-null  float64            
 13  wind_spd      339654 non-null  float64      

In [4]:
# Inspect column names, non-null counts and dtypes of the test split.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49368 entries, 0 to 49367
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   datetime      49368 non-null  int64              
 1   datetime_iso  49368 non-null  datetime64[ns, UTC]
 2   time-zone     49368 non-null  int64              
 3   temp          49368 non-null  float64            
 4   visibility    7533 non-null   object             
 5   d_point       49367 non-null  float64            
 6   feels         49368 non-null  float64            
 7   min_temp      49368 non-null  float64            
 8   max_temp      49368 non-null  float64            
 9   prssr         49368 non-null  float64            
 10  sea_level     27694 non-null  object             
 11  grnd_level    27866 non-null  object             
 12  hum           49368 non-null  float64            
 13  wind_spd      49034 non-null  float64            
 14  wind_d

In [5]:
# Stack the training rows on top of the test rows so that every feature below
# is computed once for both splits; the 'source' column tells them apart later.
# Row labels are kept as-is (no ignore_index), so they repeat across splits.
# (To experiment on the training split alone, use `df = df_train` instead.)
df = pd.concat(objs=[df_train, df_test], axis='index')

# Feature Extraction: Time Series

In [6]:
# Keep only datetime_iso and its label (currently disabled).

#df_time_series = df[['datetime_iso', 'rain_1h']]

In [7]:
#df_time_series.info()

## Create Feature: Day

In [8]:
# Extract the day of the week (0-6; the 1979-01-01 rows come out as 0) into a
# new column.
# NOTE(review): 'datetime_iso' is parsed as UTC while the 'time-zone' column
# shows a 28800 s offset — confirm whether local time should be used instead.
weekday_of_row = df['datetime_iso'].dt.weekday
df['day_of_the_week'] = weekday_of_row

In [9]:
# Row count per weekday value, to check that the days are evenly represented.
df['day_of_the_week'].value_counts()

0    55896
1    55896
2    55896
3    55896
4    55896
5    55896
6    55872
Name: day_of_the_week, dtype: int64

## Create Feature: Month

In [10]:
# Calendar month (1-12) of each timestamp, stored as a new column.
month_of_row = df['datetime_iso'].dt.month
df['month'] = month_of_row

In [11]:
# Row count per calendar month.
df['month'].value_counts()

1     33480
3     33480
5     33480
7     33480
8     33192
10    32736
12    32736
4     32400
6     32400
9     31680
11    31680
2     30504
Name: month, dtype: int64

## Create Feature: Season

Pembagian musim:
- Hujan: Oktober - Maret
- Kemarau: April - September

In [12]:
# Binary season flag: 1 = rainy season (October-March),
# 0 = dry season (April-September).
rainy_months = (10, 11, 12, 1, 2, 3)
ordinal_map = {
    month_no: int(month_no in rainy_months)
    for month_no in range(1, 13)
}

# Translate every row's month number into its season flag.
df['season'] = df['month'].map(ordinal_map)

In [13]:
# Preview the first five rows, including the new day_of_the_week, month and
# season columns.
df.head(n=5)

Unnamed: 0,datetime,datetime_iso,time-zone,temp,visibility,d_point,feels,min_temp,max_temp,prssr,...,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds,source,day_of_the_week,month,season
0,283996800,1979-01-01 00:00:00+00:00,28800,24.75,,23.89,25.76,24.28,25.22,1012.0,...,320.0,0.0,0,,,100.0,df_train,0,1,1
1,284000400,1979-01-01 01:00:00+00:00,28800,24.58,,23.73,25.57,23.99,25.26,1012.0,...,338.0,0.0,0,0.0,0.0,100.0,df_train,0,1,1
2,284004000,1979-01-01 02:00:00+00:00,28800,26.6,unidentified,24.06,26.6,26.1,27.39,1012.0,...,339.0,0.0,volume:zero,,,99.0,df_train,0,1,1
3,284007600,1979-01-01 03:00:00+00:00,28800,27.31,,24.37,30.9,26.59,28.36,1012.0,...,342.0,0.13,nol,0.0,,94.0,df_train,0,1,1
4,284011200,1979-01-01 04:00:00+00:00,28800,27.41,,25.05,31.54,26.58,28.31,1011.0,...,336.0,0.34,nol,,0.0,100.0,df_train,0,1,1


### Create Feature: yearly_mean

We create the 'yearly_mean' feature by grouping the DataFrame by the year of the 'datetime_iso' column and then using the `transform` function to calculate the mean of `rain_1h` for each group. This assigns the mean value of each year to all rows within that year.

In [14]:
# Broadcast each calendar year's mean rain_1h onto every row of that year.
# NOTE(review): this feature is derived from the target (rain_1h) itself, so it
# can leak label information into the model — confirm this is intended.
year_key = df['datetime_iso'].dt.year
df['yearly_mean'] = df.groupby(year_key)['rain_1h'].transform('mean')

### Create Feature: monthly_mean
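We create the 'monthly_mean' feature by grouping the DataFrame by the month of the 'datetime_iso' column and calculating the mean of `rain_1h` for each month. Only the month number (1–12) is used as the key, so the same month from different years shares one mean.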

In [15]:
# Broadcast the mean rain_1h of each month number (1-12, pooled over all years)
# onto every row falling in that month.
# NOTE(review): derived from the target (rain_1h) — possible label leakage.
month_key = df['datetime_iso'].dt.month
df['monthly_mean'] = df.groupby(month_key)['rain_1h'].transform('mean')

### Create Feature: weekly_mean

In [16]:
# Broadcast the mean rain_1h of each (year, month, ISO week) group onto its rows.
# NOTE(review): the ISO week number is combined with the calendar year and
# month, so a week that straddles a month boundary is split into two groups —
# confirm this is intended.
# NOTE(review): derived from the target (rain_1h) — possible label leakage.
timestamps = df['datetime_iso']
week_keys = [
    timestamps.dt.year,
    timestamps.dt.month,
    timestamps.dt.isocalendar().week,
]
df['weekly_mean'] = df.groupby(week_keys)['rain_1h'].transform('mean')

In [17]:
# Preview the first five rows, including the new yearly_mean, monthly_mean and
# weekly_mean columns.
df.head(n=5)


Unnamed: 0,datetime,datetime_iso,time-zone,temp,visibility,d_point,feels,min_temp,max_temp,prssr,...,snow_1h,snow_3h,clouds,source,day_of_the_week,month,season,yearly_mean,monthly_mean,weekly_mean
0,283996800,1979-01-01 00:00:00+00:00,28800,24.75,,23.89,25.76,24.28,25.22,1012.0,...,,,100.0,df_train,0,1,1,0.373315,0.442561,0.518929
1,284000400,1979-01-01 01:00:00+00:00,28800,24.58,,23.73,25.57,23.99,25.26,1012.0,...,0.0,0.0,100.0,df_train,0,1,1,0.373315,0.442561,0.518929
2,284004000,1979-01-01 02:00:00+00:00,28800,26.6,unidentified,24.06,26.6,26.1,27.39,1012.0,...,,,99.0,df_train,0,1,1,0.373315,0.442561,0.518929
3,284007600,1979-01-01 03:00:00+00:00,28800,27.31,,24.37,30.9,26.59,28.36,1012.0,...,0.0,,94.0,df_train,0,1,1,0.373315,0.442561,0.518929
4,284011200,1979-01-01 04:00:00+00:00,28800,27.41,,25.05,31.54,26.58,28.31,1011.0,...,,0.0,100.0,df_train,0,1,1,0.373315,0.442561,0.518929


## Kita uji dengan pycaret: linear regression, random forest, dan CatBoost

In [18]:
# Remove the raw timestamp columns and the weather columns that are not used
# as model inputs in the experiments below.
unused_columns = [
    'datetime',
    'datetime_iso',
    'time-zone',
    'visibility',
    'sea_level',
    'grnd_level',
    'rain_3h',
    'snow_1h',
    'snow_3h',
]
df = df.drop(columns=unused_columns)

In [19]:
# Undo the earlier concat: recover each split through its 'source' tag, then
# drop the helper 'source' column since it is no longer needed.
df_train = df.loc[df['source'] == 'df_train'].drop(columns='source')
df_test = df.loc[df['source'] == 'df_test'].drop(columns='source')

In [20]:
# Check which columns remain in the training split after feature extraction
# and the column drop.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 341880 entries, 0 to 341879
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   temp             341880 non-null  float64
 1   d_point          341880 non-null  float64
 2   feels            341880 non-null  float64
 3   min_temp         341880 non-null  float64
 4   max_temp         341880 non-null  float64
 5   prssr            341880 non-null  float64
 6   hum              341880 non-null  float64
 7   wind_spd         339654 non-null  float64
 8   wind_deg         341880 non-null  float64
 9   rain_1h          341880 non-null  float64
 10  clouds           341880 non-null  float64
 11  day_of_the_week  341880 non-null  int64  
 12  month            341880 non-null  int64  
 13  season           341880 non-null  int64  
 14  yearly_mean      341880 non-null  float64
 15  monthly_mean     341880 non-null  float64
 16  weekly_mean      341880 non-null  floa

In [21]:
from pycaret.regression import *

# Initialise the PyCaret regression experiment on the training frame that
# contains all six time features; session_id fixes the experiment's seed.
# NOTE(review): yearly_mean / monthly_mean / weekly_mean are computed from
# rain_1h, the target used here — scores may be optimistic because of leakage.
s = setup(data=df_train, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 17)"
4,Transformed data shape,"(341880, 17)"
5,Transformed train set shape,"(239315, 17)"
6,Transformed test set shape,"(102565, 17)"
7,Numeric features,16
8,Rows with missing values,0.7%
9,Preprocess,True


In [22]:
# Cross-validated linear regression on the frame with all six time features.
lr = create_model(estimator='lr')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5013,0.9481,0.9737,0.0578,0.3971,1.059
1,0.4786,0.7873,0.8873,0.0612,0.3842,1.049
2,0.4907,0.8568,0.9256,0.0513,0.3928,1.0516
3,0.4915,0.8714,0.9335,0.0537,0.393,1.0163
4,0.4881,0.8398,0.9164,0.055,0.3909,1.0422
5,0.4885,0.8204,0.9058,0.0524,0.3921,1.0497
6,0.4778,0.8009,0.8949,0.058,0.3849,1.0524
7,0.487,0.8326,0.9124,0.0559,0.3903,1.0601
8,0.4873,0.8284,0.9102,0.0491,0.3911,1.0598
9,0.4921,0.8223,0.9068,0.0534,0.3934,1.044


In [23]:
# Cross-validated random forest on the frame with all six time features.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no
# random-forest scores were recorded.
rf = create_model(estimator='rf')


KeyboardInterrupt



In [24]:
# Cross-validated CatBoost on the frame with all six time features.
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4492,0.7887,0.8881,0.2162,0.3648,1.2578
1,0.4261,0.662,0.8136,0.2107,0.3527,1.255
2,0.4383,0.7388,0.8595,0.182,0.3598,1.2188
3,0.442,0.7443,0.8627,0.1917,0.3617,1.2035
4,0.4361,0.7169,0.8467,0.1933,0.3579,1.2179
5,0.439,0.7146,0.8454,0.1747,0.3622,1.2414
6,0.428,0.6739,0.8209,0.2073,0.355,1.255
7,0.4366,0.7131,0.8445,0.1914,0.3589,1.2568
8,0.4358,0.7047,0.8395,0.191,0.3585,1.2534
9,0.4405,0.7127,0.8442,0.1795,0.3615,1.2056


## Kita coba pakai satu per satu

In [None]:
# ['day_of_the_week', 'month','season','yearly_mean','monthly_mean','weekly_mean']

### Day_of_The_week

In [26]:
# Keep day_of_the_week as the only time feature; drop the other five.
df_train_dotw = df_train.drop(
    columns=['month', 'season', 'yearly_mean', 'monthly_mean', 'weekly_mean']
)

In [27]:
# Re-initialise the PyCaret experiment on the day_of_the_week-only frame,
# with the same session_id as the other runs.
s_dotw = setup(data=df_train_dotw, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 12)"
4,Transformed data shape,"(341880, 12)"
5,Transformed train set shape,"(239315, 12)"
6,Transformed test set shape,"(102565, 12)"
7,Numeric features,11
8,Rows with missing values,0.7%
9,Preprocess,True


In [28]:
# Cross-validated CatBoost with day_of_the_week as the only time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.464,0.8665,0.9308,0.1389,0.3752,1.2182
1,0.4406,0.7222,0.8498,0.1389,0.364,1.2126
2,0.4533,0.7747,0.8802,0.1422,0.3698,1.2109
3,0.4542,0.7899,0.8887,0.1422,0.3707,1.1747
4,0.4508,0.7596,0.8716,0.1452,0.3686,1.1867
5,0.4521,0.7417,0.8612,0.1434,0.37,1.2231
6,0.4423,0.7345,0.857,0.136,0.3656,1.2282
7,0.4501,0.7452,0.8632,0.155,0.3676,1.2282
8,0.4491,0.7461,0.8638,0.1436,0.3679,1.2208
9,0.4541,0.7442,0.8627,0.1432,0.3712,1.1871


### Month

In [29]:
# Keep month as the only time feature; drop the other five.
df_train_month = df_train.drop(
    columns=['day_of_the_week', 'season', 'yearly_mean', 'monthly_mean', 'weekly_mean']
)

In [30]:
# Re-initialise the PyCaret experiment on the month-only frame,
# with the same session_id as the other runs.
s_month = setup(data=df_train_month, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 12)"
4,Transformed data shape,"(341880, 12)"
5,Transformed train set shape,"(239315, 12)"
6,Transformed test set shape,"(102565, 12)"
7,Numeric features,11
8,Rows with missing values,0.7%
9,Preprocess,True


In [31]:
# Cross-validated CatBoost with month as the only time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4614,0.858,0.9263,0.1473,0.3732,1.225
1,0.4381,0.716,0.8462,0.1463,0.3624,1.2086
2,0.4512,0.7694,0.8771,0.1481,0.3685,1.2066
3,0.4527,0.7845,0.8857,0.148,0.3701,1.1768
4,0.4485,0.7517,0.867,0.1542,0.3668,1.1846
5,0.4485,0.7323,0.8557,0.1543,0.3676,1.2131
6,0.4402,0.7279,0.8532,0.1438,0.3637,1.2301
7,0.4473,0.7383,0.8593,0.1628,0.3658,1.2306
8,0.4455,0.7364,0.8582,0.1546,0.3656,1.2162
9,0.4519,0.736,0.8579,0.1528,0.3694,1.187


### Season

In [32]:
# Keep season as the only time feature; drop the other five.
df_train_season = df_train.drop(
    columns=['day_of_the_week', 'month', 'yearly_mean', 'monthly_mean', 'weekly_mean']
)

In [35]:
# Re-initialise the PyCaret experiment on the season-only frame,
# with the same session_id as the other runs.
s_season = setup(data=df_train_season, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 12)"
4,Transformed data shape,"(341880, 12)"
5,Transformed train set shape,"(239315, 12)"
6,Transformed test set shape,"(102565, 12)"
7,Numeric features,11
8,Rows with missing values,0.7%
9,Preprocess,True


In [36]:
# Cross-validated CatBoost with season as the only time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4636,0.8649,0.93,0.1405,0.3751,1.2204
1,0.4399,0.7194,0.8482,0.1422,0.3637,1.2084
2,0.4525,0.7725,0.8789,0.1447,0.3693,1.2045
3,0.4536,0.7872,0.8872,0.1451,0.3702,1.1745
4,0.4508,0.7588,0.8711,0.1461,0.3687,1.1867
5,0.4507,0.7383,0.8592,0.1473,0.3692,1.2127
6,0.4421,0.733,0.8561,0.1379,0.3652,1.2278
7,0.4493,0.7429,0.8619,0.1576,0.3672,1.2231
8,0.4475,0.7425,0.8617,0.1477,0.367,1.2121
9,0.4528,0.7402,0.8604,0.1478,0.3701,1.1802


### yearly_mean

In [37]:
# Keep yearly_mean as the only time feature; drop the other five.
df_train_yearly_mean = df_train.drop(
    columns=['day_of_the_week', 'month', 'season', 'monthly_mean', 'weekly_mean']
)

In [38]:
# Re-initialise the PyCaret experiment on the yearly_mean-only frame,
# with the same session_id as the other runs.
s_yearly_mean = setup(data=df_train_yearly_mean, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 12)"
4,Transformed data shape,"(341880, 12)"
5,Transformed train set shape,"(239315, 12)"
6,Transformed test set shape,"(102565, 12)"
7,Numeric features,11
8,Rows with missing values,0.7%
9,Preprocess,True


In [39]:
# Cross-validated CatBoost with yearly_mean as the only time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4626,0.8615,0.9282,0.1438,0.374,1.2198
1,0.4392,0.7155,0.8459,0.1469,0.3627,1.2166
2,0.4524,0.7726,0.879,0.1446,0.3688,1.2181
3,0.4533,0.7882,0.8878,0.144,0.37,1.1752
4,0.4503,0.7582,0.8708,0.1468,0.3681,1.1937
5,0.4516,0.7405,0.8605,0.1448,0.3696,1.2238
6,0.4418,0.7338,0.8566,0.1369,0.3648,1.2282
7,0.4488,0.7431,0.862,0.1574,0.3667,1.2285
8,0.4484,0.7428,0.8619,0.1473,0.3671,1.2244
9,0.4538,0.7427,0.8618,0.145,0.3707,1.1866


### monthly_mean

In [40]:
# Keep monthly_mean as the only time feature; drop the other five.
df_train_monthly_mean = df_train.drop(
    columns=['day_of_the_week', 'month', 'season', 'yearly_mean', 'weekly_mean']
)

In [41]:
# Re-initialise the PyCaret experiment on the monthly_mean-only frame,
# with the same session_id as the other runs.
s_monthly_mean = setup(data=df_train_monthly_mean, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 12)"
4,Transformed data shape,"(341880, 12)"
5,Transformed train set shape,"(239315, 12)"
6,Transformed test set shape,"(102565, 12)"
7,Numeric features,11
8,Rows with missing values,0.7%
9,Preprocess,True


In [42]:
# Cross-validated CatBoost with monthly_mean as the only time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4614,0.8546,0.9244,0.1507,0.3731,1.2259
1,0.437,0.7142,0.8451,0.1484,0.3618,1.208
2,0.4516,0.769,0.8769,0.1486,0.3685,1.2055
3,0.4525,0.7847,0.8858,0.1478,0.3699,1.1768
4,0.4483,0.7527,0.8676,0.153,0.3669,1.1877
5,0.4488,0.7327,0.856,0.1538,0.3679,1.2116
6,0.4404,0.7279,0.8532,0.1438,0.3638,1.2233
7,0.4472,0.7382,0.8592,0.1629,0.3661,1.2258
8,0.447,0.7388,0.8595,0.1519,0.3664,1.2245
9,0.4518,0.7373,0.8587,0.1512,0.3694,1.1847


### weekly_mean

In [43]:
# Keep weekly_mean as the only time feature; drop the other five.
df_train_weekly_mean = df_train.drop(
    columns=['day_of_the_week', 'month', 'season', 'yearly_mean', 'monthly_mean']
)

In [44]:
# Re-initialise the PyCaret experiment on the weekly_mean-only frame,
# with the same session_id as the other runs.
s_weekly_mean = setup(data=df_train_weekly_mean, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 12)"
4,Transformed data shape,"(341880, 12)"
5,Transformed train set shape,"(239315, 12)"
6,Transformed test set shape,"(102565, 12)"
7,Numeric features,11
8,Rows with missing values,0.7%
9,Preprocess,True


In [45]:
# Cross-validated CatBoost with weekly_mean as the only time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4523,0.8249,0.9082,0.1802,0.3674,1.2666
1,0.4292,0.6809,0.8252,0.1881,0.3558,1.2528
2,0.4421,0.7522,0.8673,0.1671,0.363,1.2226
3,0.4428,0.7577,0.8705,0.1771,0.3629,1.1909
4,0.4391,0.7279,0.8532,0.1809,0.3607,1.2176
5,0.4413,0.7237,0.8507,0.1642,0.3639,1.246
6,0.432,0.7076,0.8412,0.1677,0.3582,1.2574
7,0.4369,0.7153,0.8457,0.1889,0.3592,1.2535
8,0.4387,0.7188,0.8478,0.1749,0.3612,1.2555
9,0.4443,0.7221,0.8497,0.1688,0.3645,1.2092


### Threshold (baseline tanpa feature waktu)

In [46]:
# Baseline frame: drop all six time features so the scores above have a
# reference point.
df_train_treshold = df_train.drop(
    columns=[
        'day_of_the_week',
        'month',
        'season',
        'yearly_mean',
        'monthly_mean',
        'weekly_mean',
    ]
)

In [47]:
# Re-initialise the PyCaret experiment on the baseline frame (no time
# features), with the same session_id as the other runs.
s_tres = setup(data=df_train_treshold, target='rain_1h', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,rain_1h
2,Target type,Regression
3,Original data shape,"(341880, 11)"
4,Transformed data shape,"(341880, 11)"
5,Transformed train set shape,"(239315, 11)"
6,Transformed test set shape,"(102565, 11)"
7,Numeric features,10
8,Rows with missing values,0.7%
9,Preprocess,True


In [48]:
# Cross-validated CatBoost on the baseline frame without any time feature
# (re-binds the `catboost` variable).
catboost = create_model(estimator='catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4632,0.8669,0.9311,0.1385,0.3751,1.2189
1,0.4404,0.7231,0.8504,0.1378,0.3639,1.2139
2,0.4541,0.7771,0.8815,0.1395,0.3704,1.2138
3,0.4536,0.7891,0.8883,0.143,0.3704,1.1712
4,0.4514,0.7611,0.8724,0.1436,0.3689,1.1844
5,0.4516,0.7399,0.8602,0.1455,0.3695,1.2234
6,0.4421,0.735,0.8573,0.1355,0.3654,1.2291
7,0.4499,0.7454,0.8634,0.1547,0.3678,1.2266
8,0.4493,0.7459,0.8636,0.1438,0.368,1.2227
9,0.4543,0.7438,0.8624,0.1438,0.3711,1.1879
