# Notebook 2: Feature Engineering

- Extract 'day of the year' and 'time of the day' from the 'Date Time' text string.
- To incorporate the time periodicity knowledge into the model, both features go through simple trigonometric functions to become 4 features: sin(day_of_year), cos(day_of_year), sin(time_of_day), cos(time_of_day).


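- Wind direction has the same problem: it is cyclical (0° and 360° are the same direction). The direction is combined with each of the two wind speeds (mean and maximum) using simple trigonometry, so each speed becomes a 2-component vector (sine and cosine), giving 4 new features.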



In [None]:
# Display Colab GPU status
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Jan 22 15:38:58 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              42W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Import the required libraries
import tensorflow
import numpy as np
import pandas as pd
from time import time
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers

## Load the clean dataset produced by the previous notebook from its CSV file.

In [None]:
# Read the cleaned dataset (no engineered features yet) into a dataframe.
# The first CSV column is used as the dataframe index.
raw_data_path = '/content/drive/MyDrive/data/mpi_roof_global_clean.csv'
raw_data = pd.read_csv(filepath_or_buffer=raw_data_path, index_col=0)

# Report the dimensions, then display the frame itself.
print(raw_data.shape)
raw_data

(806391, 16)


Unnamed: 0,Date Time,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg),CO2 (ppm)
0,27.02.2008 16:00:00,988.77,7.98,282.05,-0.82,53.62,10.72,5.75,4.97,3.63,5.82,1222.53,4.02,9.25,244.8,387.5
1,27.02.2008 16:10:00,988.79,7.81,281.88,-0.82,54.26,10.60,5.75,4.85,3.63,5.82,1223.29,4.01,6.25,252.0,391.5
2,27.02.2008 16:20:00,988.81,7.75,281.82,-0.76,54.72,10.56,5.78,4.78,3.64,5.84,1223.57,3.08,5.50,246.5,391.4
3,27.02.2008 16:30:00,988.88,7.70,281.76,-0.86,54.53,10.52,5.74,4.78,3.62,5.80,1223.89,3.10,7.63,242.1,391.8
4,27.02.2008 16:40:00,989.01,7.60,281.65,-0.81,55.09,10.45,5.76,4.69,3.63,5.82,1224.48,3.53,6.50,250.3,392.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806386,30.06.2023 23:20:00,985.13,15.51,289.90,15.15,97.70,17.65,17.24,0.41,10.96,17.50,1181.01,0.88,1.35,233.5,435.1
806387,30.06.2023 23:30:00,985.11,15.40,289.79,15.15,98.40,17.52,17.24,0.28,10.96,17.50,1181.43,0.93,1.14,223.6,436.9
806388,30.06.2023 23:40:00,985.07,15.27,289.66,15.25,99.90,17.38,17.36,0.02,11.04,17.62,1181.86,1.02,1.49,236.3,439.6
806389,30.06.2023 23:50:00,985.02,15.32,289.72,15.32,100.00,17.43,17.43,0.00,11.08,17.70,1181.56,1.04,1.33,233.2,442.0


In [None]:
# List the column labels of the loaded dataset.
raw_data.columns

Index(['Date Time', 'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
       'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'wv (m/s)', 'max. wv (m/s)',
       'wd (deg)', 'CO2 (ppm)'],
      dtype='object')

In [None]:
# Dimensions first, then summary statistics with one row per numeric column.
print(raw_data.shape)
raw_data.describe().transpose()

(806391, 16)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p (mbar),806391.0,989.479986,8.637193,913.6,984.44,989.87,995.1,1020.07
T (degC),806391.0,9.887356,8.265368,-23.01,3.75,9.65,15.82,37.95
Tpot (K),806391.0,283.907777,8.355773,250.6,277.77,283.72,289.86,312.21
Tdew (degC),806391.0,5.207493,6.506166,-25.01,0.55,5.46,10.17,23.11
rh (%),806391.0,75.401968,17.321303,12.95,63.83,78.6,89.5,100.0
VPmax (mbar),806391.0,13.917221,7.857134,0.95,7.99,12.01,18.0,66.13
VPact (mbar),806391.0,9.648586,4.129602,0.79,6.36,9.01,12.44,28.32
VPdef (mbar),806391.0,4.26856,5.19693,0.0,0.9,2.28,5.62,56.72
sh (g/kg),806391.0,6.09384,2.621958,0.5,4.01,5.68,7.86,18.13
H2OC (mmol/mol),806391.0,9.754391,4.180566,0.8,6.43,9.11,12.57,28.82


# Compute the Training, Validation and Test split sizes (60%/20%/20%)

In [None]:
# Row counts for a 60% / 20% / 20% split.
# The test set takes whatever is left after the two truncated counts.
total_samples = len(raw_data)
num_train_samples = int(0.6 * total_samples)
num_val_samples = int(0.20 * total_samples)
num_test_samples = total_samples - (num_train_samples + num_val_samples)
print(f"num_train_samples: {num_train_samples} num_val_samples: {num_val_samples} num_test_samples: {num_test_samples}")

num_train_samples: 483834 num_val_samples: 161278 num_test_samples: 161279


# 6.1 Add columns: Feature engineering: Day-of-the-year and Time-of-day as Sin/Cos

In [None]:
# Encode the two time cycles (position in the year, position in the day) as
# sin/cos pairs so that the end of a cycle sits next to its start.
# NOTE: this cell reads the 'Date Time' column, so it must run before the cell
# that drops that column.

# Parse the 'Date Time' strings (day.month.year hour:minute:second) into timestamps.
date_time = pd.to_datetime(raw_data['Date Time'], format='%d.%m.%Y %H:%M:%S')

# Position inside each cycle: 1-based day of the year and seconds since midnight.
day_of_year = date_time.dt.dayofyear
time_of_day = date_time.dt.hour * 3600 + date_time.dt.minute * 60 + date_time.dt.second

# Length of the year each row belongs to: 366 in leap years, 365 otherwise.
# Dividing by a fixed 365 would push day 366 past one full turn, giving
# 31 December of a leap year exactly the same encoding as 1 January.
days_in_year = 365 + date_time.dt.is_leap_year.astype(int)

# Angles in radians; one full turn (2*pi) per year and per day (86400 seconds).
day_angle = day_of_year / days_in_year * 2 * np.pi
time_angle = time_of_day / 86400 * 2 * np.pi

# Insert the four new cyclical features into the dataframe.
raw_data['day sin']   = np.sin(day_angle)
raw_data['day cos']   = np.cos(day_angle)
raw_data['time sin']  = np.sin(time_angle)
raw_data['time cos']  = np.cos(time_angle)

# 6.2 Wind direction as sin/cos

In [None]:
# Turn wind direction (degrees) plus a wind speed into a 2-component vector.
# The direction is converted to radians once and reused for both speed columns.
wind_direction_rad = raw_data['wd (deg)'] / 180 * np.pi

# Each speed column yields a '<prefix>_sin' and a '<prefix>_cos' feature.
for prefix, speed_column in (('wind', 'wv (m/s)'), ('max_wind', 'max. wv (m/s)')):
  raw_data[f'{prefix}_sin'] = raw_data[speed_column] * np.sin(wind_direction_rad)
  raw_data[f'{prefix}_cos'] = raw_data[speed_column] * np.cos(wind_direction_rad)

## 6.3 Remove 4 unused columns: drop the 'Date Time' column and the 3 wind columns.


In [None]:
# Remove the raw timestamp and the three raw wind columns from the dataframe.
# Only columns that are still present are dropped, so re-running the cell is harmless.
print(raw_data.shape)
obsolete_columns = ['Date Time', 'wv (m/s)', 'max. wv (m/s)', 'wd (deg)']
raw_data.drop(columns=[name for name in obsolete_columns if name in raw_data],
              inplace=True)

# Shape after the drop, followed by the remaining column labels.
print(raw_data.shape)
raw_data.keys()

(806391, 24)
(806391, 20)


Index(['p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)', 'rh (%)',
       'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'CO2 (ppm)', 'day sin', 'day cos',
       'time sin', 'time cos', 'wind_sin', 'wind_cos', 'max_wind_sin',
       'max_wind_cos'],
      dtype='object')

# Save the clean dataset with the engineered features

In [None]:
# Destination CSV for the feature-engineered dataset.
# Note: this reassigns raw_data_path, which earlier held the path of the input CSV.
raw_data_path = '/content/drive/MyDrive/data/mpi_roof_global_clean_features.csv'

# Writing is disabled by default. Uncomment the line below to save raw_data
# (including its index) to raw_data_path.
# raw_data.to_csv(raw_data_path)