## 0. Import Relevant Libraries

In [None]:
import importlib
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd

## 1. Import Base Classes and Helper Functions from the Project GitHub and Set the Random Seed

In [None]:
# Import the Base Classes and Helper Functions from the Project GitHub
# NOTE(review): this downloads and then executes code from the mutable `main` branch;
# pin the URL to a commit hash if reproducibility or supply-chain integrity matters.
url = "https://raw.githubusercontent.com/VinceMoran/EA_Water_Quality_Time_Series_Prediction/main/base_classes_and_helper_functions.py"

file_path = Path("base_classes_and_helper_functions.py")
if not file_path.exists():
    # Download under a temporary name and rename only after the transfer succeeds, so an
    # interrupted download cannot leave a truncated module that the exists() guard above
    # would keep reusing on every later run.
    partial_path = file_path.with_name(file_path.name + ".part")
    urllib.request.urlretrieve(url, partial_path)
    partial_path.replace(file_path)
    # The module file was created after the interpreter started: clear the import system's
    # cached directory listings so the import below is guaranteed to find it.
    importlib.invalidate_caches()
import base_classes_and_helper_functions as bchf

In [None]:
# Set the random seed for all PRNGs to ensure reproducibility
# NOTE(review): called with no arguments, so the seed value and the set of libraries that get
# seeded are whatever the project helper defaults to — confirm in base_classes_and_helper_functions.py
bchf.set_random_seed()

## 2. Load the Data from the Project GitHub into a Pandas DataFrame

In [None]:
# Location of the zipped, preprocessed Thames (Hammersmith) dataset in the project GitHub
data_url = "https://github.com/VinceMoran/EA_Water_Quality_Time_Series_Prediction/raw/main/data/processed_data/river_thames/THAMES_HAMMERSMITH_E_200707/df_parameters_preprocessed.zip"

# Download and extract the archive via the project helper; its return value is used below
# as the directory that holds the extracted files
data_path = bchf.load_raw_data(url=data_url)

# Build the full filepath of the preprocessed CSV under the Colab working directory
parameter_path = Path("/content") / data_path / "df_parameters_preprocessed.csv"

[INFO] data/raw_data directory has been created.
[INFO] Downloading data...
[INFO] Data extracted to data/raw_data.
[INFO] ZIP file removed.


In [None]:
# Load the data into a Pandas DataFrame
# (read with default options, so no date parsing happens here; dateTime is converted
# to datetime64 in a later cell)
df_parameters = pd.read_csv(parameter_path)

# Inspect the first 5 rows of the DataFrame
df_parameters.head()

Unnamed: 0,dateTime,ammonium,conductivity,oxygen_conc,oxygen_perc,temperature,turbidity,ammonium_impossible,conductivity_impossible,oxygen_conc_impossible,oxygen_perc_impossible,temperature_impossible,turbidity_impossible,ammonium_anomaly,conductivity_anomaly,oxygen_perc_anomaly,turbidity_anomaly,oxygen_conc_anomaly,temperature_anomaly
0,2009-05-18 16:19:00,0.494,658.333333,9.9,99.8,15.7,9.6,False,False,False,False,False,False,False,True,False,True,False,False
1,2009-05-18 16:34:00,0.476,659.0,9.96,100.5,15.7,11.8,False,False,False,False,False,False,False,False,False,False,False,False
2,2009-05-18 16:49:00,0.471,659.0,10.03,101.1,15.69,11.7,False,False,False,False,False,False,False,False,False,False,False,False
3,2009-05-18 17:04:00,0.46,658.0,10.08,101.7,15.7,11.5,False,False,False,False,False,False,False,False,False,False,False,False
4,2009-05-18 17:19:00,0.456,658.0,10.13,102.2,15.7,11.0,False,False,False,False,False,False,False,False,False,False,False,False


## 3. Investigate and Reindex the DataFrame

In [None]:
# Investigate the unique value distributions of the DataFrame
# (project helper; the second argument is the label shown in the heading of the printed report)
bchf.unique_value_counts(df_parameters, "WATER QUALITY PARAMETER DATAFRAME")

DISTINCT VALUE COUNTS FOR WATER QUALITY PARAMETER DATAFRAME:
Variable         Number of Distinct Value Counts
dateTime                  569374          

ammonium                   21236          

conductivity               78756          

oxygen_conc                26789          

oxygen_perc                48625          

temperature                2385           

turbidity                 225589          

ammonium_impossible             2            

conductivity_impossible             2            

oxygen_conc_impossible             2            

oxygen_perc_impossible             2            

temperature_impossible             1            

turbidity_impossible             2            

ammonium_anomaly             2            

conductivity_anomaly             2            

oxygen_perc_anomaly             2            

turbidity_anomaly             2            

oxygen_conc_anomaly             1            

temperature_anomaly             1            



In [None]:
# Investigate the datatypes and numbers of missing values for each variable
# (project helper; the second argument is the label shown in the heading of the printed report)
bchf.datatypes_and_missing_values(df_parameters, "WATER QUALITY PARAMETER DATAFRAME")

DATATYPES AND MISSING VALUES FOR WATER QUALITY PARAMETER DATAFRAME
Variable           Data Type      Number of Missing Values
dateTime          , object        ,            0            
ammonium          , float64       ,            0            
conductivity      , float64       ,            0            
oxygen_conc       , float64       ,            0            
oxygen_perc       , float64       ,            0            
temperature       , float64       ,            0            
turbidity         , float64       ,            0            
ammonium_impossible, bool          ,            0            
conductivity_impossible, bool          ,            0            
oxygen_conc_impossible, bool          ,            0            
oxygen_perc_impossible, bool          ,            0            
temperature_impossible, bool          ,            0            
turbidity_impossible, bool          ,            0            
ammonium_anomaly  , bool          ,            0            


In [None]:
# Change the data type for the dateTime column to datetime64 and make it the index.
# The timestamps are ISO-formatted, year first (e.g. "2009-05-18 16:19:00"), so the format is
# stated explicitly. `dayfirst=True` does not apply to year-first strings and only makes
# pandas emit a UserWarning about the mismatch between the hint and the parsed format.
# The frame is rebound rather than modified with inplace=True, so a failed run of this cell
# leaves df_parameters untouched.
df_parameters = df_parameters.assign(
    dateTime=pd.to_datetime(df_parameters["dateTime"], format="%Y-%m-%d %H:%M:%S")
).set_index("dateTime")

# Investigate the first 5 rows of the reindexed DataFrame
df_parameters.head()

  df_parameters["dateTime"] = pd.to_datetime(df_parameters["dateTime"], dayfirst=True)


Unnamed: 0_level_0,ammonium,conductivity,oxygen_conc,oxygen_perc,temperature,turbidity,ammonium_impossible,conductivity_impossible,oxygen_conc_impossible,oxygen_perc_impossible,temperature_impossible,turbidity_impossible,ammonium_anomaly,conductivity_anomaly,oxygen_perc_anomaly,turbidity_anomaly,oxygen_conc_anomaly,temperature_anomaly
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2009-05-18 16:19:00,0.494,658.333333,9.9,99.8,15.7,9.6,False,False,False,False,False,False,False,True,False,True,False,False
2009-05-18 16:34:00,0.476,659.0,9.96,100.5,15.7,11.8,False,False,False,False,False,False,False,False,False,False,False,False
2009-05-18 16:49:00,0.471,659.0,10.03,101.1,15.69,11.7,False,False,False,False,False,False,False,False,False,False,False,False
2009-05-18 17:04:00,0.46,658.0,10.08,101.7,15.7,11.5,False,False,False,False,False,False,False,False,False,False,False,False
2009-05-18 17:19:00,0.456,658.0,10.13,102.2,15.7,11.0,False,False,False,False,False,False,False,False,False,False,False,False


## 4. Engineer Additional Temporal Features for Time Series Prediction

In [None]:
# Encode the position within the year and within the day as sin/cos pairs, so that moments
# on either side of a wrap-around (year end, midnight) receive nearby feature values.
# The raw positions are kept as plain arrays rather than temporary DataFrame columns.
day_of_year = df_parameters.index.dayofyear.to_numpy()
minute_of_day = (df_parameters.index.hour*60 + df_parameters.index.minute).to_numpy()

# Angle around the yearly cycle (period of 365 days; day 366 of a leap year therefore
# lands on the same angle as day 1) and around the daily cycle (1440 minutes)
year_angle = 2*np.pi*(day_of_year/365)
day_angle = 2*np.pi*(minute_of_day/1440)

df_parameters["day_of_year_sin"] = np.sin(year_angle)
df_parameters["day_of_year_cos"] = np.cos(year_angle)
df_parameters["minute_of_day_sin"] = np.sin(day_angle)
df_parameters["minute_of_day_cos"] = np.cos(day_angle)

# Inspect the first 5 rows of the DataFrame
df_parameters.head()

Unnamed: 0_level_0,ammonium,conductivity,oxygen_conc,oxygen_perc,temperature,turbidity,ammonium_impossible,conductivity_impossible,oxygen_conc_impossible,oxygen_perc_impossible,...,ammonium_anomaly,conductivity_anomaly,oxygen_perc_anomaly,turbidity_anomaly,oxygen_conc_anomaly,temperature_anomaly,day_of_year_sin,day_of_year_cos,minute_of_day_sin,minute_of_day_cos
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-05-18 16:19:00,0.494,658.333333,9.9,99.8,15.7,9.6,False,False,False,False,...,False,True,False,True,False,False,0.693281,-0.720667,-0.904455,-0.426569
2009-05-18 16:34:00,0.476,659.0,9.96,100.5,15.7,11.8,False,False,False,False,...,False,False,False,False,False,False,0.693281,-0.720667,-0.930418,-0.366501
2009-05-18 16:49:00,0.471,659.0,10.03,101.1,15.69,11.7,False,False,False,False,...,False,False,False,False,False,False,0.693281,-0.720667,-0.952396,-0.304864
2009-05-18 17:04:00,0.46,658.0,10.08,101.7,15.7,11.5,False,False,False,False,...,False,False,False,False,False,False,0.693281,-0.720667,-0.970296,-0.241922
2009-05-18 17:19:00,0.456,658.0,10.13,102.2,15.7,11.0,False,False,False,False,...,False,False,False,False,False,False,0.693281,-0.720667,-0.984041,-0.177944


## 5. Create Target Variables for Time Series Prediction

In [None]:
# Variables to predict one step ahead
target_vars = ["ammonium", "oxygen_conc", "temperature"]

# For each target variable, the target at a given row is that variable's value in the next
# row (shift by -1), stored as "<variable>_target_t_1"
next_step_targets = {
    f"{name}_target_t_1": df_parameters[name].shift(periods=-1)
    for name in target_vars
}

# Attach the targets, then remove rows containing NaN — at least the final row, which has
# no following row to take its targets from
df_parameters = df_parameters.assign(**next_step_targets).dropna()

# Inspect the first 5 rows of the DataFrame
df_parameters.head()

Unnamed: 0_level_0,ammonium,conductivity,oxygen_conc,oxygen_perc,temperature,turbidity,ammonium_impossible,conductivity_impossible,oxygen_conc_impossible,oxygen_perc_impossible,...,turbidity_anomaly,oxygen_conc_anomaly,temperature_anomaly,day_of_year_sin,day_of_year_cos,minute_of_day_sin,minute_of_day_cos,ammonium_target_t_1,oxygen_conc_target_t_1,temperature_target_t_1
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-05-18 16:19:00,0.494,658.333333,9.9,99.8,15.7,9.6,False,False,False,False,...,True,False,False,0.693281,-0.720667,-0.904455,-0.426569,0.476,9.96,15.7
2009-05-18 16:34:00,0.476,659.0,9.96,100.5,15.7,11.8,False,False,False,False,...,False,False,False,0.693281,-0.720667,-0.930418,-0.366501,0.471,10.03,15.69
2009-05-18 16:49:00,0.471,659.0,10.03,101.1,15.69,11.7,False,False,False,False,...,False,False,False,0.693281,-0.720667,-0.952396,-0.304864,0.46,10.08,15.7
2009-05-18 17:04:00,0.46,658.0,10.08,101.7,15.7,11.5,False,False,False,False,...,False,False,False,0.693281,-0.720667,-0.970296,-0.241922,0.456,10.13,15.7
2009-05-18 17:19:00,0.456,658.0,10.13,102.2,15.7,11.0,False,False,False,False,...,False,False,False,0.693281,-0.720667,-0.984041,-0.177944,0.446,10.02,15.7


## 6. Save the Base DataFrame to a .csv File for Deep Learning Model Training

In [None]:
# Save a copy of the DataFrame to prevent overwriting issues from later code
df_parameters_deep_learning = df_parameters.copy()

# Specify the file path for saving the DataFrame as a csv file
df_parameters_deep_learning_path = Path("/content/data/df_parameters_deep_learning.csv")

# DataFrame.to_csv does not create missing parent directories; make sure the target
# directory exists so the save does not depend on an earlier cell having created it
df_parameters_deep_learning_path.parent.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a csv file (index=True keeps the dateTime index as the first column)
df_parameters_deep_learning.to_csv(df_parameters_deep_learning_path, index=True)