In [68]:
import pandas as pd

Load everything we need first.

Reminder: Download the latest 'data' folder from Teams and replace your local copy before you start working.

In [69]:
# Parquet sensor exports, keyed by the suffix of the dataframe variable that
# each file is bound to below. Dict order is the order the files are read in.
parquet_paths = {
    'influent': 'data/raw-data/Influent_2023.parquet',
    'oxygen_A': 'data/raw-data/oxygen_a_2023.parquet',
    'oxygen_B': 'data/raw-data/oxygen_b_2023.parquet',
    'ammonium': 'data/raw-data/ammonium_2023.parquet',
    # NOTE(review): the variable is called `df_nitrite` but the file is the
    # nitrate export - confirm which one is intended.
    'nitrite': 'data/raw-data/nitrate_2023.parquet',
    'phosphate': 'data/raw-data/phosphate_2023.parquet',
}
raw_frames = {name: pd.read_parquet(path) for name, path in parquet_paths.items()}

df_influent = raw_frames['influent']
df_oxygen_A = raw_frames['oxygen_A']
df_oxygen_B = raw_frames['oxygen_B']
df_ammonium = raw_frames['ammonium']
df_nitrite = raw_frames['nitrite']
df_phosphate = raw_frames['phosphate']

# Effluent comes as an Excel workbook, weather as a comma-separated file.
df_effluent = pd.read_excel('data/raw-data/effluent_2023.xlsx')
df_weather = pd.read_csv('data/weather-data/weather_2023.csv', sep=',')

Check that all of these dataframes cover the year 2023, and inspect their column names and recording frequency.

In [70]:
# influent data - recorded every minute
# df_influent.head()

# oxygen data (A & B) - recorded every minute; the two sensors still need to be merged (averaged)
# df_oxygen_A.head()
# df_oxygen_B.head()

# chemical data (ammonium, nitrate, phosphate) - recorded every minute
# df_ammonium.head()
# df_nitrite.head()   # NOTE: despite its name, this frame is loaded from nitrate_2023.parquet
# df_phosphate.head()

# effluent data - recorded every 15 minutes
# df_effluent.head()

# weather data - recorded every hour
df_weather.head()

Unnamed: 0,Timestamp,PrecipitationAmount,Temperature
0,2023-01-01 00:00:00,0,15.3
1,2023-01-01 01:00:00,0,14.6
2,2023-01-01 02:00:00,0,15.0
3,2023-01-01 03:00:00,0,14.7
4,2023-01-01 04:00:00,0,14.1


In [71]:
# Only the timestamp and the measured value are needed from each oxygen sensor;
# the value column is renamed so the two sensors stay distinguishable after the merge.
keep_cols = ['datumBeginMeting', 'hstWaarde']
a_clean = df_oxygen_A[keep_cols].rename(columns={'hstWaarde': 'oxygen_a'})
b_clean = df_oxygen_B[keep_cols].rename(columns={'hstWaarde': 'oxygen_b'})

# Inner-join on the timestamp, coerce both readings to numbers (unparseable
# values become NaN) and derive the mean of the two sensors. `assign` evaluates
# its keyword arguments in order, so `oxygen_avg` sees the numeric columns.
df_oxygen = (
    a_clean
    .merge(b_clean, on='datumBeginMeting', how='inner')
    .assign(
        oxygen_a=lambda d: pd.to_numeric(d['oxygen_a'], errors='coerce'),
        oxygen_b=lambda d: pd.to_numeric(d['oxygen_b'], errors='coerce'),
        oxygen_avg=lambda d: (d['oxygen_a'] + d['oxygen_b']) / 2,
    )
)

df_oxygen.head()

Unnamed: 0,datumBeginMeting,oxygen_a,oxygen_b,oxygen_avg
0,2023-01-01 00:00:00,1.663,0.777,1.22
1,2023-01-01 00:01:00,1.789,0.982,1.3855
2,2023-01-01 00:02:00,2.105,1.189,1.647
3,2023-01-01 00:03:00,2.271,1.266,1.7685
4,2023-01-01 00:04:00,2.168,1.403,1.7855


In [72]:
# Reduce every source to (timestamp, value) and give the value column a
# descriptive name so the merged frame has one column per measurement.
influent_clean = df_influent[['datumBeginMeting', 'hstWaarde']].rename(columns={'hstWaarde': 'Influent'})
oxygen_clean = df_oxygen[['datumBeginMeting', 'oxygen_avg']].rename(columns={'oxygen_avg': 'Oxygen'})
ammonium_clean = df_ammonium[['datumBeginMeting', 'hstWaarde']].rename(columns={'hstWaarde': 'Ammonium'})
# `df_nitrite` is the frame read from the nitrate parquet file in the loading cell.
nitrate_clean = df_nitrite[['datumBeginMeting', 'hstWaarde']].rename(columns={'hstWaarde': 'Nitrate'})
phosphate_clean = df_phosphate[['datumBeginMeting', 'hstWaarde']].rename(columns={'hstWaarde': 'Phosphate'})

df_water = (
    influent_clean
    # inner joins: keep only timestamps present in every source
    .merge(oxygen_clean, on='datumBeginMeting', how='inner')
    .merge(ammonium_clean, on='datumBeginMeting', how='inner')
    .merge(nitrate_clean, on='datumBeginMeting', how='inner')
    .merge(phosphate_clean, on='datumBeginMeting', how='inner')
    # resample() needs a datetime index
    .assign(datumBeginMeting=lambda d: pd.to_datetime(d['datumBeginMeting']))
    .set_index('datumBeginMeting')
    # convert all columns to numeric, coerce errors to NaN
    .apply(pd.to_numeric, errors='coerce')
    # hourly mean; the lowercase 'h' alias replaces the deprecated 'H',
    # which raises a FutureWarning on recent pandas versions
    .resample('h')
    .mean()
    .reset_index()
)

df_water.head()

  df_water = df_water.resample('H').mean().reset_index()


Unnamed: 0,datumBeginMeting,Influent,Oxygen,Ammonium,Nitrate,Phosphate
0,2023-01-01 00:00:00,3202.204867,0.91915,1.037983,4.424633,0.00345
1,2023-01-01 01:00:00,2790.287717,0.895517,1.93865,3.062367,0.380733
2,2023-01-01 02:00:00,2281.9509,1.11355,1.8856,3.344783,0.169217
3,2023-01-01 03:00:00,2377.07655,0.584708,0.9331,2.867483,0.161217
4,2023-01-01 04:00:00,1517.581467,0.552192,0.772933,2.491967,0.112783


After reviewing the weather dataset, I found that the Rain column only contains binary values: 0 indicates no rain, and 1 indicates rain. So, I decided to use the PrecipitationAmount column this time, which records the actual hourly precipitation amount in millimeters (mm). Additionally, the temperature values in the dataset have already been adjusted by dividing them by 10, so the unit is now degrees Celsius (°C).

In [73]:
# Rename the datetime columns to a shared 'date' key (and the precipitation
# column to 'Rainfall') BEFORE touching their contents. rename() silently
# ignores keys that no longer exist, so re-running this cell is safe; looking
# up the old column names first would raise a KeyError on a second run
# because df_weather / df_water are overwritten here.
df_weather = df_weather.rename(columns={'Timestamp': 'date', 'PrecipitationAmount': 'Rainfall'})
df_water = df_water.rename(columns={'datumBeginMeting': 'date'})

# Make sure both join keys are real datetimes so the merge compares like with like
df_weather['date'] = pd.to_datetime(df_weather['date'])
df_water['date'] = pd.to_datetime(df_water['date'])

# Merge the two dataframes on 'date'; the inner join keeps only hours present in both
df_main = pd.merge(df_water, df_weather, on='date', how='inner')

# Persist the merged dataset (without the positional index)
df_main.to_csv("data/main_2023.csv", index=False)

# Display the first few rows of the merged dataframe
df_main.head()


Unnamed: 0,date,Influent,Oxygen,Ammonium,Nitrate,Phosphate,Rainfall,Temperature
0,2023-01-01 00:00:00,3202.204867,0.91915,1.037983,4.424633,0.00345,0,15.3
1,2023-01-01 01:00:00,2790.287717,0.895517,1.93865,3.062367,0.380733,0,14.6
2,2023-01-01 02:00:00,2281.9509,1.11355,1.8856,3.344783,0.169217,0,15.0
3,2023-01-01 03:00:00,2377.07655,0.584708,0.9331,2.867483,0.161217,0,14.7
4,2023-01-01 04:00:00,1517.581467,0.552192,0.772933,2.491967,0.112783,0,14.1


Apply min-max scaling to the main dataset.

In [75]:
from sklearn.preprocessing import MinMaxScaler

# Everything except the timestamp is a numeric feature to be scaled
df_features = df_main.drop(columns=['date'])

# Fit the min-max scaler on the feature columns and transform them in one step;
# the result is a plain array, so the column labels are re-attached afterwards
scaler = MinMaxScaler()
df_scaled_values = scaler.fit_transform(df_features)
df_scaled = pd.DataFrame(df_scaled_values, columns=df_features.columns)

# Put the unscaled 'date' column back as the first column
df_scaled.insert(0, 'date', df_main['date'])

# Display the first few rows of the scaled DataFrame
df_scaled.head()


Unnamed: 0,date,Influent,Oxygen,Ammonium,Nitrate,Phosphate,Rainfall,Temperature
0,2023-01-01 00:00:00,0.441427,0.306547,0.051943,0.42028,9.5e-05,0.0,0.556391
1,2023-01-01 01:00:00,0.381769,0.298638,0.097014,0.278328,0.063068,0.0,0.538847
2,2023-01-01 02:00:00,0.308147,0.371606,0.094359,0.307757,0.027763,0.0,0.548872
3,2023-01-01 03:00:00,0.321924,0.194623,0.046694,0.258021,0.026428,0.0,0.541353
4,2023-01-01 04:00:00,0.197444,0.18374,0.038679,0.218891,0.018344,0.0,0.526316
