In [8]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [9]:
def cleanData(df, substance):
    """Tidy one raw sensor export and store it under ``../data/cleanedData``.

    The bookkeeping columns ``historianTagnummer`` and ``datumBeginMeting``
    are removed, ``hstWaarde`` becomes a float column named
    ``<substance>Value`` and ``datumEindeMeting`` becomes a datetime column
    named ``measurementDate``. The frame passed in is not modified; all work
    happens on derived frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export containing the four columns named above.
    substance : str
        Short key for the measured quantity. It prefixes the value column and
        is the stem of the parquet file that gets written.

    Returns
    -------
    None
        The cleaned frame is only written to disk.
    """
    valueColumn = substance + 'Value'

    tidy = (
        df.drop(columns=['historianTagnummer', 'datumBeginMeting'])
        .rename(columns={'hstWaarde': valueColumn,
                         'datumEindeMeting': 'measurementDate'})
        .astype({valueColumn: float})
    )
    tidy['measurementDate'] = pd.to_datetime(tidy['measurementDate'])

    # index=False: the row index carries no information worth persisting.
    tidy.to_parquet('../data/cleanedData/' + substance + '.parquet', index=False)

In [10]:
# Raw sensor exports, each paired with the substance key that cleanData uses
# for the value-column prefix and for the name of the cleaned parquet file.
data_files = [
    ('../data/Ammonium_measurements.parquet', 'ammonium'),
    ('../data/Nitrate_measurements.parquet', 'nitrate'),
    ('../data/Oxygen_A.parquet', 'oxygenA'),
    ('../data/Oxygen_B.parquet', 'oxygenB'),
    ('../data/Phosphate_measurements.parquet', 'phosphate')
]

# Load every export and hand it straight to cleanData, which writes
# ../data/cleanedData/<substance>.parquet.
for file_path, var_name in data_files:
    cleanData(pd.read_parquet(file_path), var_name)

In [11]:
# Pass 1 (plain text): copy the raw flow export to cleanedData/total.csv
# while removing every ';60000' occurrence from each line.
with open('../data/Total_influent_flow_WWTP_Ede_2021_minute_data.csv', 'r') as file:
    lines = file.readlines()

with open('../data/cleanedData/total.csv', 'w') as file:
    file.writelines(line.replace(';60000', '') for line in lines)

# Pass 2 (pandas): parse the day-first DateTime strings, rename the column to
# the notebook-wide 'measurementDate', and overwrite the same CSV.
finalTotal = (
    pd.read_csv('../data/cleanedData/total.csv', sep=';')
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime'], dayfirst=True))
    .rename(columns={'DateTime': 'measurementDate'})
)
finalTotal.to_csv('../data/cleanedData/total.csv', sep=';', index=False)

In [12]:
def shiftDates(df, name, shift_start='2021-03-28 03:00:00'):
    """Move a run of ``measurementDate`` values back one hour and save the result.

    The run is delimited by index labels:

    * it starts at the first row whose ``measurementDate`` equals
      ``shift_start``;
    * it ends at the last row whose ``measurementDate`` occurs more than once
      in the frame.

    Inside the run, rows whose label is below ``end - <number of duplicated
    rows>`` are all shifted; from that label onwards only rows with an odd
    label are shifted. The frame is then sorted by ``measurementDate`` and
    written to ``../data/shiftedDates/<name>.parquet`` without its index.

    NOTE(review): the default start and the reliance on repeated timestamps
    look like a daylight-saving correction for 2021 -- confirm. The odd-label
    rule presumes a particular interleaving of the repeated rows; verify it
    against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``measurementDate`` column and integer index labels.
        The hour shift is applied to this frame in place; the sorting is not.
    name : str
        File stem of the parquet file that is written.
    shift_start : str or datetime-like, optional
        Timestamp marking the first row to shift. Defaults to the value that
        used to be hard-coded.

    Returns
    -------
    None
        The shifted, sorted frame is only written to disk.

    Raises
    ------
    IndexError
        If no row matches ``shift_start`` or no timestamp is duplicated.
    """
    labels = df.index

    anchor_labels = labels[(df['measurementDate'] == shift_start).to_numpy()]
    if len(anchor_labels) == 0:
        raise IndexError(
            "shiftDates: no row has measurementDate == %r" % (shift_start,)
        )

    duplicate_labels = labels[
        df.duplicated(subset='measurementDate', keep=False).to_numpy()
    ]
    if len(duplicate_labels) == 0:
        raise IndexError(
            "shiftDates: no duplicated measurementDate marks the end of the run"
        )

    startTime = anchor_labels[0]
    endTime = duplicate_labels[-1]
    # Labels below this threshold are shifted unconditionally.
    parity_from = endTime - len(duplicate_labels)

    # One boolean mask replaces the per-row Python loop; the selection rule is
    # the same: inside [startTime, endTime] and (below threshold or odd label).
    label_values = labels.to_numpy()
    in_run = (label_values >= startTime) & (label_values <= endTime)
    needs_shift = in_run & ((label_values < parity_from) | (label_values % 2 != 0))

    df.loc[needs_shift, 'measurementDate'] = (
        df.loc[needs_shift, 'measurementDate'] - timedelta(hours=1)
    )

    df.sort_values(by='measurementDate').to_parquet(
        '../data/shiftedDates/' + name + '.parquet', index=False
    )

In [13]:
# Cleaned sensor frames written by cleanData, paired with the file stem that
# shiftDates uses under ../data/shiftedDates/.
#
# The influent-flow series is deliberately not listed here: no cell in this
# notebook writes '../data/cleanedData/total.parquet' (the flow-cleaning cell
# writes total.csv), so reading it fails on a fresh run. The flow data is
# passed to shiftDates(total, 'total') in its own cell, which produces the
# same '../data/shiftedDates/total.parquet'.
cleaned_data = [
    ('../data/cleanedData/ammonium.parquet', 'cleanedAmmonium'),
    ('../data/cleanedData/nitrate.parquet', 'cleanedNitrate'),
    ('../data/cleanedData/oxygenA.parquet', 'cleanedOxygenA'),
    ('../data/cleanedData/oxygenB.parquet', 'cleanedOxygenB'),
]

for path_file, name in cleaned_data:
    df = pd.read_parquet(path_file)
    shiftDates(df, name)

In [16]:
# One-minute grid that the phosphate series is aligned to.
# NOTE(review): the grid stops at 23:58 rather than 23:59 -- confirm this is intended.
start_date = '2021-01-01 00:00:00'
end_date = '2021-12-31 23:58:00'
index = pd.date_range(start=start_date, end=end_date, freq='1min')

# Align the cleaned phosphate readings to the grid: rows are looked up by
# 'measurementDate', grid timestamps absent from the data are created with
# fill_value=None, and readings whose timestamp is not on the grid are dropped.
phosphate = (
    pd.read_parquet('../data/cleanedData/phosphate.parquet')
    .set_index('measurementDate')
    .reindex(index, fill_value=None)
    # The grid index has no name, so reset_index() exposes it as 'index';
    # rename it back to the notebook-wide 'measurementDate'.
    .reset_index()
    .rename(columns={'index': 'measurementDate'})
)

# Overwrite the cleaned file with the gridded version.
phosphate.to_parquet('../data/cleanedData/phosphate.parquet')

In [15]:
# Load the raw influent-flow export and drop the resolution column.
total = pd.read_csv('../data/Total_influent_flow_WWTP_Ede_2021_minute_data.csv', sep=';')
total = total.drop(columns=['wwResolution'])

# dayfirst=True matches how the same DateTime column is parsed when total.csv
# is produced earlier in this notebook. Without it, day-first strings such as
# '03-01-2021' can be read month-first and end up on the wrong date.
# NOTE(review): assumes the export uses day-first dates, as that earlier cell
# does -- confirm against the raw file.
total['DateTime'] = pd.to_datetime(total['DateTime'], dayfirst=True)
total = total.rename(
    columns={'EDE_09902MTW_K100.MTW': 'waterFlowPerMinute', 'DateTime': 'measurementDate'}
)

# Flow values use a decimal comma and the literal '(null)' for missing
# readings; convert both before casting to float.
total['waterFlowPerMinute'] = (
    total['waterFlowPerMinute']
    .str.replace(',', '.', regex=False)
    .replace('(null)', np.nan)
    .astype('float')
)

# Writes ../data/shiftedDates/total.parquet (shiftDates also shifts `total` in place).
shiftDates(total, 'total')
