# Bühlot data preprocessing

The purpose of this code is to read in all the collected data, split it by variable, and then save it in the correct folder.
By running this code, ALL the collected data will be processed, not just the new data. Therefore, all previously sorted data will be overwritten. The sorted data will be saved in a folder named "data_export".

This is a list of all the variables:
- air temperature [°C]
- bulk electrical conductivity [dS/m]
- ground water level [mm]
- logger temperature [°C]
- precipitation [mm]
- river water level 1 []
- river water level 2 []
- river water level 4 []
- volumetric water content [m^3/m^3]
- water temperature [°C]

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import os
from glob import glob
from tqdm import tqdm

In [2]:
# Create one export folder per variable if it does not exist yet.
for _subfolder in (
    "air_temperature",
    "bulk_electrical_conductivity",
    "ground_water_level",
    "logger_temperature",
    "precipitation",
    "river_water_level_1",
    "river_water_level_2",
    "river_water_level_4",
    "volumetric_water_content",
    "water_temperature",
):
    os.makedirs(f"data/data_export/{_subfolder}", exist_ok=True)

# merge() writes files that still contain duplicate timestamps into this
# folder, so it has to exist as well.
os.makedirs("data/data_duplicates", exist_ok=True)

In [3]:
def preprocessing(filename, variable):
    """
    Read one raw data file and return a tidy table for a single variable.

    A raw file holds several variables. This function keeps the timestamp
    column and the column of the requested variable, converts the timestamps
    to datetime and drops every row in which the variable is missing
    (cells containing 'Logged' are read as missing values).

    Parameters
    ----------
    filename : str
        Path of the raw data file.
    variable : str
        Which variable to extract. Must be one of the keys of ``_VARIABLES``:
        'precipitation', 'air temperature', 'Table1_VWC', 'Table1_EC',
        'Table2_VWC', 'Table2_EC', 'ground water level', 'water temperature',
        'logger temperature', the three '... csv' variants of the latter,
        and 'river water level 1' / '2' / '4'.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'tstamp' (datetime) and the column of the variable
        (e.g. 'air_temperature'; both ground water level variants use
        'water_height').

    Raises
    ------
    ValueError
        If `variable` is unknown, or if the timestamps match none of the
        formats expected for that file layout.
    """
    try:
        layout, value_pos, column = _VARIABLES[variable]
    except KeyError:
        # list(_VARIABLES) renders exactly the accepted names, in order
        raise ValueError(f"Variable is '{variable}', must be in {list(_VARIABLES)}") from None

    # river files have no header and split date and time over two columns
    if layout == 'river':
        return _read_river_level(filename, column)

    reader_name, reader_kwargs, tstamp_pos, formats = _LAYOUTS[layout]

    # read in raw data
    raw = getattr(pd, reader_name)(filename, na_values='Logged', **reader_kwargs)

    # slice down to the relevant columns and rename them
    df = raw.iloc[:, [tstamp_pos, value_pos]].copy()
    df.columns = ['tstamp', column]

    # convert to datetime
    df['tstamp'] = _parse_tstamp(df['tstamp'], formats)

    # drop rows where the variable is NaN
    df.dropna(subset=[column], inplace=True)

    return df


# File layouts: pandas reader, extra reader arguments, position of the
# timestamp column, and the timestamp formats to try (in this order).
_LAYOUTS = {
    'station_csv': ('read_csv', {'skiprows': 1}, 1,
                    ('%m/%d/%y %I:%M:%S %p', '%d/%m/%y %H:%M:%S')),
    'table_dat': ('read_csv', {'skiprows': [1, 2, 3, 4]}, 0,
                  ('%Y-%m-%d %H:%M:%S',)),
    'level_xlsx': ('read_excel', {'skiprows': list(range(1, 12))}, 1,
                   ('%Y-%m-%d %H:%M:%S',)),
    'level_csv': ('read_csv', {}, 1,
                  ('%d/%m/%Y %H:%M:%S',)),
}

# variable -> (file layout, position of the value column, output column name)
_VARIABLES = {
    'precipitation': ('station_csv', 3, 'precipitation'),
    'air temperature': ('station_csv', 2, 'air_temperature'),
    'Table1_VWC': ('table_dat', 2, 'volumetric_water_content'),
    'Table1_EC': ('table_dat', 3, 'bulk_electrical_conductivity'),
    'Table2_VWC': ('table_dat', 3, 'volumetric_water_content'),
    'Table2_EC': ('table_dat', 4, 'bulk_electrical_conductivity'),
    'ground water level': ('level_xlsx', 4, 'water_height'),
    'water temperature': ('level_xlsx', 2, 'water_temperature'),
    'logger temperature': ('level_xlsx', 3, 'logger_temperature'),
    'ground water level csv': ('level_csv', 4, 'water_height'),
    'water temperature csv': ('level_csv', 2, 'water_temperature'),
    'logger temperature csv': ('level_csv', 3, 'logger_temperature'),
    'river water level 1': ('river', 2, 'river_water_level_1'),
    'river water level 2': ('river', 2, 'river_water_level_2'),
    'river water level 4': ('river', 2, 'river_water_level_4'),
}


def _parse_tstamp(values, formats):
    """Parse `values` with the first entry of `formats` that fits all of them."""
    *fallible, last = formats
    for fmt in fallible:
        try:
            return pd.to_datetime(values, format=fmt)
        except ValueError:
            # this format does not fit the file, try the next one
            continue
    # the last candidate is allowed to raise
    return pd.to_datetime(values, format=last)


def _read_river_level(filename, column):
    """Read a ';'-separated river level file with the columns date, time, value."""
    raw = pd.read_csv(filename, na_values='Logged', sep=';', header=None)

    # exactly three columns are expected; anything else raises a ValueError
    raw.columns = ['date_str', 'time', column]

    # merge date with time and build the result as a new frame, so that no
    # values are assigned into a slice of the raw table
    df = pd.DataFrame({
        'tstamp': pd.to_datetime(raw['date_str'] + ' ' + raw['time'], format='%d.%m.%Y %H:%M'),
        column: raw[column],
    })

    # drop rows where the river water level is NaN
    df.dropna(subset=[column], inplace=True)

    return df

In [4]:
def merge(variable):
    """
    Merge all preprocessed tables of one variable and save them as one csv file.

    The tables are taken from the global list whose name is passed as
    `variable` (e.g. ``all_data_AT``). They are concatenated, sorted by
    'tstamp' and rows with a timestamp that already occurred are dropped
    (the first occurrence is kept). The output file name is derived from the
    global ``filename`` (the raw file name of the station that is currently
    processed) and the file is written to the folder of the variable below
    'data/data_export'. Should the table still contain duplicate timestamps,
    a message is printed and the file is written to 'data/data_duplicates'
    instead.

    "Table1" and "Table2" are names from the volumetric water content data
    files. Each station has two sensors: the sensor of "Table1" is placed
    20 cm below the top edge of the ground, the sensor of "Table2" 50 cm
    below the top edge of the ground.

    The abbreviations are:
    AT = air temperature
    P = precipitation
    VWC_1 = volumetric water content of "Table1"
    EC_1 = bulk electrical conductivity of "Table1"
    VWC_2 = volumetric water content of "Table2"
    EC_2 = bulk electrical conductivity of "Table2"
    GWL = ground water level
    WT = water temperature
    LT = logger temperature
    GWL_csv = ground water level (csv source files)
    WT_csv = water temperature (csv source files)
    LT_csv = logger temperature (csv source files)
    RWL_1 = river water level from the first sensor
    RWL_2 = river water level from the second sensor
    RWL_4 = river water level from the third sensor - sensor is named with number 4

    Parameters
    ----------
    variable : str
        Name of the global list to merge, one of the keys of ``_MERGE_TARGETS``.

    Raises
    ------
    ValueError
        If `variable` is not a known list name (pandas also raises it when
        the list is empty).
    NameError
        If the global list or the global ``filename`` is not defined.
    """
    try:
        folder, renames = _MERGE_TARGETS[variable]
    except KeyError:
        raise ValueError(f"Variable is '{variable}', must be in {list(_MERGE_TARGETS)}") from None

    # the per-station lists are filled by the calling loop as globals
    try:
        frames = globals()[variable]
    except KeyError:
        raise NameError(f"name '{variable}' is not defined") from None

    # merge all_data
    merged = pd.concat(frames, ignore_index=True)

    # sort by datetime; mergesort is stable, so for equal timestamps the row
    # of the file that was read first stays first
    merged = merged.sort_values('tstamp', kind='mergesort')

    # drop duplicates
    merged = merged.drop_duplicates(subset=['tstamp'], keep='first')

    # derive the output file name from the raw file name
    out_name = filename
    for old, new in renames:
        out_name = out_name.replace(old, new)

    # check if there are any duplicates left - if yes, save that file in
    # the folder 'data_duplicates'
    if merged['tstamp'].duplicated().any():
        print(f'Bei datei {out_name} sind duplikate vorhanden.')
        target_dir = 'data/data_duplicates'
    else:
        target_dir = f'data/data_export/{folder}'

    os.makedirs(target_dir, exist_ok=True)
    merged.to_csv(f'{target_dir}/{out_name}', index=False)


# Replacements that turn the name of a tensiometer xlsx file into an output name.
_TENSIOMETER_RENAMES = (("_Tensiometer", ""), ("sued", "süd"))

# name of the global list -> (export folder, replacements applied to `filename`)
_MERGE_TARGETS = {
    'all_data_AT': ('air_temperature', ((".csv", "_air_temperature.csv"),)),
    'all_data_P': ('precipitation', ((".csv", "_precipitation.csv"),)),
    'all_data_VWC_1': ('volumetric_water_content',
                       (("_Table1.dat", "_volumetric_water_content_20cm.csv"),)),
    'all_data_EC_1': ('bulk_electrical_conductivity',
                      (("_Table1.dat", "_bulk_electrical_conductivity_20cm.csv"),)),
    'all_data_VWC_2': ('volumetric_water_content',
                       (("_Table2.dat", "_volumetric_water_content_50cm.csv"),)),
    'all_data_EC_2': ('bulk_electrical_conductivity',
                      (("_Table2.dat", "_bulk_electrical_conductivity_50cm.csv"),)),
    'all_data_GWL': ('ground_water_level',
                     _TENSIOMETER_RENAMES + ((".xlsx", "_ground_water_level.csv"),)),
    'all_data_WT': ('water_temperature',
                    _TENSIOMETER_RENAMES + ((".xlsx", "_water_temperature.csv"),)),
    'all_data_LT': ('logger_temperature',
                    _TENSIOMETER_RENAMES + ((".xlsx", "_logger_temperature.csv"),)),
    'all_data_GWL_csv': ('ground_water_level', ((".csv", "_GWL.csv"),)),
    'all_data_WT_csv': ('water_temperature', ((".csv", "_WT.csv"),)),
    'all_data_LT_csv': ('logger_temperature', ((".csv", "_LT.csv"),)),
    'all_data_RWL_1': ('river_water_level_1',
                       (("Pegel1_", ""), (".csv", "_river_water_level_1.csv"))),
    'all_data_RWL_2': ('river_water_level_2',
                       (("Pegel2_", ""), (".csv", "_river_water_level_2.csv"))),
    'all_data_RWL_4': ('river_water_level_4',
                       (("Pegel4_", ""), (".csv", "_river_water_level_4.csv"))),
}
    

In [5]:
# Raw file names, one per station. Every list is consumed by its own
# preprocessing loop.

# precipitation and air temperature
FILENAMES = [
    'Butschenberg.csv',
    'Grundigklinik.csv',
    'Hundseck.csv',
    'Schafhof.csv',
    'Schönbrunn.csv',
    'Sportplatz.csv',
    'Sternenberg-Schlammfang.csv',
    'Schwabenquelle.csv',
    'Winterberg.csv',
]

# soil moisture, "Table1" files
FILENAMES_DAT_1 = [
    'Schafhof1_Table1.dat',
    'Schafhof5_Table1.dat',
]

# soil moisture, "Table2" files
FILENAMES_DAT_2 = [
    'Schafhof1_Table2.dat',
    'Schafhof5_Table2.dat',
]

# ground water level, xlsx files
FILENAMES_GWL = [
    'Schafhof_Tensiometer.xlsx',
    'Sprengquellen_Tensiometer_unten_nord.xlsx',
    'Sprengquellen_Tensiometer_unten_sued.xlsx',
    'Sprengquellen_Tensiometer_oben_nord.xlsx',
    'Sprengquellen_Tensiometer_oben_sued.xlsx',
]

# ground water level, csv files
FILENAMES_GWL_csv = [
    'Schafhof_Tensiometer_alt.csv',
    'Sprengquellen_Tensiometer_unten_nord_alt.csv',
    'Sprengquellen_Tensiometer_unten_sued_alt.csv',
    'Sprengquellen_Tensiometer_oben_nord_alt.csv',
    'Sprengquellen_Tensiometer_oben_sued_alt.csv',
]

# river water level (csv files), sensor 1
FILENAMES_RWL_1 = [
    'Pegel1_Bühlot.csv',
    'Pegel1_Schwabenbrünnele.csv',
    'Pegel1_Büchelbach.csv',
]

# river water level (csv files), sensor 2
FILENAMES_RWL_2 = [
    'Pegel2_Bühlot.csv',
    'Pegel2_Schwabenbrünnele.csv',
    'Pegel2_Büchelbach.csv',
]

# river water level (csv files), sensor 4
FILENAMES_RWL_4 = [
    'Pegel4_Bühlot.csv',
    'Pegel4_Schwabenbrünnele.csv',
    'Pegel4_Büchelbach.csv',
]

In [6]:
# Air temperature and precipitation come from the same raw station files.
for filename in FILENAMES:
    # Fresh accumulators per station. merge() reads these lists and
    # `filename` as globals, so their names must stay as they are.
    all_data_AT, all_data_P = [], []

    # all raw files of this station, three folder levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):

        # every raw file is parsed twice: first for the air temperature,
        # then for the rainfall
        df_AT, df_P = (
            preprocessing(datafile, 'air temperature'),
            preprocessing(datafile, 'precipitation'),
        )

        # collect the tables of this station
        all_data_AT.append(df_AT)
        all_data_P.append(df_P)

    # write one merged file per variable for this station
    for merge_target in ('all_data_AT', 'all_data_P'):
        merge(merge_target)

100%|██████████████████████████████████████████████████████████████████████████████████| 66/66 [00:19<00:00,  3.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [00:17<00:00,  3.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:10<00:00,  3.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [00:18<00:00,  3.56it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [00:12<00:00,  3.79it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [00:20<00:00,  3.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:12<00:00,  3.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:19<00:00,  3.29it/s]
100%|███████████████████████████████████

In [7]:
# preprocessing volumetric water content and electrical conductivity
# For every file name in FILENAMES_DAT_1: find all matching raw files, extract the
# 'Table1_VWC' and 'Table1_EC' variables and hand the collected results to merge().
for filename in FILENAMES_DAT_1:
    # start with empty result lists for each file name
    all_data_VWC_1 = []
    all_data_EC_1 = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file, once for the volumetric water content and once for the electrical conductivity
        df_VWC_1 = preprocessing(datafile, 'Table1_VWC')
        df_EC_1 = preprocessing(datafile, 'Table1_EC')
        
        # append to all_data
        all_data_VWC_1.append(df_VWC_1)
        all_data_EC_1.append(df_EC_1)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename all_data_VWC_1 / all_data_EC_1 without checking merge().
    merge('all_data_VWC_1')
    merge('all_data_EC_1')
    
# For every file name in FILENAMES_DAT_2: find all matching raw files, extract the
# 'Table2_VWC' and 'Table2_EC' variables and hand the collected results to merge().
for filename in FILENAMES_DAT_2:
    # start with empty result lists for each file name
    all_data_VWC_2 = []
    all_data_EC_2 = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file, once for the volumetric water content and once for the electrical conductivity
        df_VWC_2 = preprocessing(datafile, 'Table2_VWC')
        df_EC_2 = preprocessing(datafile, 'Table2_EC')
        
        # append to all_data
        all_data_VWC_2.append(df_VWC_2)
        all_data_EC_2.append(df_EC_2)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename all_data_VWC_2 / all_data_EC_2 without checking merge().
    merge('all_data_VWC_2')
    merge('all_data_EC_2')

100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:07<00:00,  6.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:09<00:00,  7.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:07<00:00,  5.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [00:10<00:00,  7.32it/s]


In [8]:
# preprocessing ground water level, water temperature and logger temperature
# For every file name in FILENAMES_GWL: find all matching raw files, extract the
# three variables from each file and hand the collected results to merge().
for filename in FILENAMES_GWL:
    # start with empty result lists for each file name
    all_data_GWL = []
    all_data_WT = []
    all_data_LT = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file, once per variable
        df_GWL = preprocessing(datafile, 'ground water level')
        df_WT = preprocessing(datafile, 'water temperature')
        df_LT = preprocessing(datafile, 'logger temperature')
        
        # append to all_data
        all_data_GWL.append(df_GWL)
        all_data_WT.append(df_WT)
        all_data_LT.append(df_LT)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename all_data_GWL / all_data_WT / all_data_LT without checking merge().
    merge('all_data_GWL')
    merge('all_data_WT')
    merge('all_data_LT')
    
# preprocessing ground water level, water temperature and logger temperature
# for the file names in FILENAMES_GWL_csv (uses the '... csv' variants of the variables)
# For every file name: find all matching raw files, extract the three variables
# from each file and hand the collected results to merge().
for filename in FILENAMES_GWL_csv:
    # start with empty result lists for each file name
    all_data_GWL_csv = []
    all_data_WT_csv = []
    all_data_LT_csv = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file, once per variable
        df_GWL_csv = preprocessing(datafile, 'ground water level csv')
        df_WT_csv = preprocessing(datafile, 'water temperature csv')
        df_LT_csv = preprocessing(datafile, 'logger temperature csv')
        
        # append to all_data
        all_data_GWL_csv.append(df_GWL_csv)
        all_data_WT_csv.append(df_WT_csv)
        all_data_LT_csv.append(df_LT_csv)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename the all_data_*_csv lists without checking merge().
    merge('all_data_GWL_csv')
    merge('all_data_WT_csv')
    merge('all_data_LT_csv')

100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [04:31<00:00,  6.97s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:09<00:00,  6.46s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:41<00:00,  4.10s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [03:18<00:00,  6.62s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 33/33 [03:49<00:00,  6.96s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.76it/s]
100%|███████████████████████████████████

In [9]:
# preprocessing river water level as a csv file
# For every file name in FILENAMES_RWL_1: find all matching raw files, extract the
# 'river water level 1' variable and hand the collected results to merge().
for filename in FILENAMES_RWL_1:
    # start with an empty result list for each file name
    all_data_RWL_1 = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file
        df_RWL_1 = preprocessing(datafile, 'river water level 1')

        # append to all_data
        all_data_RWL_1.append(df_RWL_1)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename all_data_RWL_1 without checking merge().
    merge('all_data_RWL_1')

# For every file name in FILENAMES_RWL_2: find all matching raw files, extract the
# 'river water level 2' variable and hand the collected results to merge().
for filename in FILENAMES_RWL_2:
    # start with an empty result list for each file name
    all_data_RWL_2 = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file
        df_RWL_2 = preprocessing(datafile, 'river water level 2')

        # append to all_data
        all_data_RWL_2.append(df_RWL_2)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename all_data_RWL_2 without checking merge().
    merge('all_data_RWL_2')
    
# For every file name in FILENAMES_RWL_4: find all matching raw files, extract the
# 'river water level 4' variable and hand the collected results to merge().
for filename in FILENAMES_RWL_4:
    # start with an empty result list for each file name
    all_data_RWL_4 = []
    
    # match files located exactly three directory levels below the working directory
    filenames = glob(f"*/*/*/{filename}", recursive=False)
    for datafile in tqdm(filenames):
        
        # preprocess each raw data file
        df_RWL_4 = preprocessing(datafile, 'river water level 4')

        # append to all_data
        all_data_RWL_4.append(df_RWL_4)

    # NOTE(review): merge() is not shown here; it receives the NAME of the list as a
    # string, so it presumably looks up the module-level variable of that name.
    # Do not rename all_data_RWL_4 without checking merge().
    merge('all_data_RWL_4')

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:09<00:00,  2.35it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:09<00:00,  3.63it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:03<00:00,  7.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  6.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:03<00:00,  6.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  6.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.44it/s]
100%|███████████████████████████████████

In [54]:
# Known problem with the following files!
# Problem: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
# Sprengquellen_oben_nord_ground_water_level
# Sprengquellen_oben_nord_logger_temperature
# Sprengquellen_oben_nord_water_temperature
# Schafhof1_volumetric_water_content_20cm
# Schafhof1_volumetric_water_content_50cm