## Data Loading

The package uses a simple data class called `DataSource`, intended to hold both the raw data (as a pandas DataFrame) and any associated metadata, or any annotations added during the data pipeline process.

At present `DataSource` has the following main attributes:

`DataSource._df` - the pandas data frame where the data is stored

`DataSource._time_column` - a pointer to the time column for time series data

`DataSource._categorical_columns` - a list of which columns are categorical

`DataSource._numeric_columns` - a list of which columns are considered numeric.

`DataSource._annotation_columns` - a list of columns added to the dataframe as annotations (e.g. `_is_anomaly`)

`DataSource._description` - where the profile is added to the data source

`DataSource._column_profiles` - where profiles are added for each column in the data frame

`DataSource._valid_ranges` - if some a priori information is known about some of the columns (e.g. the acceptable range for sensor values), then it can be provided here.




When loading a `DataSource` we provide the initial dataframe, as well as the `time_column` (if there is one), `categorical_columns` and `numeric_columns`.


# Loading EGM data

In [None]:
import pandas as pd
import os
from collections import namedtuple
import datetime
from dqp.core import DataSource

# Root folder that the loaders below fall back to when no explicit path is given.
_DATA_FOLDER_ROOT = "./datasets/"

def load_egm(path=None):
    """Load the per-sensor EGM CSV exports into a single ``DataSource``.

    Every file found in ``path`` is read with ``pandas.read_csv``; the part of
    the file name before the first ``-`` is used as the key for that file. The
    second column of each file becomes one column of the combined frame (named
    after that column's header), and the ``Time`` column of the
    ``Illuminance`` file supplies the shared ``ts`` column.

    Readings are stored as text: for most sensors everything after the first
    space is discarded and the remainder is parsed as ``float``; for
    ``humidity`` the ``%`` characters are stripped instead.

    Args:
        path: Directory holding the CSV files. When falsy, defaults to the
            ``egm/`` folder under ``_DATA_FOLDER_ROOT``.

    Returns:
        A ``DataSource`` wrapping the combined frame sorted by ``ts``, with
        ``ts`` as the time column, the known sensor ranges as
        ``valid_ranges`` and no categorical columns.

    Raises:
        KeyError: If no ``Illuminance`` file is present, or one of the
            expected sensor columns is missing from the combined frame.
    """
    if not path:
        path = os.path.join(_DATA_FOLDER_ROOT, "egm/")

    frames = {}
    for file_name in os.listdir(path):
        file_path = os.path.join(path, file_name)
        # Sub-directories cannot be parsed as CSV; skip them rather than
        # crash inside read_csv.
        if not os.path.isfile(file_path):
            continue
        frames[file_name.split("-")[0]] = pd.read_csv(file_path)

    time = frames["Illuminance"]["Time"].values
    cols = {
        frame.columns[1]: frame[frame.columns[1]] for frame in frames.values()
    }
    cols["ts"] = time
    df = pd.DataFrame(cols)

    # These readings look like "<number> <unit>"; keep only the number.
    unit_suffixed_columns = (
        "illuminance",
        "precipitation",
        "irradiance",
        "windspeedgust",
        "windspeedavg",
        "temperature",
    )
    for column in unit_suffixed_columns:
        df[column] = df[column].apply(lambda x: float(x.split(" ")[0]))
    # Humidity is handled separately: only the "%" characters are removed.
    df["humidity"] = df["humidity"].apply(lambda x: float(x.strip("%")))

    df["ts"] = pd.to_datetime(df["ts"])
    df = df.sort_values(by="ts")

    SensorRange = namedtuple("SensorRange", "min max")

    # Per-sensor metadata; only the "range" entries are consumed below.
    extra_data_info = {
        "temperature": {
            "range": SensorRange(-30, 60),
            "Accuracy": 1,
            "Resolution": 0.1,
            "type": float,
        },
        "humidity": {
            "range": SensorRange(10, 99),
            "Accuracy": 5,
            "Resolution": None,
            "type": float,
        },
        "precipitation": {
            "range": SensorRange(0, 9999),
            "Accuracy": 10,
            "Resolution": "complicated!",
            "type": float,
        },
        "windspeedavg": {
            "range": SensorRange(0, 50),
            "Accuracy": "complicated",
            "type": float,
        },
        "illuminance": {"range": SensorRange(0, 300000), "type": float},
        "ts": {"type": datetime.datetime},
    }

    # The time column has no sensor range, so it is left out.
    valid_ranges = {
        name: info["range"]
        for name, info in extra_data_info.items()
        if name != "ts"
    }
    numeric_columns = [
        "temperature",
        "humidity",
        "precipitation",
        "windspeedavg",
        "illuminance",
    ]
    categorical_columns = []

    return DataSource(
        df,
        time_column="ts",
        valid_ranges=valid_ranges,
        categorical_columns=categorical_columns,
        numeric_columns=numeric_columns,
    )

# Build the DataSource from the per-sensor CSV files in the EGM folder.
data = load_egm("./datasets/egm/")


## Load the new EGM datasets -- these are loaded one by one separately

In [None]:
def load_egm2(path=None, timecol=None, delimiter=","):
    """Load a single-file EGM export into a ``DataSource``.

    The CSV at ``path`` is read in one go. The column named ``timecol`` is
    moved to a new ``ts`` column and parsed as datetimes. Any other column in
    which more than 10% of the values (rendered as text) contain one of the
    unit markers ``mm``, ``cm``, ``%``, ``°C`` or `` m`` is converted to
    ``float`` by keeping only the text before the first space. All remaining
    columns are left exactly as ``read_csv`` produced them.

    Args:
        path: CSV file to read. When falsy, falls back to the
            ``low_cost_weather_station/`` folder under ``_DATA_FOLDER_ROOT``.
        timecol: Name of the timestamp column in the file. Defaults to
            ``"observedAt"``.
        delimiter: Field delimiter passed to ``pandas.read_csv``.

    Returns:
        A ``DataSource`` with ``ts`` as the time column, integer/float columns
        listed as numeric, ``object`` columns listed as categorical, and an
        empty ``valid_ranges`` mapping (no ranges are defined for this data).

    Raises:
        KeyError: If ``timecol`` is not a column of the file.
        ValueError: If a column flagged as unit-bearing holds a value whose
            leading token is not a valid float.
    """
    if not path:
        # NOTE(review): this default is a directory while read_csv below is
        # given the path directly — confirm which file was intended.
        path = os.path.join(_DATA_FOLDER_ROOT, "low_cost_weather_station/")

    if not timecol:
        timecol = "observedAt"

    metric_vals = ['mm', 'cm', '%', '°C', ' m']
    unit_pattern = '|'.join(metric_vals)
    df = pd.read_csv(path, delimiter=delimiter)

    # Pull the timestamps out before dropping the source column so that
    # timecol == "ts" also works; "ts" is re-attached after the unit pass.
    time = df[timecol].values
    df = df.drop(timecol, axis=1)

    for c in df.columns:
        # Inspect a text rendering only; the stored column is replaced solely
        # when it is converted. A str -> original-dtype round trip would turn
        # missing values in object columns into the literal "nan" and every
        # value of a bool column into True.
        as_text = df[c].astype(str)
        has_unit = as_text.str.contains(unit_pattern)
        if has_unit.mean() > 0.1:
            df[c] = as_text.apply(lambda x: float(x.split(" ")[0])).astype(float)

    df["ts"] = time
    df["ts"] = pd.to_datetime(df["ts"])

    SensorRange = namedtuple("SensorRange", "min max")

    extra_data_info = {
        #  "windDirection": {
        #     "range": SensorRange(0, 50),
        #     # "Accuracy": "complicated",
        #     "type": float,
        # },
        # "illuminance": {"range": SensorRange(0, 300000), "type": float},
        "ts": {"type": datetime.datetime},
    }

    valid_ranges = {
        k: extra_data_info[k]["range"] for k in extra_data_info if k != "ts"
    }
    # Classify by dtype family rather than `== float` / `== int`: the width of
    # the built-in int's NumPy equivalent is platform dependent, which can
    # leave int64 columns unrecognised.
    numeric_columns = [
        c
        for c in df.columns
        if pd.api.types.is_float_dtype(df[c])
        or pd.api.types.is_integer_dtype(df[c])
    ]

    categorical_columns = [c for c in df.columns if (df[c].dtype == "object")]

    return DataSource(
        df,
        time_column="ts",
        valid_ranges=valid_ranges,
        categorical_columns=categorical_columns,
        numeric_columns=numeric_columns,
    )


In [None]:
# Load one semicolon-delimited export file with the single-file loader.
# NOTE(review): the file name is spelled "Inforamtion" — presumably it matches the file on disk; confirm.
data = load_egm2("./datasets/egm2/WeatherInforamtion_Les_Orres.csv",delimiter=";")

In [None]:
# Show the DataFrame held by the loaded DataSource.
data._df

In [None]:
# NOTE(review): `df` is not assigned at notebook scope in the cells shown here (only inside the loaders) — presumably `data._df` was meant; confirm.
df