# EDA

## Load

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from src.config import DATA_RAW_DIR, DATA_RAW_FILENAME

In [None]:
# Read the semicolon-separated raw file from the location configured in src.config.
raw_csv_path = os.path.join(DATA_RAW_DIR, DATA_RAW_FILENAME)
df_raw = pd.read_csv(raw_csv_path, sep=";")

## First glance at raw data

In [None]:
# Dimensions of the raw data as (n_rows, n_columns).
n_rows, n_columns = df_raw.shape
display((n_rows, n_columns))

In [None]:
# `DataFrame.info()` prints its summary and returns None, so it is called directly;
# wrapping it in `display()` would only add a stray "None" below the summary.
df_raw.info()

In [None]:
# Number of missing values in each column.
missing_counts = df_raw.isna().sum()
missing_counts

In [None]:
# Rows where `sol_prod` is missing although `sol_prod_1` holds a positive value.
# The comparison is parenthesised because `&` binds tighter than `>`: without the
# parentheses the condition is parsed as `(a & b & df_raw["sol_prod_1"]) > 0`.
sol_prod_missing = df_raw["sol_prod"].isna()
sol_prod_1_positive = df_raw["sol_prod_1"].notna() & (df_raw["sol_prod_1"] > 0)

df_raw.loc[
    sol_prod_missing & sol_prod_1_positive,
    ["timestamp", "sol_prod", "sol_prod_1", "sol_prod_2", "sol_prod_3"],
].head(10)

In some rows `sol_prod` is missing even though `sol_prod_1` has a value. The missing `sol_prod` values should therefore be recoverable from `sol_prod_1` and `sol_prod_2`.

In [None]:
# Rows where `wb_tot_charge` is missing although `wb_0_tot_charge` holds a positive value.
# The comparison is parenthesised because `&` binds tighter than `>`: without the
# parentheses the condition is parsed as `(a & b & df_raw["wb_0_tot_charge"]) > 0`.
wb_tot_charge_missing = df_raw["wb_tot_charge"].isna()
wb_0_tot_charge_positive = df_raw["wb_0_tot_charge"].notna() & (df_raw["wb_0_tot_charge"] > 0)

df_raw.loc[
    wb_tot_charge_missing & wb_0_tot_charge_positive,
    ["timestamp", "wb_tot_charge", "wb_0_tot_charge", "wb_0_grid_cons", "wb_0_sol_charge"],
].head(10)

In [None]:
# Rows where both `wb_tot_charge` and a positive `wb_0_tot_charge` are present,
# so the two columns can be compared side by side.
# The comparison is parenthesised because `&` binds tighter than `>`: without the
# parentheses the condition is parsed as `(a & b & df_raw["wb_0_tot_charge"]) > 0`.
wb_tot_charge_present = df_raw["wb_tot_charge"].notna()
wb_0_tot_charge_positive = df_raw["wb_0_tot_charge"].notna() & (df_raw["wb_0_tot_charge"] > 0)

df_raw.loc[
    wb_tot_charge_present & wb_0_tot_charge_positive,
    ["timestamp", "wb_tot_charge", "wb_0_tot_charge", "wb_0_grid_cons", "wb_0_sol_charge"],
].head(10)

Likewise, in some rows `wb_tot_charge` is missing even though `wb_0_tot_charge` has a value. The missing `wb_tot_charge` values should therefore be recoverable from `wb_0_tot_charge`.

## Info, Describe, Overview

In [None]:
# Parse the timestamp strings (format "YYYY-MM-DD HH:MM") into datetime values.
timestamp_parsed = pd.to_datetime(df_raw["timestamp"], format="%Y-%m-%d %H:%M")
df_raw["timestamp"] = timestamp_parsed

In [None]:
# Re-inspect the column dtypes now that `timestamp` has been parsed to datetime.
df_raw.info()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row.
numeric_summary = df_raw.select_dtypes(include="number").describe().T
display(numeric_summary)

In [None]:
from src.eda import overview

# Project helper from `src.eda`; what it renders is not visible from this notebook.
# NOTE(review): presumably a per-column summary of the frame — confirm in src/eda.
overview(df_raw)

### Observations
* `sol_prod_3` has only zeros.
* `wb_0_grid_cons` has negative values.

## Correlations

In [None]:
# Pairwise correlations between the numeric columns only.
corr = df_raw.select_dtypes(include=["number"]).corr()

# Blank out the labels of cells with |r| < 0.3 so weak correlations do not clutter the plot.
weak_corr = corr.abs() < 0.3
annot = corr.round(2).astype(str).mask(weak_corr, "")

plt.figure(figsize=(9, 7))
sns.heatmap(corr, annot=annot, fmt="", cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlations between numerical columns", y=1.02)
plt.show()

## Outlier Detection

In [None]:
from src.eda import mark_outliers_mad

# Flag outliers in the raw data with the Median Absolute Deviation (MAD) method:
# a value counts as an outlier when its deviation from the median exceeds
# 3 times the median absolute deviation.
# Note the log scale.
mark_outliers_mad(df_raw)

### Observations
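* Most columns contain values that the MAD method (threshold of 3 median absolute deviations) flags as outliers. This is not surprising: most of the values are very small, so the median absolute deviation is small as well and larger readings quickly exceed the threshold.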