In [1]:
## Enkel test av MonthlyStatistics

import sys
sys.path.append("../src")  
from monthly_statistics import MonthlyStatistics

# MonthlyStatistics is handed the (relative) processed-data directory.
stats = MonthlyStatistics("../data/processed")

# Compute statistics for a single month in Oslo
resultat = stats.compute_single_month(
    "2024-03", "mean(air_temperature P1D)", "oslo", time_offset="PT0H"
)
print(resultat)

# Fetch the statistics for all months for Tromsø
df = stats.compute_all_months(
    "mean(air_temperature P1D)", "tromso", time_offset="PT0H"
)
# Last expression of the cell: show the first rows of the result
df.head()


{'mean': 2.6967741935483875, 'median': 2.7, 'std': 2.2953647360696494}


Unnamed: 0,year_month,mean,median,std
0,2000-01,-2.4,-1.5,3.480517
1,2000-02,-3.296552,-3.3,2.77057
2,2000-03,-3.112903,-2.7,2.648615
3,2000-04,-0.81,-0.1,3.010482
4,2000-05,4.964516,5.7,3.66056


In [2]:
## Ny funksjon for å finne outliers per måned
## Kan gjenbrukes for visualisering senere

import sys
sys.path.append("../src")  
import pandas as pd
from outlier_detector import OutlierDetector
from monthly_statistics import MonthlyStatistics

def finn_outliers_per_maaned(
        by: str,
        element_id: str,
        time_offset: str,
        data_dir: str = "../data/processed"
    ) -> pd.DataFrame:
    """
    Count flagged outliers per calendar month for one city, elementId and
    timeOffset.

    Rows are loaded through ``MonthlyStatistics._load_city(by)``, filtered on
    the ``elementId`` and ``timeOffset`` columns and grouped by calendar month
    of ``referenceTime``. Each month is passed to
    ``OutlierDetector.detect_iqr(..., extreme=True)`` and the flagged rows are
    counted.

    Only months with at least one flagged value are included in the result;
    months without outliers are omitted.

    Parameters
    ----------
    by : city name handed to ``_load_city``, e.g. "oslo".
    element_id : value to match in the ``elementId`` column,
        e.g. "mean(air_temperature P1D)".
    time_offset : value to match in the ``timeOffset`` column, e.g. "PT0H".
    data_dir : directory handed to ``MonthlyStatistics``.

    Returns
    -------
    DataFrame sorted by ``year_month`` with the columns
        year_month        "YYYY-MM" string
        outliers_removed  number of flagged values in the month
        antall_totalt     number of rows in the month (unparsable values included)
        andel_outliers_%  100 * outliers_removed / antall_totalt, one decimal
    An empty frame with these columns is returned when nothing matches the
    filter or no month contains an outlier.
    """
    kolonner = ["year_month", "outliers_removed", "antall_totalt", "andel_outliers_%"]

    stats = MonthlyStatistics(data_dir)
    detect = OutlierDetector()

    df = stats._load_city(by)
    df = df[
        (df["elementId"] == element_id) &
        (df["timeOffset"] == time_offset)
    ].copy()

    # Nothing to analyse: return the documented (empty) shape right away.
    if df.empty:
        return pd.DataFrame(columns=kolonner)

    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df["referenceTime"] = pd.to_datetime(df["referenceTime"], utc=True)
    # Drop the timezone before converting to a monthly period label.
    df["year_month"] = df["referenceTime"].dt.tz_localize(None).dt.to_period("M").astype(str)

    resultater = []

    for ym, gruppe in df.groupby("year_month"):
        verdier = gruppe["value"]
        antall_outliers = int(detect.detect_iqr(verdier, extreme=True).sum())

        if antall_outliers == 0:
            continue

        resultater.append({
            "year_month": ym,
            "outliers_removed": antall_outliers,
            "antall_totalt": len(verdier),
            "andel_outliers_%": round(100 * antall_outliers / len(verdier), 1),
        })

    # Passing the column list keeps "year_month" present even when no month
    # had outliers, so sort_values cannot fail on an empty result.
    return (
        pd.DataFrame(resultater, columns=kolonner)
        .sort_values("year_month")
        .reset_index(drop=True)
    )

# Adjustable parameters
by = "oslo"
element_id = "mean(air_temperature P1D)"
time_offset = "PT0H"

# Build the per-month outlier overview and show its first rows
outlier_oversikt = finn_outliers_per_maaned(by, element_id, time_offset)
outlier_oversikt.head()


Unnamed: 0,year_month,outliers_removed,antall_totalt,andel_outliers_%
0,2002-04,1,30,3.3
1,2008-08,1,31,3.2
2,2009-07,1,31,3.2
3,2023-07,1,31,3.2


In [None]:
## Function for computing a monthly statistic (mean, median or std) with and without outliers
## Can be reused for visualisation later

import sys
sys.path.append("../src")  
import pandas as pd
from outlier_detector import OutlierDetector
from monthly_statistics import MonthlyStatistics

def statistikk_med_og_uten_outliers(
        by: str,
        element_id: str,
        time_offset: str,
        statistikk: str,
        data_dir: str = "../data/processed"
    ) -> pd.DataFrame:
    """
    Compute one monthly statistic (mean, median or std) both with and without
    the values flagged by ``OutlierDetector.detect_iqr(..., extreme=True)``.

    Rows are loaded through ``MonthlyStatistics._load_city(by)``, filtered on
    ``elementId`` and ``timeOffset`` and grouped by calendar month of
    ``referenceTime``. Months without any numeric value are skipped.
    The standard deviation is the population standard deviation (``ddof=0``).

    Parameters
    ----------
    by : city name handed to ``_load_city``, e.g. "oslo".
    element_id : value to match in ``elementId``, e.g. "mean(air_temperature P1D)".
    time_offset : value to match in ``timeOffset``, e.g. "PT0H".
    statistikk : "mean", "median" or "std".
    data_dir : directory handed to ``MonthlyStatistics``.

    Returns
    -------
    DataFrame sorted by ``year_month`` with the columns
        year_month, <stat>_with_outliers, <stat>_without_outliers,
        outliers_removed, elementId
    Statistics are rounded to three decimals. ``<stat>_without_outliers`` is
    None when every value of the month was flagged. An empty frame with these
    columns is returned when there is no usable data.

    Raises
    ------
    ValueError : if ``statistikk`` is not one of the supported names.
    """
    # Validate first so a typo fails before any data is loaded.
    if statistikk not in {"mean", "median", "std"}:
        raise ValueError("statistikk må være 'mean', 'median' eller 'std'")

    # Defined up front (not inside the loop) so it exists even when no month
    # produces a row.
    kolonnenavn = [
        "year_month",
        f"{statistikk}_with_outliers",
        f"{statistikk}_without_outliers",
        "outliers_removed",
        "elementId"
    ]

    stats = MonthlyStatistics(data_dir)
    detect = OutlierDetector()

    df = stats._load_city(by)
    df = df[
        (df["elementId"] == element_id) &
        (df["timeOffset"] == time_offset)
    ].copy()

    if df.empty:
        return pd.DataFrame(columns=kolonnenavn)

    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df["referenceTime"] = pd.to_datetime(df["referenceTime"], utc=True)
    # Drop the timezone before converting to a monthly period label.
    df["year_month"] = df["referenceTime"].dt.tz_localize(None).dt.to_period("M").astype(str)

    def _beregn(verdier: pd.Series) -> float:
        """Apply the requested statistic to one series of values."""
        if statistikk == "std":
            return verdier.std(ddof=0)
        return getattr(verdier, statistikk)()

    resultater = []

    for ym, gruppe in df.groupby("year_month"):
        serie = gruppe["value"]

        # Skip months with no numeric values before involving the detector.
        if serie.dropna().empty:
            continue

        outlier_mask = detect.detect_iqr(serie, extreme=True)
        renset = serie.where(~outlier_mask).dropna()

        verdi_full = _beregn(serie)
        verdi_renset = round(_beregn(renset), 3) if not renset.empty else None

        resultater.append({
            "year_month": ym,
            f"{statistikk}_with_outliers": round(verdi_full, 3),
            f"{statistikk}_without_outliers": verdi_renset,
            "outliers_removed": int(outlier_mask.sum()),
            "elementId": element_id
        })

    return (
        pd.DataFrame(resultater, columns=kolonnenavn)
        .sort_values("year_month")
        .reset_index(drop=True)
    )

# Adjustable parameters
by = "oslo"
element_id = "range(air_temperature P1D)"
time_offset = "PT0H"
statistikk = "std"  # "mean", "median" or "std"

# Compute the chosen statistic per month and show the first rows
df_snitt = statistikk_med_og_uten_outliers(by, element_id, time_offset, statistikk)
df_snitt.head()

Unnamed: 0,year_month,std_with_outliers,std_without_outliers,outliers_removed,elementId
0,2005-01,2.796,2.796,0,range(air_temperature P1D)
1,2005-02,2.701,2.701,0,range(air_temperature P1D)
2,2005-03,3.139,3.139,0,range(air_temperature P1D)
3,2005-10,1.715,1.715,0,range(air_temperature P1D)
4,2005-12,0.0,0.0,0,range(air_temperature P1D)


In [21]:
## Function for analysing combinations of variables
## Can be reused for visualisation later

import sys
sys.path.append("../src")  
import re
import pandas as pd
from monthly_statistics import MonthlyStatistics
from outlier_detector import OutlierDetector


def kombiner_variabler_analyse(
    city: str,
    element_id1: str,
    element_id2: str,
    element_id3: str,
    statistic: str = "mean",        # "mean", "median" eller "std"
    frequency: str = "ME",           # "D", "W", "ME", "Y"
    remove_outliers: bool = True,
    start: str | None = None,       # "YYYY-MM" eller "YYYY-MM-DD"
    end: str | None = None,         # samme format som start
) -> pd.DataFrame:
    """
    Kombinerer (element_id1 + element_id2) og sammenligner med element_id3.

    Resultatet er én DataFrame med
        periode | komb12_<stat> | elem3_<stat> | n_outliers
    der <stat> er mean/median/std.

    Parametre
    ---------
    city, element_id1/2/3, statistic, frequency, remove_outliers, start, end
    """
    # --- Init ---
    stats = MonthlyStatistics("../data/processed")
    detector = OutlierDetector()

    # --- Last hele by‑DataFrame én gang ---
    df = stats._load_city(city)

    # --- Begrens periode hvis ønsket ---
    if start or end:
        mask_period = pd.Series(True, index=df.index)
        if start:
            start_ts = pd.to_datetime(start).tz_localize("UTC")
            mask_period &= df["referenceTime"] >= start_ts

        if end:
            end_ts = pd.to_datetime(end).tz_localize("UTC")
            mask_period &= df["referenceTime"] <= end_ts
        df = df[mask_period]

    # --- Hjelper: finn laveste timeOffset for et elementId ---
    def _laveste_offset(elem: str) -> str:
        offs = df.loc[df["elementId"] == elem, "timeOffset"].dropna().unique()
        # "PT0H", "PT6H" ...  => sortér på antall timer
        hours = [int(re.search(r"PT(\d+)H", o).group(1)) for o in offs]
        return offs[hours.index(min(hours))]

    # --- Velg ut min‑offset per variabel ---
    off1 = _laveste_offset(element_id1)
    off2 = _laveste_offset(element_id2)
    off3 = _laveste_offset(element_id3)

    sub1 = df[(df["elementId"] == element_id1) & (df["timeOffset"] == off1)].copy()
    sub2 = df[(df["elementId"] == element_id2) & (df["timeOffset"] == off2)].copy()
    sub3 = df[(df["elementId"] == element_id3) & (df["timeOffset"] == off3)].copy()

    # --- Felles pre‑arbeid ---
    for sub in (sub1, sub2, sub3):
        sub["value"] = pd.to_numeric(sub["value"], errors="coerce")
        sub["referenceTime"] = pd.to_datetime(sub["referenceTime"], utc=True)
        sub.set_index("referenceTime", inplace=True)

    # --- Aggregér til ønsket frekvens & statistikk ---
    def _agg(sub: pd.DataFrame) -> pd.Series:
        ser = sub["value"]
        if remove_outliers:
            ser = ser[~detector.detect_iqr(ser, extreme=True)]
        fn = {"mean": "mean", "median": "median", "std": lambda x: x.std(ddof=0)}[statistic]
        return getattr(ser.resample(frequency), fn)()

    s1 = _agg(sub1)
    s2 = _agg(sub2)
    s3 = _agg(sub3)

    # --- Sett sammen resultat ---
    df_out = pd.DataFrame({
        "komb12_" + statistic: (s1 + s2),
        f"{element_id3}_{statistic}": s3,
    }).dropna(how="all")  # fjerner perioder uten data

    # legg på antall outliers fjernet hvis ønsket
    if remove_outliers:
        outlier_count = (
            detector.detect_iqr(sub1["value"], extreme=True).resample(frequency).sum() +
            detector.detect_iqr(sub2["value"], extreme=True).resample(frequency).sum() +
            detector.detect_iqr(sub3["value"], extreme=True).resample(frequency).sum()
        )
        df_out["n_outliers"] = outlier_count.astype("Int64")

    df_out.index.name = "periode"
    df_out.reset_index(inplace=True)
    return df_out


# Monthly mean of (mean temperature + temperature range) compared with the
# monthly mean of daily precipitation for Oslo, with outliers removed.
df = kombiner_variabler_analyse(
    city="oslo",
    element_id1="mean(air_temperature P1D)",
    element_id2="range(air_temperature P1D)",
    element_id3="sum(precipitation_amount P1D)",
    statistic="mean",
    frequency="ME",
    remove_outliers=True,
    start="2005-01",
    end="2024-12",
)
# Last expression of the cell: show the first rows of the result
df.head()


Unnamed: 0,periode,komb12_mean,sum(precipitation_amount P1D)_mean,n_outliers
0,2005-01-31 00:00:00+00:00,7.106452,0.816667,1
1,2005-02-28 00:00:00+00:00,4.725,0.357143,0
2,2005-03-31 00:00:00+00:00,8.309677,0.8,0
3,2005-04-30 00:00:00+00:00,,0.562069,1
4,2005-05-31 00:00:00+00:00,,2.025,3


In [20]:
import pandas as pd

def les_manglende_verdier(path: str) -> pd.DataFrame:
    """
    Read a CSV of missing values and return the rows where 'oslo_value'
    and/or 'tromso_value' is missing.

    Parameters
    ----------
    path : path to a CSV file containing at least the columns
        date, timeOffset, elementId, oslo_value and tromso_value.

    Returns
    -------
    DataFrame (index reset) with the columns
        - date
        - timeOffset
        - elementId
        - mangler: "Oslo", "Tromsø", or "Oslo og Tromsø" when both values
          are missing in the same row
    """
    df = pd.read_csv(path)

    # Find which values are missing
    mangler_oslo = df["oslo_value"].isna()
    mangler_tromso = df["tromso_value"].isna()

    # One column telling which city lacks a value. Rows missing both get their
    # own label last, so the Tromsø assignment cannot hide that Oslo is missing too.
    df["mangler"] = None
    df.loc[mangler_oslo, "mangler"] = "Oslo"
    df.loc[mangler_tromso, "mangler"] = "Tromsø"
    df.loc[mangler_oslo & mangler_tromso, "mangler"] = "Oslo og Tromsø"

    # Keep only rows where something is missing, and only the relevant columns
    utvalg = df.loc[
        mangler_oslo | mangler_tromso,
        ["date", "timeOffset", "elementId", "mangler"],
    ]
    return utvalg.reset_index(drop=True)

# Load the missing-value overview and show its first rows
df_mangler = les_manglende_verdier("../data/missing/missing_in_both.csv")
df_mangler.head()



Unnamed: 0,date,timeOffset,elementId,mangler
0,2003-06-01,PT0H,max(air_temperature P1D),Oslo
1,2003-06-01,PT0H,min(air_temperature P1D),Oslo
2,2003-06-01,PT0H,range(air_temperature P1D),Oslo
3,2003-06-02,PT0H,max(air_temperature P1D),Oslo
4,2003-06-02,PT0H,min(air_temperature P1D),Oslo
