# Datenfehler

## Initialisierung

In [196]:
import pandas as pd
import numpy as np

In [197]:
def dfTimeSeries():
    """Build a small daily demo time series with artificial gaps.

    Returns:
        pandas.DataFrame with ten rows (default RangeIndex 0..9) and columns
        - "Datum":  consecutive days starting 2020-01-01 (datetime),
        - "Werte":  the complete float measurements,
        - "WerteM": a copy of "Werte" with rows 2, 5 and 6 set to NaN.
    """
    df = pd.DataFrame(
        {
            # date_range already yields a datetime column, so no extra
            # pd.to_datetime conversion is required afterwards.
            "Datum": pd.date_range(start="2020-01-01", periods=10, freq="D"),
            "Werte": [157.0, 142, 153, 161, 158, 149, 142, 138, 132, 136],
        }
    )
    # Explicit copy so that punching holes into "WerteM" can never touch "Werte".
    df["WerteM"] = df["Werte"].copy()
    # Create the missing values. np.nan is used instead of the np.NaN alias,
    # which no longer exists in NumPy 2.0.
    df.loc[[2, 5, 6], "WerteM"] = np.nan
    return df

def dfStetigDiskret():
    """Build a five-row demo frame with one continuous and one discrete feature.

    Returns:
        pandas.DataFrame with columns
        - "Stetig" / "Diskret":   the complete float values,
        - "StetigM" / "DiskretM": copies of those columns in which row 2
          has been replaced by NaN.
    """
    df = pd.DataFrame(
        {
            "Stetig": [-50.0, -5, 20, 50, 100],
            "Diskret": [1.0, 2, 1, 2, 2],
        }
    )
    df["StetigM"] = df["Stetig"].copy()
    df["DiskretM"] = df["Diskret"].copy()
    # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0.
    df.loc[2, ["StetigM", "DiskretM"]] = np.nan
    return df

def dfPersonen():
    """Build a six-row demo frame of persons belonging to two categories.

    Returns:
        pandas.DataFrame with columns
        - "Name":   string label of the person,
        - "Art":    integer category code (1 or 2),
        - "Alter":  the complete float ages,
        - "AlterM": a copy of "Alter" in which row 2 has been replaced by NaN.
    """
    df = pd.DataFrame(
        {
            "Name": ["S1", "S2", "S3", "S4", "L1", "L2"],
            "Art": [1, 1, 1, 1, 2, 2],
            "Alter": [10.0, 11, 10, 9, 35, 57],
        }
    )
    df["AlterM"] = df["Alter"].copy()
    # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0.
    df.loc[2, "AlterM"] = np.nan
    return df



## Analyse

## Datensätze entfernen

In [198]:
# Load a fresh copy of the demo time series; "WerteM" is the column that
# contains the artificially created gaps.
df = dfTimeSeries()
df


Unnamed: 0,Datum,Werte,WerteM
0,2020-01-01,157.0,157.0
1,2020-01-02,142.0,142.0
2,2020-01-03,153.0,
3,2020-01-04,161.0,161.0
4,2020-01-05,158.0,158.0
5,2020-01-06,149.0,
6,2020-01-07,142.0,
7,2020-01-08,138.0,138.0
8,2020-01-09,132.0,132.0
9,2020-01-10,136.0,136.0


In [199]:
# Strategy "remove records": drop the rows that contain missing values.
# The surviving rows keep their original index labels.
df = df.dropna()
df

Unnamed: 0,Datum,Werte,WerteM
0,2020-01-01,157.0,157.0
1,2020-01-02,142.0,142.0
3,2020-01-04,161.0,161.0
4,2020-01-05,158.0,158.0
7,2020-01-08,138.0,138.0
8,2020-01-09,132.0,132.0
9,2020-01-10,136.0,136.0


## Konstanter Wert

In [200]:
# Strategy "constant value": replace each gap by a fixed constant
# (0 for the continuous feature, 1 for the discrete one).
df = dfStetigDiskret()
df = df.assign(
    StetigKonst=df["StetigM"].fillna(value=0),
    # With the gap closed, the discrete column can be shown as integers.
    DiskretKonst=df["DiskretM"].fillna(value=1).astype("Int64"),
)
df

Unnamed: 0,Stetig,Diskret,StetigM,DiskretM,StetigKonst,DiskretKonst
0,-50.0,1.0,-50.0,1.0,-50.0,1
1,-5.0,2.0,-5.0,2.0,-5.0,2
2,20.0,1.0,,,0.0,1
3,50.0,2.0,50.0,2.0,50.0,2
4,100.0,2.0,100.0,2.0,100.0,2


## Mittelwert, Median

In [201]:
# Strategy "mean / median": fill the gap in the continuous feature with the
# mean and, separately, with the median of the observed values.
df = dfStetigDiskret().drop(columns=["Diskret", "DiskretM"])
StetigMean = df["StetigM"].mean()
StetigMedian = df["StetigM"].median()
df = df.assign(
    StetigMean=df["StetigM"].fillna(StetigMean),
    StetigMedian=df["StetigM"].fillna(StetigMedian),
)
df


Unnamed: 0,Stetig,StetigM,StetigMean,StetigMedian
0,-50.0,-50.0,-50.0,-50.0
1,-5.0,-5.0,-5.0,-5.0
2,20.0,,23.75,22.5
3,50.0,50.0,50.0,50.0
4,100.0,100.0,100.0,100.0


## Modus

In [202]:
# Strategy "mode": fill the gap in the discrete feature with its most
# frequent observed value.
df = dfStetigDiskret().drop(columns=["Stetig", "StetigM"])
# mode() returns a Series (several values on ties); the first entry is used.
DiskretMode = df["DiskretM"].mode()
df = df.assign(
    DiskretMode=df["DiskretM"].fillna(DiskretMode.iloc[0]).astype("Int64"),
)
df


Unnamed: 0,Diskret,DiskretM,DiskretMode
0,1.0,1.0,1
1,2.0,2.0,2
2,1.0,,2
3,2.0,2.0,2
4,2.0,2.0,2


## Zeitreihen

In [203]:
# Time-series strategies for closing the gaps in "WerteM":
#   "LCOF"     - forward fill: carry the last observed value forward
#   "NOCB"     - backward fill: carry the next observed value backward
#   "Interpol" - linear interpolation between the neighbouring observations
#
# The results are assigned directly instead of calling
# df[col].fillna(method=..., inplace=True): the `method` keyword of fillna is
# deprecated, and an in-place call on a selected column is chained assignment
# that leaves `df` unchanged when pandas Copy-on-Write is active.
df = dfTimeSeries()
df["LCOF"] = df["WerteM"].ffill()
df["NOCB"] = df["WerteM"].bfill()
df["Interpol"] = df["WerteM"].interpolate(method="linear")
df


Unnamed: 0,Datum,Werte,WerteM,LCOF,NOCB,Interpol
0,2020-01-01,157.0,157.0,157.0,157.0,157.0
1,2020-01-02,142.0,142.0,142.0,142.0,142.0
2,2020-01-03,153.0,,142.0,161.0,151.5
3,2020-01-04,161.0,161.0,161.0,161.0,161.0
4,2020-01-05,158.0,158.0,158.0,158.0,158.0
5,2020-01-06,149.0,,158.0,138.0,151.333333
6,2020-01-07,142.0,,158.0,138.0,144.666667
7,2020-01-08,138.0,138.0,138.0,138.0,138.0
8,2020-01-09,132.0,132.0,132.0,132.0,132.0
9,2020-01-10,136.0,136.0,136.0,136.0,136.0


## Median Teilgruppen

In [204]:
# Strategy "statistic per subgroup": compare filling the missing age with the
# overall mean against filling it with the median of the person's own "Art"
# group.
df = dfPersonen()
df["AlterMeanFull"] = df["AlterM"].fillna(df["AlterM"].mean())

# transform("median") returns, for every row, the median of the observed ages
# within that row's "Art" group, aligned to the original index. This replaces
# the manual loop over df["Art"].unique() with boolean masks.
alter_median_by_art = df.groupby("Art")["AlterM"].transform("median")
# NOTE: despite its name, "AlterMeanPart" holds the per-group *median*.
df["AlterMeanPart"] = df["AlterM"].fillna(alter_median_by_art)

# Show only the rows whose age was originally missing.
df[df["AlterM"].isnull()]

Unnamed: 0,Name,Art,Alter,AlterM,AlterMeanFull,AlterMeanPart
2,S3,1,10.0,,24.4,10.0
