# Qualitycontol of Groundtruth
This script allows the comparison of two different groundtruth data to find differences in classification and missing 

In [1]:
import pandas as pd

def compare_dataframes(df1, df2, time_threshold=300):
    """
    Diese Funktion sucht nach ähnlichen Zeilen in den beiden Dataframes basierend auf der 'SectionID' und 'DateTime'.
    Die Zeilen werden als ähnlich betrachtet, wenn die 'SectionID' identisch ist und der 'DateTime'-Unterschied kleiner ist als der
    angegebene 'time_threshold' in Sekunden.

    :param df1: Erstes DataFrame
    :param df2: Zweites DataFrame
    :param time_threshold: Zeitschwellenwert in Sekunden (Standardwert: 300)
    :return: Ein Tuple mit zwei DataFrames, die die nicht ähnlichen Zeilen der beiden Eingabe-DataFrames enthalten.
    """
    # Zuerst die Datetime-Spalten in datetime64 konvertieren, falls sie noch nicht konvertiert wurden
    df1["DateTime"] = pd.to_datetime(df1["DateTime"])
    df2["DateTime"] = pd.to_datetime(df2["DateTime"])

    # Leere DataFrames erstellen, um nicht ähnliche Zeilen zu speichern
    not_similar_df1 = pd.DataFrame(columns=df1.columns)
    not_similar_df2 = pd.DataFrame(columns=df2.columns)

    for index1, row1 in df1.iterrows():
        similar = False
        for index2, row2 in df2.iterrows():
            if row1["SectionID"] == row2["SectionID"]:
                time_difference = abs((row1["DateTime"] - row2["DateTime"]).total_seconds())
                if time_difference <= time_threshold:
                    similar = True
                    break
        if not similar:
            not_similar_df1 = not_similar_df1.append(row1)

    for index2, row2 in df2.iterrows():
        similar = False
        for index1, row1 in df1.iterrows():
            if row1["SectionID"] == row2["SectionID"]:
                time_difference = abs((row1["DateTime"] - row2["DateTime"]).total_seconds())
                if time_difference <= time_threshold:
                    similar = True
                    break
        if not similar:
            not_similar_df2 = not_similar_df2.append(row2)

    return not_similar_df1, not_similar_df2


In [64]:
gt =  pd.read_csv("./data/gt_data/TUDCam01_FR20_2022-09-20_08-15-00.csv")
gtv = pd.read_csv("./data/gtv_data/TUDCam01_FR20_2022-09-20_08-15-00-events.csv")

In [65]:
#Class translation Bicycle zu Fahrrad ohne Anhänger
#wird  nur  für TUDCam datensatz benötigt!
gtv['Class'] = gtv['Class'].str.replace('Bicycle', 'Fahrrad ohne Anhänger')

In [66]:
#Absolute Zeilen differenz
gt.shape[0] - gtv.shape[0]

28

In [67]:
#gt["Class"].value_counts() - gtv["Class"].value_counts()
gt_f = gt["Class"].value_counts()
gt_f.name = "gt"
gtv_f = gtv["Class"].value_counts()
gtv_f.name = "gtv"
gt_merged = pd.concat([gt_f, gtv_f], axis = 1)
gt_merged.fillna(value=0, inplace=True)
gt_merged["diff"] = gt_merged["gt"] - gt_merged["gtv"]
gt_merged["rel"] = gt_merged["diff"] / gt_merged["gt"] * 100
gt_merged["prio"] = abs(gt_merged["rel"]) * gt_merged["gt"] / gt.shape[0]
gt_merged.sort_values("prio", ascending= False )
#Wenn positiv dann hat gt mehr gezählt als gtv  

Unnamed: 0,gt,gtv,diff,rel,prio
Fahrrad ohne Anhänger,181,166.0,15.0,8.287293,7.211538
Schienenfahrzeug,6,0.0,6.0,100.0,2.884615
Person,19,14.0,5.0,26.315789,2.403846
Lkw ohne Anhänger,2,0.0,2.0,100.0,0.961538


In [68]:
comp = "Fahrrad ohne Anhänger"

In [69]:
gt_diff, gtv_diff = compare_dataframes(gt[gt["Class"] == comp], gtv[gtv["Class"] == comp], 2);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["DateTime"] = pd.to_datetime(df1["DateTime"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["DateTime"] = pd.to_datetime(df2["DateTime"])
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df1 = not_similar_df1.append(row1)
  not_similar_df2 = not_sim

In [70]:
gt_diff.sort_values("DateTime") #ggf falsch Klassifiziert!

Unnamed: 0,EventID,SectionID,TrackID,X,Y,Frame,Class,Vidfilename,DateTime
103,107,Nord,56,432,173,9725,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:23:06
104,108,Sued,56,471,113,9826,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:23:11
123,127,Nord,66,219,165,11326,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:24:26
124,128,Sued,66,390,110,11390,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:24:29
168,174,Sued,89,402,112,15785,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:28:09
204,210,Nord,108,213,166,17768,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:29:48
205,211,Sued,108,392,111,17827,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:29:51


In [71]:
gtv_diff.sort_values("DateTime") #ggf nicht erfasst!

Unnamed: 0,EventID,SectionID,TrackID,X,Y,Frame,Class,Vidfilename,DateTime
2,3,Nord,2,308,168,486,Fahrrad ohne Anhänger,TUDCam01_FR20_2022-09-20_08-15-00.mp4,2022-09-20 08:15:24
