In [16]:
import pandas as pd


In [None]:
# Read the recording log; the first row of the file holds the column names.
df = pd.read_csv("recordingDatabase.csv", sep=",", header=0)

# Drop PersonID 0 (the system itself) and PersonID 23 (not part of the analysis).
df = df[~df["PersonID"].isin([0, 23])]

# A row is redundant when the row directly after it (in file order) has the
# same Role, TypeOfRecording and PersonID. The last row has no successor,
# compares against NaN, and is therefore never flagged.
consecutive_keys = ["Role", "TypeOfRecording", "PersonID"]
same_as_next = df[consecutive_keys].eq(df[consecutive_keys].shift(-1)).all(axis=1)

# Keep only the final row of every such run, then parse the timestamps
# (day-first date strings) into proper datetimes.
df = df.loc[~same_as_next].assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], dayfirst=True)
)




In [18]:
def compute_latency(df, role_from, type_from, role_to, type_to, max_latency_seconds=10):
    """Measure, per person, the delay from each "from" event to the next "to" event.

    For every PersonID the rows matching (role_from, type_from) are paired with
    the first row matching (role_to, type_to) whose Timestamp is at or after
    the "from" Timestamp (``pd.merge_asof`` with ``direction="forward"``).
    A "to" row is not consumed by a match, so several "from" rows can pair with
    the same "to" row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "PersonID", "Role", "TypeOfRecording" and a
        datetime-typed "Timestamp".
    role_from, type_from
        Values of "Role" / "TypeOfRecording" that select the start events.
    role_to, type_to
        Values of "Role" / "TypeOfRecording" that select the end events.
    max_latency_seconds : float, default 10
        Pairs whose latency is not strictly below this value are discarded,
        as are "from" events without any later "to" event (their latency is NaN).

    Returns
    -------
    pandas.DataFrame
        Columns "PersonID", "Timestamp_to", "Timestamp_from", "latency_seconds",
        one row per retained pair, ordered by PersonID and then by
        "Timestamp_from". Empty (with the same columns) when nothing matches.
    """
    out_cols = ["PersonID", "Timestamp_to", "Timestamp_from", "latency_seconds"]
    results = []

    for pid, group_df in df.groupby("PersonID"):
        is_from = (group_df["Role"] == role_from) & (group_df["TypeOfRecording"] == type_from)
        is_to = (group_df["Role"] == role_to) & (group_df["TypeOfRecording"] == type_to)

        # Only PersonID and Timestamp are needed downstream; selecting them up
        # front avoids the Role_x / Role_y style suffix columns in the merge.
        df_from = (
            group_df.loc[is_from, ["PersonID", "Timestamp"]]
            .sort_values("Timestamp")
            .rename(columns={"Timestamp": "Timestamp_from"})
        )
        df_to = (
            group_df.loc[is_to, ["PersonID", "Timestamp"]]
            .sort_values("Timestamp")
            .rename(columns={"Timestamp": "Timestamp_to"})
        )

        # Without start events or without end events this person cannot
        # contribute a single valid pair.
        if df_from.empty or df_to.empty:
            continue

        # merge_asof requires both sides sorted by their time key (done above).
        merged = pd.merge_asof(
            df_from,
            df_to,
            left_on="Timestamp_from",
            right_on="Timestamp_to",
            by="PersonID",
            direction="forward",
        )

        merged["latency_seconds"] = (
            merged["Timestamp_to"] - merged["Timestamp_from"]
        ).dt.total_seconds()

        # Drop implausibly long latencies because they may indicate a system
        # error. The cut-off defaults to 10 seconds, the value the code has
        # always applied. NaN latencies (no later "to" event) fail the
        # comparison and are dropped as well.
        merged = merged[merged["latency_seconds"] < max_latency_seconds]
        results.append(merged[out_cols])

    if not results:
        # pd.concat raises ValueError on an empty list, so return an empty
        # frame with the expected columns instead.
        return pd.DataFrame(
            {
                "PersonID": pd.Series(dtype="object"),
                "Timestamp_to": pd.Series(dtype="datetime64[ns]"),
                "Timestamp_from": pd.Series(dtype="datetime64[ns]"),
                "latency_seconds": pd.Series(dtype="float64"),
            }
        )

    return pd.concat(results, ignore_index=True)



In [None]:
# Put every person's events in chronological order and renumber the rows.
df = df.sort_values(by=["PersonID", "Timestamp"]).reset_index(drop=True)

# One latency table per pipeline stage. compute_latency pairs each "from"
# event with the next "to" event of the same person.
lat_user_rec_stt = compute_latency(df, "User", "Recording", "User", "STT")
lat_sys_answer = compute_latency(df, "User", "STT", "Answer", "TextCompletion")
lat_answer_tts = compute_latency(df, "Answer", "TextCompletion", "Assistant", "TTS")

# Per-stage summary: mean latency ("Durchschnitt") and number of retained
# values ("Anzahl Werte").
for heading, stage_latencies in (
    ("=== User Recording → STT ===", lat_user_rec_stt),
    ("=== User STT → Answer ===", lat_sys_answer),
    ("=== Answer → TTS ===", lat_answer_tts),
):
    print(heading)
    print("Durchschnitt:", stage_latencies["latency_seconds"].mean())
    print("Anzahl Werte:", len(stage_latencies))
    print()

# Full descriptive statistics (in seconds) for each stage.
print("=== Globale Statistik (Sekunden) ===")
stage_labels = ("UserRec→STT", "SysTxt→Answer", "Answer→TTS")
stage_tables = (lat_user_rec_stt, lat_sys_answer, lat_answer_tts)
# NOTE: `df_` remains bound to the last table (Answer→TTS) once this loop
# finishes; keep the name and the ordering unless nothing downstream uses it.
for label, df_ in zip(stage_labels, stage_tables):
    print(f"{label}:")
    print(df_["latency_seconds"].describe())
    print()

In [None]:
import numpy as np
import itertools


# Between-person comparison of the Answer → TTS latencies.
# The table is read by name: previously this cell picked it up through `df_`,
# which was only the leftover loop variable of the statistics loop and so
# depended on which cell had been executed last.
# NOTE(review): confirm that Answer → TTS is the stage meant to be compared.
# PersonID 24 is excluded because there is no data for that person.
df_ = lat_answer_tts[lat_answer_tts["PersonID"] != 24]

# Mean latency per person.
m = df_.groupby("PersonID")["latency_seconds"].mean()
print(m)
person_ids = m.index.tolist()

# Absolute difference of the per-person means for every unordered pair of persons.
diffs = [
    abs(m.loc[p1] - m.loc[p2])
    for p1, p2 in itertools.combinations(person_ids, 2)
]
# With fewer than two persons there are no pairs; report NaN without the
# RuntimeWarning that np.mean emits for an empty list.
avg_diff = np.mean(diffs) if diffs else float("nan")

print("=== Average difference between persons ===")
print(avg_diff)
