In [16]:
import pandas as pd


In [17]:
# Load the raw recording log; the first CSV line holds the column names.
df = pd.read_csv("recordingDatabase.csv", sep=",", header=0)

# Persons 0 and 23 are excluded from the analysis.
df = df.loc[~df["PersonID"].isin([0, 23])]

# True for a row when the row directly after it (in file order) carries the
# same Role, TypeOfRecording and PersonID. Dropping those rows keeps only the
# last entry of every run of consecutive identical events.
run_keys = df[["Role", "TypeOfRecording", "PersonID"]]
same_as_next = run_keys.eq(run_keys.shift(-1)).all(axis=1)

df = df.loc[~same_as_next].copy()

# Convert the timestamp column to datetimes, reading ambiguous dates day-first.
df["Timestamp"] = pd.to_datetime(df["Timestamp"], dayfirst=True)




      PersonID  Role TypeOfRecording           Timestamp ToolCall  \
12           4  User       Recording 2024-11-25 13:47:05      NaN   
13           4  User             STT 2024-11-25 13:47:06      NaN   
17           4  User  TextCompletion 2024-11-25 13:47:10      NaN   
34           4  User       Recording 2024-11-25 13:48:49      NaN   
35           4  User             STT 2024-11-25 13:48:52      NaN   
...        ...   ...             ...                 ...      ...   
8881        22  User             STT 2024-11-28 11:40:13      NaN   
8882        22  User  TextCompletion 2024-11-28 11:40:17      NaN   
8939        22  User       Recording 2024-11-28 11:45:36      NaN   
8940        22  User             STT 2024-11-28 11:45:38      NaN   
8941        22  User  TextCompletion 2024-11-28 11:45:42      NaN   

             Condition  
12    ECA + kein Audio  
13    ECA + kein Audio  
17    ECA + kein Audio  
34    ECA + kein Audio  
35    ECA + kein Audio  
...                ..

In [18]:
def compute_latency(df, role_from, type_from, role_to, type_to,
                    max_latency_seconds=10):
    """Measure, per person, the delay from each "from" event to the next "to" event.

    For every PersonID the rows matching (role_from, type_from) are paired with
    the first row matching (role_to, type_to) whose Timestamp is at or after
    the "from" Timestamp. Several "from" rows can be paired with the same "to"
    row when no "to" event lies between them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "PersonID", "Role", "TypeOfRecording" and a
        datetime column "Timestamp".
    role_from, type_from :
        Role / TypeOfRecording values selecting the start events.
    role_to, type_to :
        Role / TypeOfRecording values selecting the end events.
    max_latency_seconds : float, default 10
        Only pairs with a latency strictly below this many seconds are kept;
        longer gaps may indicate a system error. "From" rows without any
        later "to" row are dropped as well.

    Returns
    -------
    pandas.DataFrame
        Columns "PersonID", "Timestamp_to", "Timestamp_from" and
        "latency_seconds", with a fresh 0..n-1 index. Empty (but with these
        columns) when no pair qualifies.
    """
    output_columns = ["PersonID", "Timestamp_to", "Timestamp_from", "latency_seconds"]
    results = []

    for _, group_df in df.groupby("PersonID"):
        # Only this person's rows, filtered by Role + TypeOfRecording.
        is_from = (group_df["Role"] == role_from) & (group_df["TypeOfRecording"] == type_from)
        is_to = (group_df["Role"] == role_to) & (group_df["TypeOfRecording"] == type_to)

        # Without both kinds of event there is nothing to pair for this person.
        if not is_from.any() or not is_to.any():
            continue

        # merge_asof needs both sides sorted by their time key. Only the
        # columns used downstream are carried into the merge.
        df_from = (
            group_df.loc[is_from, ["PersonID", "Timestamp"]]
            .sort_values("Timestamp")
            .rename(columns={"Timestamp": "Timestamp_from"})
        )
        df_to = (
            group_df.loc[is_to, ["PersonID", "Timestamp"]]
            .sort_values("Timestamp")
            .rename(columns={"Timestamp": "Timestamp_to"})
        )

        # `by="PersonID"` keeps PersonID as a single shared column in the result.
        merged = pd.merge_asof(
            df_from,
            df_to,
            left_on="Timestamp_from",
            right_on="Timestamp_to",
            by="PersonID",
            direction="forward",
        )

        merged["latency_seconds"] = (
            merged["Timestamp_to"] - merged["Timestamp_from"]
        ).dt.total_seconds()

        # Unmatched rows have NaN latency and fail this comparison too.
        merged = merged[merged["latency_seconds"] < max_latency_seconds]
        results.append(merged[output_columns])

    if not results:
        # pd.concat refuses an empty list; return an empty frame instead.
        return pd.DataFrame(columns=output_columns)
    return pd.concat(results, ignore_index=True)



In [19]:
# merge_asof requires sorted keys: order the log by person and time and
# renumber the index from zero.
df = df.sort_values(["PersonID", "Timestamp"]).reset_index(drop=True)

# One latency table per pipeline stage.
lat_user_rec_stt = compute_latency(
    df, role_from="User", type_from="Recording", role_to="User", type_to="STT"
)
# 3.2 User STT → Answer
lat_sys_answer = compute_latency(
    df, role_from="User", type_from="STT", role_to="Answer", type_to="TextCompletion"
)
# 3.3 Answer → TTS
lat_answer_tts = compute_latency(
    df, role_from="Answer", type_from="TextCompletion", role_to="Assistant", type_to="TTS"
)

# Mean latency and number of measured pairs for each stage.
for header, latencies in [
    ("=== User Recording → STT ===", lat_user_rec_stt),
    ("=== User STT → Answer ===", lat_sys_answer),
    ("=== Answer → TTS ===", lat_answer_tts),
]:
    print(header)
    print("Durchschnitt:", latencies["latency_seconds"].mean())
    print("Anzahl Werte:", len(latencies))
    print()

# Full descriptive statistics (in seconds) for each stage.
print("=== Globale Statistik (Sekunden) ===")
stage_tables = [
    ("UserRec→STT", lat_user_rec_stt),
    ("SysTxt→Answer", lat_sys_answer),
    ("Answer→TTS", lat_answer_tts),
]
for label, df_ in stage_tables:
    print(f"{label}:")
    print(df_["latency_seconds"].describe())
    print()

=== User Recording → STT ===
Durchschnitt: 1.8686131386861313
Anzahl Werte: 274

=== User STT → Answer ===
Durchschnitt: 7.11353711790393
Anzahl Werte: 229

=== Answer → TTS ===
Durchschnitt: 1.3978779840848807
Anzahl Werte: 754

=== Globale Statistik (Sekunden) ===
UserRec→STT:
count    274.000000
mean       1.868613
std        0.801258
min        0.000000
25%        1.000000
50%        2.000000
75%        2.000000
max        5.000000
Name: latency_seconds, dtype: float64

SysTxt→Answer:
count    229.000000
mean       7.113537
std        1.568674
min        2.000000
25%        6.000000
50%        8.000000
75%        8.000000
max        9.000000
Name: latency_seconds, dtype: float64

Answer→TTS:
count    754.000000
mean       1.397878
std        0.770158
min        0.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        6.000000
Name: latency_seconds, dtype: float64



In [21]:
import numpy as np
import itertools


# Per-person comparison of the Answer → TTS latency.
# The frame is named explicitly so this cell does not depend on whatever `df_`
# happened to be bound to by an earlier cell, and so re-running it always
# starts from the same input.
# PersonID 24 is excluded (original note: there is no data for this person).
df_ = lat_answer_tts[lat_answer_tts["PersonID"] != 24]

# Mean latency in seconds for each person.
m = df_.groupby("PersonID")["latency_seconds"].mean()
print(m)
person_ids = m.index.tolist()

# Absolute difference of the mean latency for every unordered pair of persons.
diffs = [
    abs(m.loc[p1] - m.loc[p2])
    for p1, p2 in itertools.combinations(person_ids, 2)
]
# With fewer than two persons there are no pairs; report NaN without the
# empty-slice warning np.mean would emit.
avg_diff = np.mean(diffs) if diffs else float("nan")

print("=== Average difference between persons ===")
print(avg_diff)


PersonID
2     0.921053
3     1.444444
4     1.588235
6     1.500000
7     1.510204
8     1.435897
9     1.040000
10    1.440000
11    1.441176
12    1.411765
13    1.473684
14    1.303030
15    1.242424
16    1.500000
17    1.473684
18    1.625000
19    1.375000
20    1.439024
21    1.440678
22    1.276596
31    1.444444
Name: latency_seconds, dtype: float64
=== Average difference between persons ===
0.16800223514120602
