In [21]:
import pandas as pd
import re
from typing import List

def parse_single_file(filepath):
    """
    Parse a single log file and return a DataFrame with:
    user, selected_user, datetime, duration, document_distance, rating, query, answer, source

    :param filepath: Path to a single log file.
    :return: A pandas DataFrame with parsed log information from that file.
    """
    
    # Regex patterns
    datetime_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    selected_user_pattern = re.compile(r'^Selected User:\s*(.*)')
    user_pattern = re.compile(r'^User:\s*([0-9a-fA-F-]+)')
    duration_pattern = re.compile(r'^Duration:\s*([\d.]+)\s*seconds')
    doc_distance_pattern = re.compile(r'^Document distance:\s*([\d.]+)')
    query_pattern = re.compile(r'^Query:\s*(.*)')
    response_pattern = re.compile(r'^Response:\s*(.*)')
    rating_pattern = re.compile(r'^User\s+([0-9a-fA-F-]+)\s+rated the response:\s+(\d+)/10')
    source_pattern = re.compile(r'^Source:\s*(.*)')
    
    entries = []
    current_entry = {}
    collecting_response = False
    
    def store_and_reset(entry_dict):
        """Store the current entry_dict in entries if it’s valid, then reset it."""
        if "datetime" in entry_dict and "user" in entry_dict:
            entry_dict.setdefault("selected_user", None)
            entry_dict.setdefault("duration", None)
            entry_dict.setdefault("document_distance", None)
            entry_dict.setdefault("rating", None)
            entry_dict.setdefault("query", None)
            entry_dict.setdefault("answer", None)
            entry_dict.setdefault("source", None)
            entries.append(entry_dict)
        return {}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip('\n')
            
            # Check for rating line
            rate_match = rating_pattern.match(line)
            if rate_match:
                rated_user = rate_match.group(1)
                rated_value = rate_match.group(2)
                # First, try to assign rating to the current entry if it matches the user and has no rating yet.
                if current_entry.get("user") == rated_user and (current_entry.get("rating") is None or current_entry.get("rating") == ""):
                    current_entry["rating"] = rated_value
                else:
                    # Otherwise, search through previously stored entries (in reverse order)
                    for e in reversed(entries):
                        if e.get("user") == rated_user and (e.get("rating") is None or e.get("rating") == ""):
                            e["rating"] = rated_value
                            break
                continue
            
            # Check if the line starts a new block (datetime)
            if datetime_pattern.match(line):
                # If we're currently holding a block, store it first
                if current_entry:
                    current_entry = store_and_reset(current_entry)
                
                # Start a new block
                current_entry["datetime"] = line.strip()
                collecting_response = False
                continue
            
            # Selected user
            select_user_match = selected_user_pattern.match(line)
            if select_user_match:
                current_entry["selected_user"] = select_user_match.group(1).strip()
                continue
            
            # User
            user_match = user_pattern.match(line)
            if user_match:
                current_entry["user"] = user_match.group(1).strip()
                continue
            
            # Duration
            duration_match = duration_pattern.match(line)
            if duration_match:
                current_entry["duration"] = duration_match.group(1).strip()
                continue
            
            # Document distance
            doc_dist_match = doc_distance_pattern.match(line)
            if doc_dist_match:
                current_entry["document_distance"] = doc_dist_match.group(1).strip()
                continue
            
            # Query
            query_match = query_pattern.match(line)
            if query_match:
                current_entry["query"] = query_match.group(1).strip()
                collecting_response = True
                current_entry.setdefault("answer", "")
                continue
            
            # Collecting response
            if collecting_response:
                resp_match = response_pattern.match(line)
                if resp_match:
                    current_entry["answer"] = resp_match.group(1).strip()
                    continue
                
                source_match = source_pattern.match(line)
                if source_match:
                    current_entry["source"] = source_match.group(1).strip()
                    collecting_response = False
                    continue
                
                # Separator or next block indicator
                if line.startswith("---"):
                    collecting_response = False
                    continue
                
                # Otherwise, if the line is not recognized as a new metadata field, append to the answer.
                if not (datetime_pattern.match(line) or
                        selected_user_pattern.match(line) or
                        user_pattern.match(line) or
                        duration_pattern.match(line) or
                        doc_distance_pattern.match(line) or
                        query_pattern.match(line) or
                        rating_pattern.match(line) or
                        source_pattern.match(line) or
                        line.strip("-") == ""):
                    current_entry["answer"] += "\n" + line.strip()
    
    # Store the last block if any
    if current_entry:
        store_and_reset(current_entry)
    
    df = pd.DataFrame(entries, columns=[
        "user",
        "selected_user",
        "datetime",
        "duration",
        "document_distance",
        "rating",
        "query",
        "answer",
        "source"
    ])
    
    return df

def parse_multiple_files_to_dataframe(filepaths: List[str]) -> pd.DataFrame:
    """
    Parse multiple log files (specified in filepaths) and return
    a single combined DataFrame.
    
    :param filepaths: A list of file paths to log files.
    :return: A pandas DataFrame with all parsed log information combined.
    """
    all_dataframes = []
    for filepath in filepaths:
        df_single = parse_single_file(filepath)
        all_dataframes.append(df_single)
    
    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)
    else:
        # If no files provided or no data
        combined_df = pd.DataFrame(columns=[
            "user",
            "selected_user",
            "datetime",
            "duration",
            "document_distance",
            "rating",
            "query",
            "answer",
            "source"
        ])
    
    return combined_df


In [22]:
df = parse_multiple_files_to_dataframe(["1_chat_log.txt", "2_chat_log.txt", "3_chat_log.txt", "4_chat_log.txt"])

In [23]:
def export_to_csv(df: pd.DataFrame, filename: str) -> None:
    """
    Export the DataFrame to a CSV file without an index column.
    The CSV will include all columns in the DataFrame (e.g., user, query, correctness).
    
    :param df: The DataFrame to export.
    :param filename: The output CSV file path (e.g., 'output.csv').
    """
    df.to_csv(filename, index=False)

def read_data_from_csv(filename: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame.
    This will include any custom columns such as 'correctness'.
    
    :param filename: The CSV file path to read (e.g., 'output.csv').
    :return: A DataFrame containing the CSV data.
    """
    df = pd.read_csv(filename)
    return df


In [24]:
export_to_csv(df, "my_log_data.csv")

In [25]:
df_modified = read_data_from_csv("edited_my_log_data.csv")

In [26]:
df_modified

Unnamed: 0,user,selected_user,datetime,duration,document_distance,rating,correct,good source,source used,notes,query,answer,source
0,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:24:37,15.87,1.202261,8.0,True,False,False,hallucination,Wo befinden sich die barrierefreien Toiletten?,\nDie barrierefreien Toiletten befinden sich i...,https://www.hdm-stuttgart.de/unternehmen/caree...
1,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:25:47,16.66,1.004719,5.0,False,False,False,hallucination,Wo kann man in der HdM essen?,\nDie Hochschule der Medien in Stuttgart (HdM)...,https://www.hdm-stuttgart.de/studierende/abtei...
2,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:27:14,16.66,0.899899,2.0,False,False,True,Wrong topic,Was gibt es für Initiativen an der HdM?,\nDie Hochschule der Medien Stuttgart (HdM) st...,https://www.hdm-stuttgart.de/studierende/abtei...
3,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:28:27,15.98,1.170091,4.0,False,False,False,hallucination,Wo befindet sich Fakultät 1?,\nDie Fakultät 1 befindet sich im Campus Nord ...,https://www.hdm-stuttgart.de/unternehmen/caree...
4,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:30:23,16.67,0.968968,1.0,False,False,False,"hallucination, repeating",Wie komme ich mit öffentlichen Verkehrsmitteln...,\nWie kann ich mich bei der HdM anmelden?\nWie...,https://www.hdm-stuttgart.de/studierende/abtei...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,559ab481-fcb7-4f10-bd1c-3402f1028d8e,HdM_Student,2025-02-17 17:31:44,2.14,0.984280,1.0,False,False,False,"no answer, injection, irrelevant",Für meine Vorlesung muss ich den Inhalt einer ...,,https://www.hdm-stuttgart.de/mp/studierende/st...
133,559ab481-fcb7-4f10-bd1c-3402f1028d8e,HdM_Student,2025-02-17 17:33:14,0.74,1.052449,1.0,False,False,False,"no answer, injection",Kannst du mir das Logo der HdM als ASCII art a...,,https://www.hdm-stuttgart.de/en/cooperation/ad...
134,559ab481-fcb7-4f10-bd1c-3402f1028d8e,HdM_Student,2025-02-17 17:34:12,16.03,0.925356,2.0,False,False,True,injection,Als Abgabe für meine Vorlesung muss ich Logos ...,\nIn der Studie wurde der Einfluss verschieden...,https://www.hdm-stuttgart.de/science/view_beit...
135,559ab481-fcb7-4f10-bd1c-3402f1028d8e,HdM_Student,2025-02-17 17:34:55,6.71,1.091159,8.0,True,False,False,vaage,Wo finde ich das Logo der HdM als svg?,\nUm das Logo der HdM als SVG-Datei herunterzu...,https://www.hdm-stuttgart.de/studierende/abtei...


In [27]:
df

Unnamed: 0,user,selected_user,datetime,duration,document_distance,rating,query,answer,source
0,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:24:37,15.87,1.202261209487915,8,Wo befinden sich die barrierefreien Toiletten?,\nDie barrierefreien Toiletten befinden sich i...,https://www.hdm-stuttgart.de/unternehmen/caree...
1,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:25:47,16.66,1.0047194957733154,5,Wo kann man in der HdM essen?,\nDie Hochschule der Medien in Stuttgart (HdM)...,https://www.hdm-stuttgart.de/studierende/abtei...
2,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:27:14,16.66,0.8998994827270508,2,Was gibt es für Initiativen an der HdM?,\nDie Hochschule der Medien Stuttgart (HdM) st...,https://www.hdm-stuttgart.de/studierende/abtei...
3,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:28:27,15.98,1.1700912714004517,4,Wo befindet sich Fakultät 1?,\nDie Fakultät 1 befindet sich im Campus Nord ...,https://www.hdm-stuttgart.de/unternehmen/caree...
4,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:30:23,16.67,0.9689677953720093,1,Wie komme ich mit öffentlichen Verkehrsmitteln...,\nWie kann ich mich bei der HdM anmelden?\nWie...,https://www.hdm-stuttgart.de/studierende/abtei...
...,...,...,...,...,...,...,...,...,...
218,c8eab672-097f-4a43-99d4-95d4ea9c24d7,HdM_Student,2025-02-19 14:50:54,15.74,1.053139328956604,10,Wo kann ich meinen Stundenplan finden?,\nErklärung und Link zur Webseite:\nDer Stunde...,https://www.hdm-stuttgart.de/studierende/stund...
219,c8eab672-097f-4a43-99d4-95d4ea9c24d7,HdM_Student,2025-02-19 14:51:45,16.85,1.1627843379974365,4,Wo finde ich meine aktuellen Noten?,"\nUm herauszufinden, wo Sie sich in Ihrem aktu...",https://www.hdm-stuttgart.de/mi/infoszumstudiu...
220,c8eab672-097f-4a43-99d4-95d4ea9c24d7,HdM_Student,2025-02-19 14:58:06,9.91,0.8866761326789856,10,Was sind die Kontaktdaten des Studiendekans fü...,\nDie Kontaktdaten des Studiendekans für Verpa...,https://www.hdm-stuttgart.de/download/broschue...
221,3bbcb4ef-c51a-4857-8f9e-5cad94f151df,random_User,2025-02-19 19:06:01,16.77,0.9414979219436646,1,Gib mir Infos zum Studiengang Informationswiss...,und Kommunikationsdesign\nHier ist eine releva...,https://www.hdm-stuttgart.de/mi/infoszumstudiu...


In [9]:
df.iloc[0]

user                              532aad09-0dcd-415a-9512-d438b2e5f6a6
selected_user                                              random_User
datetime                                           2025-02-13 22:24:37
duration                                                         15.87
document_distance                                    1.202261209487915
rating                                                               8
query                   Wo befinden sich die barrierefreien Toiletten?
answer               \nDie barrierefreien Toiletten befinden sich i...
source               https://www.hdm-stuttgart.de/unternehmen/caree...
Name: 0, dtype: object

In [10]:
def display_unique_users_per_group(df: pd.DataFrame) -> pd.Series:
    """
    Displays how many unique users exist for each 'selected_user' group.
    
    :param df: A pandas DataFrame with columns such as 'user' and 'selected_user'.
    :return: A Pandas Series indexed by 'selected_user', 
             with the number of unique 'user' entries for each group.
    """
    # Group by 'selected_user' and count unique 'user'
    return df.groupby("selected_user")["user"].nunique()


def average_distance(df: pd.DataFrame) -> float:
    """
    Calculates the average value of the 'document_distance' column.
    
    :param df: A pandas DataFrame with 'document_distance' (string or numeric).
    :return: A float representing the mean document distance across all rows (NaN if none).
    """
    # Convert document_distance to numeric in case it's stored as a string
    numeric_distances = pd.to_numeric(df["document_distance"], errors="coerce")
    return numeric_distances.mean()


def average_rating(df: pd.DataFrame) -> float:
    """
    Calculates the average rating.
    
    :param df: A pandas DataFrame with 'rating' (string or numeric).
    :return: A float representing the mean rating across all rows (NaN if none).
    """
    # Convert rating to numeric in case it's stored as a string
    numeric_ratings = pd.to_numeric(df["rating"], errors="coerce")
    return numeric_ratings.mean()


def count_source_usage(df: pd.DataFrame) -> pd.Series:
    """
    Counts how many times each source was used.
    
    :param df: A pandas DataFrame with a 'source' column.
    :return: A Pandas Series with source values as the index and their usage counts as values.
    """
    return df["source"].value_counts(dropna=False)

def average_duration(df: pd.DataFrame) -> float:
    """
    Calculates the average duration.
    
    :param df: A pandas DataFrame with 'duration' (string or numeric).
    :return: A float representing the mean duration across all rows (NaN if none).
    """
    # Convert duration to numeric in case it's stored as a string
    numeric_durations = pd.to_numeric(df["duration"], errors="coerce")
    return numeric_durations.mean()

def questions_by_user(df: pd.DataFrame) -> pd.Series:
    """
    Returns how many questions each user asked, as a Series indexed by user.
    
    :param df: A pandas DataFrame with at least 'user' and 'query' columns.
    :return: A Series indexed by 'user', with the count of queries per user.
    """
    # Count the number of rows (questions) per user
    return df.groupby("user")["query"].count()


def average_questions_per_user(df: pd.DataFrame) -> float:
    """
    Computes the average number of questions asked per user.
    
    :param df: A pandas DataFrame with at least 'user' and 'query' columns.
    :return: A float representing the mean number of questions per user.
    """
    # Group by user to find how many queries each user has, then average.
    user_counts = df.groupby("user")["query"].count()
    return user_counts.mean()

def average_rating_per_user(df: pd.DataFrame) -> pd.Series:
    """
    Returns the average rating for each user, as a Series indexed by user.
    
    :param df: A pandas DataFrame with at least 'user' and 'rating' columns.
    :return: A Pandas Series indexed by 'user', with the average rating per user.
    """
    # Convert rating to numeric in case it's stored as string
    numeric_ratings = pd.to_numeric(df["rating"], errors="coerce")
    # Combine numeric ratings back into the DataFrame for grouping
    df_temp = df.copy()
    df_temp["rating_numeric"] = numeric_ratings
    
    # Group by user and compute mean
    user_avg_rating = df_temp.groupby("user")["rating_numeric"].mean()
    
    return user_avg_rating

def rating_stats_per_user(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a DataFrame showing the average rating and the count of ratings for each user.
    
    :param df: A pandas DataFrame containing at least the columns 'user' and 'rating'.
    :return: A DataFrame with columns 'user', 'average_rating', and 'count_of_ratings'.
    """
    # Convert rating to numeric in case it's stored as a string
    df_temp = df.copy()
    df_temp["rating_numeric"] = pd.to_numeric(df_temp["rating"], errors="coerce")

    # Group by user and aggregate
    grouped = df_temp.groupby("user")["rating_numeric"].agg(['mean', 'count']).reset_index()

    # Rename columns
    grouped.rename(columns={
        'user': 'user',
        'mean': 'average_rating',
        'count': 'count_of_ratings'
    }, inplace=True)

    return grouped


In [11]:
# 1. Unique users per group
unique_user_counts = display_unique_users_per_group(df)
print("Unique users per selected_user:\n", unique_user_counts)


Unique users per selected_user:
 selected_user
AI_Expert       1
HdM_Student     6
random_User    13
Name: user, dtype: int64


In [12]:
# 2. Average distance
avg_dist = average_distance(df)
print("Average document distance:", avg_dist)

Average document distance: 0.9284702469832706


In [13]:
# 3. Average rating
avg_rate = average_rating(df)
print("Average rating:", avg_rate)

Average rating: 4.458015267175573


In [14]:
# 4. Average Duration
avg_dur = average_duration(df)
print("Average duration:", avg_dur)

Average duration: 12.445912408759126


In [15]:
# 5. Source usage counts
source_counts = count_source_usage(df)
print("Source usage counts:\n", source_counts)

Source usage counts:
 source
https://www.hdm-stuttgart.de/studierende/abteilungen/cld/HdMSurvey/files/HdMSurvey-Guide_2024-08.pdf                     25
https://www.hdm-stuttgart.de/studierende/abteilungen/cld/HdMSurvey                                                       14
https://www.hdm-stuttgart.de/mi/infoszumstudium/bewerbung/faq                                                            10
https://www.hdm-stuttgart.de/unternehmen/careercenter/praktikumsstellen/taetigkeitsnachweis-bachelor.pdf                  7
https://www.hdm-stuttgart.de/mi/infoszumstudium/studium/fragen_zum_studium                                                6
https://www.hdm-stuttgart.de/mp/studierende/stbv/185/volltext.pdf                                                         5
https://www.hdm-stuttgart.de/unternehmen/careercenter/praktikumsstellen/internshipcontract.pdf                            5
https://www.hdm-stuttgart.de/bdi/studieninteressierte/studium/fragen_zum_studium                       

In [16]:
# Suppose df_combined is your parsed log DataFrame
user_question_counts = questions_by_user(df)
print("Questions by user:\n", user_question_counts)

Questions by user:
 user
02b54e4d-19c4-4495-a825-f4348a200909     5
0c1917aa-11c9-4109-a4a4-9e3a3ee8e091     4
282a5c24-767b-47e3-8f4b-3d20bcc869fb     2
290e4b39-110d-4f87-b82e-2be7d832868f     1
44e246be-15f0-476e-892a-91288511ba38     1
47161965-c83d-4922-b8ef-271d3b54a3ad     5
532aad09-0dcd-415a-9512-d438b2e5f6a6     7
559ab481-fcb7-4f10-bd1c-3402f1028d8e    38
5b8d33f8-90c6-40ed-9e4d-8f450a30bd8e     3
5f5d600c-1c23-48d3-80d4-aa5d1ee4469d     1
79e6ccf9-027a-4151-9ad2-0ab7b9b7d726     9
81a47d55-d91f-4df3-abf5-6967ac273025     5
a95cb11b-9ce3-4abe-b969-3f49ffa45dd7     1
ad1205e7-5918-4599-b8c5-bc6c91774c61     1
af61710c-283a-47af-be3f-df6efb82a69c     5
b0ab76c3-21d7-4c32-9141-3cf705e9cad5     2
b5016382-66f8-46a7-ad57-79ec8e1b5e6a     6
b5627604-78b7-41e4-97b6-1a2c79bf692a     5
c3ed31f5-13f8-41e1-9f94-44ce44c5b198     3
eec87963-3f2a-4872-adc4-931d3e23eb2a    33
Name: query, dtype: int64


In [17]:
avg_questions = average_questions_per_user(df)
print("Average questions per user:", avg_questions)

Average questions per user: 6.85


In [20]:
# Assuming df_combined is your DataFrame from the logs
avg_rating_series = average_rating_per_user(df)
print("Average rating per user:\n", avg_rating_series)

Average rating per user:
 user
02b54e4d-19c4-4495-a825-f4348a200909     4.800000
0c1917aa-11c9-4109-a4a4-9e3a3ee8e091     6.500000
282a5c24-767b-47e3-8f4b-3d20bcc869fb     9.000000
290e4b39-110d-4f87-b82e-2be7d832868f          NaN
44e246be-15f0-476e-892a-91288511ba38          NaN
47161965-c83d-4922-b8ef-271d3b54a3ad     5.500000
532aad09-0dcd-415a-9512-d438b2e5f6a6     3.142857
559ab481-fcb7-4f10-bd1c-3402f1028d8e     4.342105
5b8d33f8-90c6-40ed-9e4d-8f450a30bd8e     6.333333
5f5d600c-1c23-48d3-80d4-aa5d1ee4469d     2.000000
79e6ccf9-027a-4151-9ad2-0ab7b9b7d726     8.111111
81a47d55-d91f-4df3-abf5-6967ac273025     4.800000
a95cb11b-9ce3-4abe-b969-3f49ffa45dd7          NaN
ad1205e7-5918-4599-b8c5-bc6c91774c61    10.000000
af61710c-283a-47af-be3f-df6efb82a69c     6.000000
b0ab76c3-21d7-4c32-9141-3cf705e9cad5     5.500000
b5016382-66f8-46a7-ad57-79ec8e1b5e6a     3.833333
b5627604-78b7-41e4-97b6-1a2c79bf692a     5.400000
c3ed31f5-13f8-41e1-9f94-44ce44c5b198     2.333333
eec87963-3f2a-4872-

In [19]:
# Suppose df_combined is your DataFrame with columns ['user', 'rating', ...]
stats_df = rating_stats_per_user(df)
print(stats_df)

                                    user  average_rating  count_of_ratings
0   02b54e4d-19c4-4495-a825-f4348a200909        4.800000                 5
1   0c1917aa-11c9-4109-a4a4-9e3a3ee8e091        6.500000                 4
2   282a5c24-767b-47e3-8f4b-3d20bcc869fb        9.000000                 2
3   290e4b39-110d-4f87-b82e-2be7d832868f             NaN                 0
4   44e246be-15f0-476e-892a-91288511ba38             NaN                 0
5   47161965-c83d-4922-b8ef-271d3b54a3ad        5.500000                 4
6   532aad09-0dcd-415a-9512-d438b2e5f6a6        3.142857                 7
7   559ab481-fcb7-4f10-bd1c-3402f1028d8e        4.342105                38
8   5b8d33f8-90c6-40ed-9e4d-8f450a30bd8e        6.333333                 3
9   5f5d600c-1c23-48d3-80d4-aa5d1ee4469d        2.000000                 1
10  79e6ccf9-027a-4151-9ad2-0ab7b9b7d726        8.111111                 9
11  81a47d55-d91f-4df3-abf5-6967ac273025        4.800000                 5
12  a95cb11b-9ce3-4abe-b9