In [1]:
import pandas as pd
import re
from typing import List

def parse_single_file(filepath: str) -> pd.DataFrame:
    """
    Parse a single log file and return a DataFrame with:
    user, selected_user, datetime, duration, document_distance, rating, query, answer, source

    A block begins at a line starting with a ``YYYY-MM-DD HH:MM:SS`` timestamp and
    is kept only if it also contains a ``User:`` line. Captured values are stored
    as the stripped strings found in the log; fields a block never mentions are
    None (``answer`` is an empty string when a ``Query:`` line was seen but no
    response text followed).

    Rating lines (``User <id> rated the response: <n>/10``) are attached to the
    block currently being assembled when it belongs to that user, already has a
    query and is still unrated; otherwise to the most recently stored unrated
    block of that user. A rating that matches no block is ignored.

    :param filepath: Path to a single log file (read as UTF-8).
    :return: A pandas DataFrame with parsed log information from that file.
    """

    # Regex patterns, one per recognised line type
    datetime_pattern = re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    selected_user_pattern = re.compile(r'^Selected User:\s*(.*)')
    user_pattern = re.compile(r'^User:\s*([0-9a-fA-F-]+)')
    duration_pattern = re.compile(r'^Duration:\s*([\d.]+)\s*seconds')
    doc_distance_pattern = re.compile(r'^Document distance:\s*([\d.]+)')
    query_pattern = re.compile(r'^Query:\s*(.*)')
    response_pattern = re.compile(r'^Response:\s*(.*)')
    rating_pattern = re.compile(r'^User\s+([0-9a-fA-F-]+)\s+rated the response:\s+(\d+)/10')
    source_pattern = re.compile(r'^Source:\s*(.*)')

    entries = []
    current_entry = {}
    collecting_response = False

    def is_unrated(entry_dict):
        """Return True if the entry has no rating yet (missing, None or empty)."""
        return entry_dict.get("rating") is None or entry_dict.get("rating") == ""

    def store_and_reset(entry_dict):
        """Store entry_dict in entries if it is valid, then return a fresh dict."""
        # A block without a timestamp or a user id is considered incomplete and dropped.
        if "datetime" in entry_dict and "user" in entry_dict:
            entry_dict.setdefault("selected_user", None)
            entry_dict.setdefault("duration", None)
            entry_dict.setdefault("document_distance", None)
            entry_dict.setdefault("rating", None)
            entry_dict.setdefault("query", None)
            entry_dict.setdefault("answer", None)
            entry_dict.setdefault("source", None)
            entries.append(entry_dict)
        return {}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip('\n')

            # Check for rating line
            rate_match = rating_pattern.match(line)
            if rate_match:
                rated_user, rated_value = rate_match.groups()
                # The newest block is only appended to `entries` when the next
                # timestamp line arrives, so look at the in-progress block first;
                # otherwise its rating would land on an older block or be lost.
                if ("datetime" in current_entry
                        and "query" in current_entry
                        and current_entry.get("user") == rated_user
                        and is_unrated(current_entry)):
                    current_entry["rating"] = rated_value
                else:
                    # Find the last stored entry for that user that has no rating
                    for e in reversed(entries):
                        if e["user"] == rated_user and is_unrated(e):
                            e["rating"] = rated_value
                            break
                continue

            # Check if the line starts a new block (datetime)
            if datetime_pattern.match(line):
                # If we're currently holding a block, store it first
                if current_entry:
                    current_entry = store_and_reset(current_entry)

                current_entry["datetime"] = line.strip()
                collecting_response = False
                continue

            # Selected user
            select_user_match = selected_user_pattern.match(line)
            if select_user_match:
                current_entry["selected_user"] = select_user_match.group(1).strip()
                continue

            # User
            user_match = user_pattern.match(line)
            if user_match:
                current_entry["user"] = user_match.group(1).strip()
                continue

            # Duration
            duration_match = duration_pattern.match(line)
            if duration_match:
                current_entry["duration"] = duration_match.group(1).strip()
                continue

            # Document distance
            doc_dist_match = doc_distance_pattern.match(line)
            if doc_dist_match:
                current_entry["document_distance"] = doc_dist_match.group(1).strip()
                continue

            # Query: from here on, unrecognised lines belong to the response text
            query_match = query_pattern.match(line)
            if query_match:
                current_entry["query"] = query_match.group(1).strip()
                collecting_response = True
                current_entry.setdefault("answer", "")
                continue

            # Collecting response
            if collecting_response:
                resp_match = response_pattern.match(line)
                if resp_match:
                    current_entry["answer"] = resp_match.group(1).strip()
                    continue

                source_match = source_pattern.match(line)
                if source_match:
                    current_entry["source"] = source_match.group(1).strip()
                    collecting_response = False
                    continue

                # Separator or next block
                if line.startswith("---"):
                    collecting_response = False
                    continue

                # Every metadata pattern has already been handled (with `continue`)
                # above, so the only lines left to skip are empty or dash-only ones.
                if line.strip("-") != "":
                    current_entry["answer"] += "\n" + line.strip()

    # Store the last block
    if current_entry:
        store_and_reset(current_entry)

    df = pd.DataFrame(entries, columns=[
        "user",
        "selected_user",
        "datetime",
        "duration",
        "document_distance",
        "rating",
        "query",
        "answer",
        "source"
    ])

    return df

def parse_multiple_files_to_dataframe(filepaths: List[str]) -> pd.DataFrame:
    """
    Run parse_single_file on every path in filepaths and stack the results
    into one DataFrame with a fresh 0..n-1 index.

    :param filepaths: A list of file paths to log files.
    :return: A pandas DataFrame holding the rows of all files; when filepaths
             is empty, an empty DataFrame that still has the expected columns.
    """
    frames = [parse_single_file(path) for path in filepaths]

    # Nothing to concatenate: hand back an empty frame with the usual columns.
    if not frames:
        expected_columns = [
            "user",
            "selected_user",
            "datetime",
            "duration",
            "document_distance",
            "rating",
            "query",
            "answer",
            "source",
        ]
        return pd.DataFrame(columns=expected_columns)

    return pd.concat(frames, ignore_index=True)


In [2]:
# Parse both chat logs and combine their rows into a single DataFrame.
df = parse_multiple_files_to_dataframe(["1_chat_log.txt", "2_chat_log.txt"])

In [3]:
# Display the combined DataFrame.
df

Unnamed: 0,user,selected_user,datetime,duration,document_distance,rating,query,answer,source
0,6847a1bc-d468-4500-9c46-ab13924fbf89,HdM_Student,2025-02-13 19:40:39,15.05,1.1441731452941895,,Initiale Nachricht,\n[Bei dieser Nachricht handelt es sich nicht ...,https://www.hdm-stuttgart.de/science/science/s...
1,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:24:37,15.87,1.202261209487915,5,Wo befinden sich die barrierefreien Toiletten?,\nDie barrierefreien Toiletten befinden sich i...,https://www.hdm-stuttgart.de/unternehmen/caree...
2,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:25:47,16.66,1.0047194957733154,2,Wo kann man in der HdM essen?,\nDie Hochschule der Medien in Stuttgart (HdM)...,https://www.hdm-stuttgart.de/studierende/abtei...
3,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:27:14,16.66,0.8998994827270508,4,Was gibt es für Initiativen an der HdM?,\nDie Hochschule der Medien Stuttgart (HdM) st...,https://www.hdm-stuttgart.de/studierende/abtei...
4,532aad09-0dcd-415a-9512-d438b2e5f6a6,random_User,2025-02-13 22:28:27,15.98,1.1700912714004517,1,Wo befindet sich Fakultät 1?,\nDie Fakultät 1 befindet sich im Campus Nord ...,https://www.hdm-stuttgart.de/unternehmen/caree...
...,...,...,...,...,...,...,...,...,...
65,02b54e4d-19c4-4495-a825-f4348a200909,random_User,2025-02-14 22:30:44,16.62,0.7675274014472961,2,Ich möchte gern Mathe studieren. Was soll ich ...,\nDie Hochschule der Medien (HdM) bietet ein S...,https://www.hdm-stuttgart.de/mi/infoszumstudiu...
66,02b54e4d-19c4-4495-a825-f4348a200909,random_User,2025-02-14 22:33:04,16.54,0.8527412414550781,5,Hat die Uni ein Mathestudium?,"Nein, Mathestudium braucht man nicht. Aber es ...",https://www.hdm-stuttgart.de/mi/infoszumstudiu...
67,02b54e4d-19c4-4495-a825-f4348a200909,random_User,2025-02-14 22:34:09,5.05,0.8939355611801147,1,Welche Studiengänge gibt es?,\nDie Hochschule der Medien Stuttgart bietet B...,https://www.hdm-stuttgart.de/bdi/studienintere...
68,02b54e4d-19c4-4495-a825-f4348a200909,random_User,2025-02-14 22:35:08,16.53,0.7486181259155273,10,Kannst du mir eine Liste an Studiengängen geben?,"\nJa, hier ist eine Liste der verschiedenen Ma...",https://www.hdm-stuttgart.de/studieninteressie...


In [21]:
# Inspect the first parsed row as a Series.
df.iloc[0]

user                              6847a1bc-d468-4500-9c46-ab13924fbf89
selected_user                                              HdM_Student
datetime                                           2025-02-13 19:40:39
duration                                                         15.05
document_distance                                   1.1441731452941895
rating                                                            None
query                                               Initiale Nachricht
answer               \n[Bei dieser Nachricht handelt es sich nicht ...
source               https://www.hdm-stuttgart.de/science/science/s...
Name: 0, dtype: object