In [59]:
import os
import pandas as pd
from bs4 import BeautifulSoup

In [60]:
import os
import pandas as pd
from bs4 import BeautifulSoup

def parse_mail_archive_thread(ul):
    """
    Convert one level of a mail-archive thread listing into nested dicts.

    Only the direct <li> children of `ul` are examined:
      - <li class="icons-email"> describes a single message
      - any other <li> may wrap a <ul> holding replies to the message
        that precedes it at this level
    Returns the messages of this level, each shaped as
      { "subject": ..., "author": ..., "link": ..., "children": [...] }
    where "children" is the same structure built from the reply <ul>(s).
    """
    thread = []

    for entry in ul.find_all("li", recursive=False):
        if "icons-email" not in entry.get("class", []):
            # Not a message: possibly a wrapper such as <li><ul>...</ul></li>.
            # Its replies belong to the most recent message at this level;
            # a wrapper that appears before any message is ignored.
            replies_ul = entry.find("ul", recursive=False)
            if replies_ul and thread:
                thread[-1]["children"].extend(parse_mail_archive_thread(replies_ul))
            continue

        # Subject text and link both come from the same anchor.
        anchor = entry.select_one(".subject a")
        if anchor:
            subject = anchor.get_text(strip=True)
            link = anchor["href"]
        else:
            subject = "No Subject"
            link = None

        sender = entry.select_one(".sender.italic")
        if sender:
            author = sender.get_text(strip=True)
        else:
            author = "Unknown"

        # Replies are filled in when the following wrapper <li> is reached.
        thread.append({
            "subject": subject,
            "author": author,
            "link": link,
            "children": [],
        })

    return thread

def parse_local_thread_html(file_path):
    """
    Read a saved thread page from disk and return its reply tree.

    The listing is taken from the first match of '.tSliceList ul' and handed
    to parse_mail_archive_thread(...). When nothing matches, a notice is
    printed and an empty list is returned.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    listing = document.select_one(".tSliceList ul")
    if listing:
        return parse_mail_archive_thread(listing)

    print(f"Could not find '.tSliceList ul' in {file_path}")
    return []

def flatten_thread_with_msgid(thread, parent_msgid=None):
    """
    Turn the nested message tree into a flat list of row dicts.

    Each row carries Subject, Author, Link (used as the message ID) and
    Parent_MsgID (the Link of the message it replies to). Messages at the
    top level of `thread` receive `parent_msgid`. Rows are emitted in
    document order: every message is followed by its replies.
    """
    flat = []

    # Explicit stack instead of recursion. Entries are pushed in reverse so
    # that they pop in their original order.
    pending = [(node, parent_msgid) for node in reversed(list(thread))]
    while pending:
        node, parent = pending.pop()
        flat.append({
            "Subject": node["subject"],
            "Author": node["author"],
            "Link": node["link"],
            "Parent_MsgID": parent,
        })
        # Replies point back at this message through its link.
        own_id = node["link"]
        pending.extend(
            (reply, own_id) for reply in reversed(list(node["children"]))
        )

    return flat

def get_thread_title(file_path):
    """
    Return the subject text of the message page at `file_path`.

    The text comes from the first '.msgHead .subject a' element;
    "Unknown Title" is returned when the page has none.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    heading = document.select_one(".msgHead .subject a")
    if heading:
        return heading.get_text(strip=True)
    return "Unknown Title"

def get_thread_author(file_path):
    """
    Return the sender text of the message page at `file_path`.

    The text comes from the first '.msgHead .sender' element;
    "Unknown Author" is returned when the page has none.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    sender = document.select_one(".msgHead .sender")
    if sender:
        return sender.get_text(strip=True)
    return "Unknown Author"


def get_thread_timestamp(file_path):
    """
    Return the timestamp text of the message page at `file_path`.

    The text comes from the first '.msgHead .date a' element;
    "Unknown Timestamp" is returned when the page has none.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    stamp = document.select_one('.msgHead .date a')
    return stamp.get_text(strip=True) if stamp else "Unknown Timestamp"


def get_thread_body(file_path):
    """
    Return the message text of the page at `file_path` without quoted lines.

    Text is gathered from every <pre> inside .msgBody, in order. A line is
    treated as quoted, and left out, when it starts with '>' once
    surrounding whitespace is ignored. The remaining lines are joined with
    newlines. "No message body found." is returned when there is no such
    <pre> element.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    blocks = document.select('.msgBody pre')
    if not blocks:
        return "No message body found."

    # Kept lines retain their original whitespace; only the quote test
    # looks at the stripped form.
    kept = [
        text_line
        for block in blocks
        for text_line in block.get_text().splitlines()
        if not text_line.strip().startswith('>')
    ]
    return "\n".join(kept)


def _splice_out_placeholders(nodes):
    """
    Return a copy of `nodes` without "No Subject" placeholder entries.

    Replies to a placeholder are lifted into the placeholder's position, so
    that flattening later records the placeholder's own parent as theirs.
    """
    kept = []
    for node in nodes:
        replies = _splice_out_placeholders(node["children"])
        if node["subject"] == "No Subject":
            kept.extend(replies)
        else:
            kept.append({**node, "children": replies})
    return kept


def get_conversational_relationship(file_path):
    """
    Build a flat reply table for the thread shown on a saved message page.

    `file_path` is a file name relative to the "all_messages" folder
    (e.g. "msg00572.html"). The page's .msgHead title and author become the
    root row, and the messages listed under '.tSliceList ul' become replies.

    Returns a DataFrame with columns Subject, Author, Link and Parent_MsgID,
    one row per message, root first, in listing order. Listing entries
    without a subject link ("No Subject") are omitted and their replies are
    attached to the omitted entry's parent. Any non-root row that still has
    no parent is attached to the root.
    """
    file_path = os.path.join("all_messages", file_path)
    print(f"Parsing {file_path}...")

    # 1) Extract the root (thread) title & author
    root_title = get_thread_title(file_path)
    root_author = get_thread_author(file_path)

    # 2) Parse the nested structure from .tSliceList ul and remove the
    #    placeholders while the parent/child structure is still known.
    #    Doing this on the flattened rows cannot work: a placeholder's link
    #    is None, so its replies can no longer be told apart from unrelated
    #    rows, and a placeholder in the last row would never be removed.
    thread_tree = _splice_out_placeholders(parse_local_thread_html(file_path))

    # 3) Create a synthetic "root" node
    root_link = os.path.basename(file_path)  # e.g. "msg00572.html"
    root_node = {
        "subject": root_title,
        "author": root_author,
        "link": root_link,
        "children": thread_tree
    }

    # 4) Flatten the nested structure and convert to a DataFrame
    flattened = flatten_thread_with_msgid([root_node], parent_msgid=None)
    df = pd.DataFrame(flattened)

    # 5) For any row where Parent_MsgID is missing (None/NaN), except the
    #    root in row 0, default it to the root message's link.
    missing_parent = df["Parent_MsgID"].isna()
    missing_parent.iloc[0] = False
    df.loc[missing_parent, "Parent_MsgID"] = root_link

    return df

In [61]:
def create_conversational_details_df(conv_df):
    """
    Expand a reply table into one detailed record per message.

    For every row of `conv_df` the page "all_messages/<Link>" is read for
    its title, sender, body and timestamp. The "to" field is the author of
    the page named by Parent_MsgID, or "" when that value is missing, empty,
    or equal to the row's own Link.

    Returns a DataFrame with columns timestamp, from, to, body, title and
    msg_file, one row per row of `conv_df`.
    """
    records = []

    for _, entry in conv_df.iterrows():
        link = entry["Link"]
        parent = entry["Parent_MsgID"]

        page = os.path.join("all_messages", link)
        title = get_thread_title(page)
        sender = get_thread_author(page)
        body = get_thread_body(page)
        timestamp = get_thread_timestamp(page)

        # A message with no usable parent (or one pointing at itself) has
        # nobody to address.
        no_recipient = pd.isna(parent) or not parent or parent == link
        if no_recipient:
            recipient = ""
        else:
            recipient = get_thread_author(os.path.join("all_messages", parent))

        records.append({
            "timestamp": timestamp,
            "from": sender,
            "to": recipient,
            "body": body,
            "title": title,
            "msg_file": link,
        })

    return pd.DataFrame(records)
        

In [62]:
# Run the pipeline on a single thread and display the result.
# The name is relative to "all_messages"; get_conversational_relationship
# prepends that folder itself.
file_path = "msg00422.html"
# Reply table: one row per message with its parent's link.
conv_df = get_conversational_relationship(file_path)
# Per-message details (timestamp, from, to, body, title, msg_file).
detailed_df = create_conversational_details_df(conv_df)
# Bare expression so the notebook renders the table as the cell output.
detailed_df

Parsing all_messages\msg00422.html...


Unnamed: 0,timestamp,from,to,body,title,msg_file
0,"Wed, 13 Jan 2021 13:38:04 -0800",Brett Okken,,Mark Adler has posted an optimized crc64 imple...,[xz-devel] java crc64 implementation,msg00422.html
1,"Tue, 19 Jan 2021 09:04:14 -0800",Lasse Collin,Brett Okken,On 2021-01-13 Brett Okken wrote:\n\n\nThe CRC6...,Re: [xz-devel] java crc64 implementation,msg00427.html
2,"Thu, 21 Jan 2021 17:57:29 -0800",Brett Okken,Lasse Collin,Here is a slice by 4 implementation. It goes b...,Re: [xz-devel] java crc64 implementation,msg00434.html
3,"Tue, 02 Feb 2021 08:59:51 -0800",Lasse Collin,Brett Okken,Hello!\n\nI need to make a new release in the ...,Re: [xz-devel] java crc64 implementation,msg00449.html
4,"Tue, 02 Feb 2021 11:23:27 -0800",Lasse Collin,Lasse Collin,I assume you accidentally didn't post to the l...,Re: [xz-devel] java crc64 implementation,msg00451.html
5,"Tue, 02 Feb 2021 11:36:21 -0800",Brett Okken,Lasse Collin,I accidentally hit reply instead of reply all....,Re: [xz-devel] java crc64 implementation,msg00452.html
6,"Tue, 02 Feb 2021 15:29:26 -0800",Brett Okken,Brett Okken,"I tested jdk 15 64bit and jdk 11 32bit, client...",Re: [xz-devel] java crc64 implementation,msg00453.html
7,"Fri, 05 Feb 2021 09:07:56 -0800",Lasse Collin,Brett Okken,On 2021-02-02 Brett Okken wrote:\n\n\nTo ensur...,Re: [xz-devel] java crc64 implementation,msg00463.html
8,"Fri, 05 Feb 2021 11:21:32 -0800",Brett Okken,Brett Okken,"On Fri, Feb 5, 2021 at 11:07 AM Lasse Collin <...",Re: [xz-devel] java crc64 implementation,msg00465.html
9,"Fri, 05 Feb 2021 11:54:53 -0800",Lasse Collin,Brett Okken,"On 2021-02-05 Brett Okken wrote:\n\n\nOK, smal...",Re: [xz-devel] java crc64 implementation,msg00468.html


In [63]:
# Numeric IDs of the thread-starting messages to export (as provided).
# Each ID maps to a page named "msg<ID zero-padded to 5 digits>.html".
title_thread_ids = [
    665, 645, 638, 610, 602, 591, 584, 572, 562, 554, 553, 532,
    527, 519, 513, 512, 509, 507, 503, 499, 494, 493, 481, 472,
    464, 458, 443, 437, 429, 422, 401, 400
]

# Folder where the HTML files are stored.
# NOTE: get_conversational_relationship and create_conversational_details_df
# hard-code this same folder name, so changing the value here alone has no
# effect on where pages are read from.
base_folder = "all_messages"

# Folder to save the individual conversation CSV files.
# exist_ok avoids the check-then-create race and is a no-op on re-runs.
output_folder = "individual_ml_convo"
os.makedirs(output_folder, exist_ok=True)

# Export one CSV per thread.
for thread_id in title_thread_ids:
    # Zero-pad to five digits (e.g. 665 -> "msg00665.html") rather than
    # prefixing a fixed "00", which only works for three-digit IDs.
    file_name = f"msg{int(thread_id):05d}.html"

    # Reply table for the thread; the base folder is joined inside the call.
    conv_df = get_conversational_relationship(file_name)

    # Per-message details (timestamp, from, to, body, title, msg_file).
    detailed_df = create_conversational_details_df(conv_df)

    # Save the detailed conversation DataFrame to a CSV file.
    csv_file = os.path.join(output_folder, f"conversation_{thread_id}.csv")
    detailed_df.to_csv(csv_file, index=False)

    print(f"Saved conversation for thread {thread_id} to {csv_file}")


Parsing all_messages\msg00665.html...
Saved conversation for thread 665 to individual_ml_convo\conversation_665.csv
Parsing all_messages\msg00645.html...
Saved conversation for thread 645 to individual_ml_convo\conversation_645.csv
Parsing all_messages\msg00638.html...
Saved conversation for thread 638 to individual_ml_convo\conversation_638.csv
Parsing all_messages\msg00610.html...
Saved conversation for thread 610 to individual_ml_convo\conversation_610.csv
Parsing all_messages\msg00602.html...
Saved conversation for thread 602 to individual_ml_convo\conversation_602.csv
Parsing all_messages\msg00591.html...
Saved conversation for thread 591 to individual_ml_convo\conversation_591.csv
Parsing all_messages\msg00584.html...
Saved conversation for thread 584 to individual_ml_convo\conversation_584.csv
Parsing all_messages\msg00572.html...
Saved conversation for thread 572 to individual_ml_convo\conversation_572.csv
Parsing all_messages\msg00562.html...
Saved conversation for thread 562 