In [28]:
import pandas as pd
from datetime import datetime
import marko
from marko.block import Heading
from marko.inline import RawText

def extract_text(node) -> str:
    """
    Recursively extracts plain text from a marko node (including inline formatting).
    """
    if isinstance(node, RawText):
        return node.children
    if hasattr(node, 'children'):
        return ''.join(extract_text(child) for child in node.children)
    return ''

def parse_markdown(filepath: str, skip_first_heading: bool = False, column_map: dict = None) -> pd.DataFrame:
    """
    Parses a Markdown travel log file using marko to extract travel durations grouped by country and city.
    Splits visits to the same location into multiple entries if they appear non-consecutively.

    Parameters:
        filepath (str): Path to the .md file.
        skip_first_heading (bool): Whether to skip the first heading (e.g., title or introduction).
        column_map (dict): Optional mapping of output column names, e.g., {'country': 'Country', 'city': 'City', ...}

    Returns:
        pd.DataFrame: A DataFrame with columns [Country, City, Start, End, Duration] by default.
    """
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    ast = marko.parse(content)

    country = city = None
    date_entries = []
    heading_count = 0

    for node in ast.children:
        if not isinstance(node, Heading):
            continue

        heading_count += 1
        if skip_first_heading and heading_count == 1:
            continue

        level = node.level
        text = extract_text(node).strip()

        if level == 1:
            country = text
            city = None  # reset city
        elif level == 2:
            city = text
        elif level == 3:
            parts = text.split(", ")
            if len(parts) == 2:
                try:
                    date = datetime.strptime(parts[1], "%d.%m.%Y").date()
                    effective_city = city if city else country
                    date_entries.append({
                        "country": country,
                        "city": effective_city,
                        "date": date
                    })
                except ValueError:
                    continue  # malformed date

    # Sentry date entry

    # Sort by date to ensure order
    date_entries.sort(key=lambda x: x["date"])

    # Track city/country transitions
    visits = []
    if not date_entries:
        return pd.DataFrame(columns=column_map.values() if column_map else [])

    prev = date_entries[0]
    start_date = end_date = prev["date"]

    for i in range(1, len(date_entries)):
        entry = date_entries[i]
        is_last = i == len(date_entries) - 1

        if entry["country"] == prev["country"] and entry["city"] == prev["city"] and not is_last:
            end_date = entry["date"]  # extend visit
        else:
            visits.append({
                "country": prev["country"],
                "city": prev["city"],
                "start": start_date,
                "end": end_date,
                "duration": (end_date - start_date).days + 1
            })
            prev = entry
            start_date = end_date = entry["date"]

    # Default column mapping
    default_columns = {
        "country": "Country",
        "city": "City",
        "start": "Start",
        "end": "End",
        "duration": "Duration"
    }

    if column_map:
        columns = column_map
    else:
        columns = default_columns

    # Build DataFrame using column map
    df = pd.DataFrame([
        {
            columns["country"]: visit["country"],
            columns["city"]: visit["city"],
            columns["start"]: visit["start"],
            columns["end"]: visit["end"],
            columns["duration"]: visit["duration"]
        }
        for visit in visits
    ])

    return df

def summarize_travel(df: pd.DataFrame, column_map: dict = None, show_total: bool = True) -> pd.DataFrame:
    """
    Summarizes travel statistics per country:
    - Total duration (sum of days)
    - Number of unique cities visited
    - First date of entry
    - Last date of exit
    - (Optional) Total row with overall stats

    Parameters:
        df (pd.DataFrame): The travel DataFrame returned by parse_markdown.
        column_map (dict): Optional mapping of DataFrame column names.
        show_total (bool): Whether to include a final row with totals.

    Returns:
        pd.DataFrame: Summary table with optional total row.
    """
    default_columns = {
        "country": "Country",
        "city": "City",
        "duration": "Duration",
        "start": "Start",
        "end": "End"
    }

    columns = column_map if column_map else default_columns

    summary = df.groupby(columns["country"]).agg(
        Total_Days=(columns["duration"], "sum"),
        Cities_Visited=(columns["city"], pd.Series.nunique),
        Entry_Date=(columns["start"], "min"),
        Exit_Date=(columns["end"], "max")
    ).reset_index()

    # Sort by Entry_Date
    summary = summary.sort_values(by="Entry_Date").reset_index(drop=True)

    if show_total:
        total_row = {
            columns["country"]: "TOTAL",
            "Total_Days": summary["Total_Days"].sum(),
            "Cities_Visited": summary["Cities_Visited"].sum(),
            "Entry_Date": summary["Entry_Date"].min(),
            "Exit_Date": summary["Exit_Date"].max()
        }
        summary = pd.concat([summary, pd.DataFrame([total_row])], ignore_index=True)

    return summary


In [30]:
df = parse_markdown("reise.md", skip_first_heading=True)
df

Unnamed: 0,Country,City,Start,End,Duration
0,Singapur,Singapur,2024-08-08,2024-08-13,6
1,Indonesien,Malang,2024-08-14,2024-08-16,3
2,Indonesien,Yogyakarta,2024-08-19,2024-08-24,6
3,Indonesien,Bandung,2024-08-25,2024-08-27,3
4,Indonesien,Jakarta,2024-08-28,2024-08-28,1
5,Japan,Tokio,2024-08-29,2024-09-08,11
6,Japan,Fuji,2024-09-09,2024-09-13,5
7,Japan,Takayama,2024-09-14,2024-09-18,5
8,Japan,Kanazawa,2024-09-19,2024-09-20,2
9,Japan,Tottori,2024-09-21,2024-09-22,2


In [31]:
df_summary=summarize_travel(df)
df_summary

Unnamed: 0,Country,Total_Days,Cities_Visited,Entry_Date,Exit_Date
0,Singapur,6,1,2024-08-08,2024-08-13
1,Indonesien,13,4,2024-08-14,2024-08-28
2,Japan,77,19,2024-08-29,2024-11-13
3,Korea,24,5,2024-11-14,2024-12-07
4,Taiwan,27,7,2024-12-08,2025-01-03
5,Thailand,38,7,2025-01-06,2025-02-13
6,TOTAL,185,43,2024-08-08,2025-02-13
