In [None]:
from pathlib import Path

# Load the raw chat export. read_text() opens and closes the file itself (the
# previous open(...).read() left the handle unclosed), and the explicit
# encoding keeps emoji / non-ASCII names from depending on the platform default.
text = Path("_chat.txt").read_text(encoding="utf-8")

In [None]:
import re
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Tuple


def extract_messages(input_text: str) -> List[Tuple[str, datetime, str]]:
    """Parse a chat export into ``(sender, timestamp, message)`` tuples.

    A line is recognised when it starts with a header of the form
    ``[M/D/YY, H:MM:SS] Sender: message`` (month first; the year may have two
    or four digits). Lines that do not start with such a header -- for
    example the continuation lines of a multi-line message -- are skipped, as
    are lines announcing that someone joined via the group's invite link.

    Args:
        input_text: Full text of the export, one message header per line.

    Returns:
        ``(sender, datetime, message)`` tuples in the order they appear.

    Raises:
        ValueError: If a matched header carries an impossible date or time
            (e.g. month 13); ``datetime.strptime`` rejects it.
    """
    pattern = re.compile(
        r"\[(?P<date>\d{1,2}\/\d{1,2}\/(?:\d{4}|\d{2})), (?P<time>\d{1,2}:\d{2}:\d{2})\] (?P<sender>[^:]+): (?P<message>.+)"
    )
    join_pattern = re.compile(r"joined using this group\'s invite link")

    messages = []
    for line in input_text.split("\n"):
        match = pattern.match(line)
        if match is None or join_pattern.search(line):
            continue

        # Named groups keep the unpacking independent of group order.
        date_part, time_part, sender, message = match.group(
            "date", "time", "sender", "message"
        )
        # %y only parses two-digit years; switch to %Y when the export
        # writes the year in full, otherwise strptime raises ValueError.
        year_text = date_part.rsplit("/", 1)[1]
        year_directive = "%Y" if len(year_text) == 4 else "%y"
        dt = datetime.strptime(
            f"{date_part} {time_part}", f"%m/%d/{year_directive} %H:%M:%S"
        )
        messages.append((sender, dt, message))

    return messages


# Parse the export once, then tabulate it: one row per parsed message, with
# the tuple fields mapped onto named columns.
messages = extract_messages(text)
df = pd.DataFrame(data=messages, columns=["Sender", "Datetime", "Message"])

print(df)

In [None]:
def top_k_senders(df, freq, k=5):
    """Return the ``k`` most active senders for every ``freq`` period.

    Args:
        df: Message table with a ``Sender`` and a ``Message`` column. The
            timestamps may live either in a ``Datetime`` column or in an
            index named ``Datetime``; the input frame is not modified.
        freq: pandas offset alias for the period length (e.g. ``"W"``).
        k: Maximum number of senders to keep per period.

    Returns:
        A DataFrame with the columns ``Sender``, ``Datetime`` (the period
        label) and ``Message`` (the message count), ordered by period and
        then by descending count, with at most ``k`` rows per period.
    """
    # resample() needs the timestamps on the index. Callers may still hold
    # them in a regular column, so move them over on a copy instead of
    # failing with a TypeError (and without touching the caller's frame).
    indexed = df.set_index("Datetime") if "Datetime" in df.columns else df

    # Count messages per sender within each period.
    counts = indexed.groupby("Sender")["Message"].resample(freq).count()

    # Within each period, put the busiest senders first.
    ranked = counts.reset_index().sort_values(
        ["Datetime", "Message"], ascending=[True, False]
    )

    # Keep the first k rows of every period.
    return ranked.groupby("Datetime").head(k)


# Number of senders to keep per period.
k = 6

# Top K senders per week. The header used to be printed without the table it
# announces, so show the weekly result explicitly.
top_senders_weekly = top_k_senders(df, "W", k)
print(f"Top {k} senders per week:")
display(top_senders_weekly)

# Top K senders per month (rendered as the cell's final expression).
top_senders_monthly = top_k_senders(df, "M", k)
print(f"\nTop {k} senders per month:")
top_senders_monthly

In [None]:
# Inspect the column labels of the message table.
df.columns

In [None]:
# Weekly sender lifecycle: how many senders are new, active, and churned in
# each calendar week.
# Assuming 'df' is the DataFrame with the columns 'Sender', 'Datetime', and 'Message'

# Make sure the 'Datetime' column is set as the index. The guard keeps this
# cell re-runnable: once 'Datetime' is the index it is no longer a column, so
# an unconditional set_index would raise KeyError on a second run.
if "Datetime" in df.columns:
    df.set_index("Datetime", inplace=True)
df.sort_index(inplace=True)

# Resample DataFrame to a weekly frequency
weekly_data = df.resample("W").count()

# Timestamp of each sender's final message in the data, computed once up
# front instead of re-filtering the whole frame per sender per week.
last_message_at = df.index.to_series().groupby(df["Sender"].to_numpy()).max()

# Initialize lists to store the results (one entry per week)
new_senders = []
active_senders = []
churned_senders = []

# Senders of the week being processed, everyone seen in an earlier week, and
# the cumulative set of senders considered churned.
current_senders = set()
previous_senders = set()
churned = set()

# Time window to consider a sender as churned (21 days)
churn_window = timedelta(days=21)

# Iterate through each week
for week in weekly_data.index:
    # "W" bins end on Sunday and are labelled with that Sunday at midnight,
    # yet they contain the whole of that day. Select Monday 00:00 (inclusive)
    # up to the following Monday 00:00 (exclusive) so the senders counted
    # here are the ones whose messages fall into this row of weekly_data.
    # (Cutting at the label itself pushed Sunday's messages into the next
    # week and dropped those sent on the final Sunday altogether.)
    week_start = week - timedelta(days=6)
    week_end = week + timedelta(days=1)
    current_week_data = df[(df.index >= week_start) & (df.index < week_end)]
    current_senders = set(current_week_data["Sender"].unique())

    # New: first appearance ever. Active: posted at least once this week.
    new_senders.append(len(current_senders - previous_senders))
    active_senders.append(len(current_senders))

    # Churned: seen before, silent this week, and their final message in the
    # data lies more than churn_window before this week's label. The set only
    # grows, so the stored figure is a running total.
    for sender in previous_senders - current_senders:
        if week - last_message_at[sender] > churn_window:
            churned.add(sender)
    churned_senders.append(len(churned))

    # Update previous_senders for the next iteration
    previous_senders |= current_senders

# Create a DataFrame with the results
result_df = pd.DataFrame(
    {
        "Date": weekly_data.index,
        "New Senders": new_senders,
        "Active Senders": active_senders,
        "Churned Senders": churned_senders,
    }
)
result_df.set_index("Date", inplace=True)

In [None]:
# Render the weekly new / active / churned sender table.
display(result_df)
# display(result_df.style.hide(axis="index"))