In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Load the pipe-delimited message export and parse its timestamp column
df = pd.read_csv("../20231120_Messages.csv", sep="|")
df["Datetime"] = pd.to_datetime(df["Datetime"])

In [None]:
def top_k_senders(df, freq, k=5):
    """Return the ``k`` senders with the most messages in each period.

    Parameters
    ----------
    df : pandas.DataFrame
        Messages indexed by a DatetimeIndex named ``"Datetime"``, with at
        least the columns ``"Sender"`` and ``"Message"``.
    freq : str
        Resampling frequency passed to ``resample`` (e.g. ``"W"``).
    k : int, default 5
        Maximum number of senders to keep per period.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Sender"``, ``"Datetime"`` (period label) and ``"Message"``
        (non-null message count), ordered by period and then by descending
        count, with at most ``k`` rows per period. Senders with no messages
        in a period are never listed for that period.
    """
    # Count only the column we report; resampling the whole frame would
    # aggregate every other column just to throw the result away.
    counts = (
        df.groupby("Sender")["Message"]
        .resample(freq)
        .count()
        .reset_index()
    )

    # Resampling per sender creates a zero-count row for every period between
    # that sender's first and last message. Those are not "top senders", so
    # drop them before ranking.
    counts = counts[counts["Message"] > 0]

    # Within each period, the busiest senders come first
    ranked = counts.sort_values(["Datetime", "Message"], ascending=[True, False])

    # Keep the first k rows of every period
    return ranked.groupby("Datetime").head(k)


# Index the messages by timestamp so they can be resampled
top_df = df.set_index("Datetime")

# Weekly ranking: the k busiest senders of each week
k = 6
top_senders_weekly = top_k_senders(top_df, "W", k)
print(f"Top {k} senders per week:")
# Show only the last 20 rows of the ranking
display(top_senders_weekly.tail(20))

In [None]:
# Monthly ranking: the k busiest senders of each month, reusing the `top_df`
# and `k` defined in the weekly cell above.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME";
# confirm the target pandas version before changing it.
top_senders_monthly = top_k_senders(top_df, "M", k)
print(f"\nTop {k} senders per month:")
# Bare last expression: the notebook renders the whole monthly table
top_senders_monthly

In [None]:
# NOTE(review): `df.columns` is not the last expression of this cell, so the
# notebook does not display it.
df.columns
# timedelta is used by the churn analysis in the next cell
from datetime import timedelta

In [None]:
# Churn analysis: count new, active and churned senders for every calendar month.
# The messages are reloaded so this cell does not depend on how earlier cells
# left `df`. Expected columns: 'Sender', 'Datetime' and 'Message'.
df = pd.read_csv("../20231120_Messages.csv", sep="|")
df.Datetime = pd.to_datetime(df.Datetime)

# Index by timestamp, in chronological order
df = df.set_index("Datetime").sort_index()

# Per-month non-null counts. The variable name is historical: the frequency is
# monthly, not weekly.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME".
weekly_data = df.resample("M").count()

# Timestamp of each sender's final message in the whole dataset, computed once
# instead of rescanning the frame for every sender in every period.
last_message_at = df.reset_index().groupby("Sender")["Datetime"].max().to_dict()

# Per-period results
period_labels = []
new_senders = []
active_senders = []
churned_senders = []

# Senders seen in the current period, in any earlier period, and churned so far
current_senders = set()
previous_senders = set()
churned = set()

# Time window to consider a sender as churned (21 days)
churn_window = timedelta(days=21)

# Iterate over the resampler's own bins so that each window covers exactly one
# month, including its last day. Empty months yield an empty frame.
for period_end, period_data in df.resample("M"):
    # Rows without a sender cannot be attributed to anyone
    current_senders = set(period_data["Sender"].dropna().unique())

    # New: never seen in an earlier period. Active: sent at least one message
    # in this period (new senders are active too).
    new_senders_count = len(current_senders - previous_senders)
    active_senders_count = len(current_senders)

    # Churned: seen before, silent in this period, and their final message in
    # the dataset is more than `churn_window` before the period label.
    # The set is cumulative, so the stored figure is a running total.
    for sender in previous_senders - current_senders:
        if period_end - last_message_at[sender] > churn_window:
            churned.add(sender)

    # Store the results for this period
    period_labels.append(period_end)
    new_senders.append(new_senders_count)
    active_senders.append(active_senders_count)
    churned_senders.append(len(churned))

    # Everyone seen so far counts as "previous" for the next period
    previous_senders |= current_senders

# Create a DataFrame with the results, indexed by period label
result_df = pd.DataFrame(
    {
        "Date": period_labels,
        "New Senders": new_senders,
        "Active Senders": active_senders,
        "Churned Senders": churned_senders,
    }
).set_index("Date")

In [None]:
# Show the six most recent periods of the summary
display(result_df.tail(6))
# Alternative rendering without the index column:
# display(result_df.style.hide(axis="index"))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of active senders; result_df has one row per monthly resample label
sns.barplot(x=result_df.index, y=result_df['Active Senders'])
# The x axis holds months, consistent with the title below
plt.xlabel('Month')
plt.ylabel('Active Senders')
plt.title('Active Senders per Month')
# Tilt the date labels so they do not overlap
plt.xticks(rotation=45)
plt.show()
