In [1]:
%load_ext autoreload
%autoreload 2

import re
import pandas as pd
from pathlib import Path
from datetime import datetime
from whatsapp_parser import cleanup
from typing import Union

In [2]:
def parse_chat_line(line):
    """
    Split one exported chat line into its timestamp, sender and message.

    A line is recognised when it starts with "[<timestamp>] <sender>: <message>"
    and the timestamp follows one of two layouts:
    "YYYY-MM-DD, HH:MM:SS" (24-hour) or "DD/MM/YY, H:MM:SS<U+202F>AM|PM".

    NOTE(review): the first pattern also matches lines whose sender carries
    the "~" + U+202F prefix, so for those lines the sender is returned with
    the prefix still attached. The second pattern only gets a turn when the
    timestamp captured by the first one cannot be parsed.

    Parameters:
        line (str): A single line of the exported chat.

    Returns:
        tuple | None: (datetime, sender, message) with sender and message
        stripped of surrounding whitespace, or None when no pattern yields
        a parseable timestamp.
    """
    line_patterns = (
        r"\[(.*?)\] (.*?): (.*)",  # Default pattern
        r"\[(.*?)\] ~\u202f(.*?): (.*)",  # Pattern with ~ and non-breaking space
    )
    # Tried in this order for every pattern that matches
    timestamp_formats = ("%Y-%m-%d, %H:%M:%S", "%d/%m/%y, %I:%M:%S\u202f%p")

    for line_pattern in line_patterns:
        found = re.match(line_pattern, line)
        if found is None:
            continue
        raw_timestamp, raw_sender, raw_message = found.groups()
        for timestamp_format in timestamp_formats:
            try:
                parsed_at = datetime.strptime(raw_timestamp, timestamp_format)
            except ValueError:
                # Wrong layout: try the next one, then the next pattern
                continue
            return parsed_at, raw_sender.strip(), raw_message.strip()
    return None


def parse_chat(file_path: Union[str, Path]) -> pd.DataFrame:
    """
    Parses a WhatsApp chat log into a DataFrame.

    Every line of the file is passed to parse_chat_line; lines it cannot
    parse (it returns None for them) are skipped, so only lines that start
    with a recognised "[timestamp] sender: message" header produce a row
    (presumably this drops the continuation lines of multi-line messages).

    Parameters:
        file_path (str | Path): Path to the chat log file.

    Returns:
        pd.DataFrame: One row per parsed line, with columns
        'Datetime', 'Sender', 'Message' (in that order).
    """
    # The line patterns expect non-ASCII characters such as U+202F, so the
    # platform's default encoding must not be relied on.
    # Assumes the export is UTF-8 encoded — TODO confirm.
    with open(file_path, "r", encoding="utf-8") as file:
        parsed_data = [
            parsed_line
            for parsed_line in map(parse_chat_line, file)
            if parsed_line
        ]

    return pd.DataFrame(parsed_data, columns=["Datetime", "Sender", "Message"])


def chat_to_df(
    file_path: Union[str, Path],
    previous_df_path: Union[str, Path, None] = None,
    group_name: Union[str, None] = None,
) -> pd.DataFrame:
    """
    Load an exported WhatsApp chat, optionally merge it with an earlier dump
    and label it with a group name.

    Parameters:
        file_path (str | Path): Path to the exported WhatsApp chat.
        previous_df_path (str | Path | None): Optional pipe-separated ("|")
            CSV of earlier messages. Its 'Datetime' column is converted with
            pd.to_datetime, and its rows are appended after the rows parsed
            from file_path.
        group_name (str | None): When given (and non-empty), stored in a
            'Group' column on every row.

    Returns:
        pd.DataFrame: The parsed chat after `cleanup` (imported from
        whatsapp_parser; its behaviour is not visible here). `cleanup` is
        applied once to the parsed chat and again after merging.

    Raises:
        FileNotFoundError: If file_path does not exist.
    """
    file_path = Path(file_path)  # Path to your exported WhatsApp chat
    if not file_path.exists():
        # Explicit check instead of `assert`, which is skipped under `python -O`
        raise FileNotFoundError(f"Chat export not found: {file_path}")

    df = parse_chat(file_path=file_path)
    df = cleanup(df)

    if previous_df_path:
        previous_df = pd.read_csv(previous_df_path, sep="|")
        previous_df["Datetime"] = pd.to_datetime(previous_df["Datetime"])
        df = pd.concat([df, previous_df], ignore_index=True)
        # Second pass so the merged rows go through the same cleanup
        df = cleanup(df)

    if group_name:
        df["Group"] = group_name
    return df


# Load the three exported chats. Only the "main" export is merged with an
# earlier pipe-separated dump; "community" is loaded without a group label.
community, main, startup = (
    chat_to_df(file_path="community.txt"),
    chat_to_df(
        file_path="_chat 6.txt",
        previous_df_path="../20240520_Messages.csv",
        group_name="main",
    ),
    chat_to_df(file_path="startup_ecosystem.txt", group_name="startup_ecosystem"),
)

In [3]:
# Groups covered by the analysis below. The list holds the DataFrames
# themselves (not copies); `community` is loaded as well but is not included.
group_list = [main, startup]

In [4]:
# Flag, in every group, the messages whose text contains "left" (any letter
# case; missing messages count as not matching). The flag is written to a
# boolean "Leaving" column on each group's DataFrame.
# NOTE(review): this cell was originally annotated with the pattern
# "left|removed", but only "left" is searched for — confirm which is intended.
for group in group_list:
    message_text = group["Message"]
    group["Leaving"] = message_text.str.contains("left", case=False, na=False)

In [8]:
# Stack every group's messages once instead of re-filtering each group per user
all_messages = pd.concat(group_list, ignore_index=True)

# Unique senders in order of first appearance; unlike list(set(...)), this
# order is the same on every run
users = list(pd.unique(all_messages["Sender"]))

# Drop the messages flagged as "Leaving" (this removes messages, not users),
# then compute each sender's earliest message, latest message and message
# count in a single pass. "Joining_Date" is the earliest remaining message.
active_messages = all_messages[~all_messages["Leaving"]]
user_stats_df = (
    active_messages.groupby("Sender", sort=False)
    .agg(
        Joining_Date=("Datetime", "min"),
        Most_Recent_Message=("Datetime", "max"),
        Total_Messages_Sent=("Datetime", "size"),
    )
    # Senders whose every message was flagged still get a row (NaT dates, 0 messages)
    .reindex(users)
    .fillna({"Total_Messages_Sent": 0})
    .astype({"Total_Messages_Sent": "int64"})
    .rename_axis("User")
    .reset_index()
)

In [9]:
# Least active users first; ties on message count are ordered by earliest message date
user_stats_df.sort_values(["Total_Messages_Sent", "Joining_Date"], ascending=True)

Unnamed: 0,User,Joining_Date,Most_Recent_Message,Total_Messages_Sent
33,~ param,NaT,NaT,0
114,~ Amit Bhor,NaT,NaT,0
149,Ajay Rungta Ex-BITS Pilani/Practo Engg Leader,NaT,NaT,0
186,‪+1 (408) 431‑7342‬,NaT,NaT,0
213,‪+91 70934 44488‬,NaT,NaT,0
...,...,...,...,...
813,Bharat Shetty GenAI WhatsApp Group,2023-10-27 12:50:45,2024-05-21 17:31:24,446
685,Anshuman Pandey Ai Startups Group,2023-07-08 07:13:11,2024-05-18 16:39:08,480
158,Dr. Pratik Desai KissanAI,2023-10-25 23:46:53,2024-05-19 16:06:41,671
1047,Aditya Agrawal,2023-05-18 12:25:16,2024-05-17 19:53:25,847


In [16]:
# Give me the users who are inactive and can be removed
inactive_users = user_stats_df[
    (user_stats_df["Total_Messages_Sent"] < 10)
    & (user_stats_df["Most_Recent_Message"] < pd.to_datetime("2024-02-01"))
]

In [17]:
# Least active first; ties on message count are ordered by earliest message date
inactive_users.sort_values(["Total_Messages_Sent", "Joining_Date"], ascending=True)

Unnamed: 0,User,Joining_Date,Most_Recent_Message,Total_Messages_Sent
247,~ Pranshul Chandhok,2023-10-26 14:45:33,2023-10-26 14:45:33,1
1655,~ Ayush Rajgor AR,2023-10-27 17:13:27,2023-10-27 17:13:27,1
21,~ Prajna Prayas,2023-10-29 11:41:18,2023-10-29 11:41:18,1
362,~ Ahinsa,2023-10-30 11:51:30,2023-10-30 11:51:30,1
980,Kaushik S YC W23,2023-11-01 22:11:45,2023-11-01 22:11:45,1
...,...,...,...,...
454,Amogh V,2023-11-23 18:53:20,2023-11-23 19:02:07,8
1371,~ Harsh Goel,2023-10-28 14:54:37,2023-11-19 19:27:33,9
876,~ Shipra,2023-11-07 20:34:31,2023-11-15 11:30:07,9
1508,~ SJ,2023-11-24 22:38:27,2023-12-03 13:00:20,9


In [19]:
# Save the inactive-user table as CSV, without the index column.
# NOTE(review): the User column holds names and phone numbers — treat the
# written file as personal data.
inactive_users.to_csv("inactive_users_master.csv", index=False)