In [1]:
%load_ext autoreload
%autoreload 2

import re
import pandas as pd
from pathlib import Path
from datetime import datetime
from whatsapp_parser import cleanup
from typing import Union

In [3]:
def parse_chat_line(line):
    patterns = [
        r"\[(.*?)\] (.*?): (.*)",  # Default pattern
        r"\[(.*?)\] ~\u202f(.*?): (.*)",  # Pattern with ~ and non-breaking space
    ]

    for pattern in patterns:
        match = re.match(pattern, line)
        if match:
            date_time_str, sender, message = match.groups()
            try:
                date_time = datetime.strptime(date_time_str, "%Y-%m-%d, %H:%M:%S")
            except ValueError:
                try:
                    date_time = datetime.strptime(
                        date_time_str, "%d/%m/%y, %I:%M:%S\u202f%p"
                    )
                except ValueError:
                    continue
            return date_time, sender.strip(), message.strip()
    return None


def parse_chat(file_path: str) -> pd.DataFrame:
    """
    Parses a WhatsApp chat log into a DataFrame.

    Parameters:
        file_path (str): Path to the chat log file.
        lines_to_read (int): Number of lines to read from the file.

    Returns:
        pd.DataFrame: DataFrame containing the parsed chat with columns 'Sender', 'Datetime', 'Message'.
    """
    parsed_data = []
    with open(file_path, "r") as file:
        for _, line in enumerate(file):
            parsed_line = parse_chat_line(line)
            if parsed_line:
                parsed_data.append(parsed_line)

    # Creating a DataFrame
    df = pd.DataFrame(parsed_data, columns=["Datetime", "Sender", "Message"])
    # df['Datetime'] = pd.to_datetime(df['Datetime'], format="%Y-%m-%d, %H:%M:%S")
    return df


def chat_to_df(
    file_path: Path,
    previous_df_path: Union[Path, None] = None,
    group_name: Union[str, None] = None,
) -> pd.DataFrame:
    file_path = Path(file_path)  # Path to your exported WhatsApp chat
    assert file_path.exists()

    df = parse_chat(file_path=file_path)
    df = cleanup(df)

    if previous_df_path:
        previous_df = pd.read_csv(previous_df_path, sep="|")
        previous_df["Datetime"] = pd.to_datetime(previous_df["Datetime"])
        df = pd.concat([df, previous_df], ignore_index=True)
        df = cleanup(df)

    if group_name:
        df["Group"] = group_name
    return df


ai_policy, deepmedia, main, startup = (
    chat_to_df("ai_policy.txt", group_name="ai_policy"),
    chat_to_df("deepmedia.txt", group_name="deepmedia"),
    chat_to_df("_chat 6.txt", "../20240520_Messages.csv", group_name="main"),
    chat_to_df("startup_ecosystem.txt", group_name="startup_ecosystem"),
)

In [4]:
group_list = [main, startup, ai_policy, deepmedia]

In [5]:
# For each group, use the leaving pattern to find users who've left the group
# leaving_pattern  is when str.contains(
#             "left|removed", case=False, na=False
#         )

for group in group_list:
    group["Leaving"] = group["Message"].str.contains("left", case=False, na=False)

In [19]:
# Create user list
users = []
for group in group_list:
    users.extend(group["Sender"].unique())
users = list(set(users))

# For each user, find their oldest message Datetime, most recent message Datetime, and total number of messages
user_stats = []
for user in users:
    user_df = pd.concat(
        [group[group["Sender"] == user] for group in group_list], ignore_index=True
    )
    # Filter out users who've left the group
    user_df = user_df[~user_df["Leaving"]]
    # Find the most recent message and which group it was in
    user_df = user_df.sort_values("Datetime", ascending=False)
    try: 
        recent_group = list(user_df["Group"])[0]
    except IndexError:
        recent_group = None
    user_stats.append(
        {
            "User": user,
            "Joining_Date": user_df["Datetime"].min(),
            "Most_Recent_Message": user_df["Datetime"].max(),
            "Total_Messages_Sent": len(user_df),
            "Recent_Group": recent_group,
        }
    )

user_stats_df = pd.DataFrame(user_stats)

In [20]:
user_stats_df.sort_values(
    by=["Total_Messages_Sent", "Joining_Date"], ascending=[True, True]
)

Unnamed: 0,User,Joining_Date,Most_Recent_Message,Total_Messages_Sent,Recent_Group
78,~ Kaushik Jaiswal,NaT,NaT,0,
92,Priyanka Chandak Swiggy AI VC,NaT,NaT,0,
169,~ Aditi Sharma,NaT,NaT,0,
769,‪+91 70934 44488‬,NaT,NaT,0,
907,~ Atma Gunupudi,NaT,NaT,0,
...,...,...,...,...,...
916,Anubhav mishra Zupay,2023-07-18 13:20:49,2024-05-21 18:31:10,632,ai_policy
56,Vamshi,2023-04-22 22:55:36,2024-05-20 20:39:31,765,ai_policy
473,Aditya Agrawal,2023-05-18 12:25:16,2024-05-17 19:53:25,853,main
1610,Dr. Pratik Desai KissanAI,2023-04-16 14:21:21,2024-05-21 13:53:18,1148,ai_policy


In [21]:
# Give me the users who are inactive and can be removed
inactive_users = user_stats_df[
    (user_stats_df["Total_Messages_Sent"] < 10)
    & (user_stats_df["Most_Recent_Message"] < pd.to_datetime("2024-02-01"))
]

In [22]:
inactive_users = inactive_users.sort_values(
    by=["Total_Messages_Sent", "Joining_Date"], ascending=[True, True]
)

In [23]:
inactive_users

Unnamed: 0,User,Joining_Date,Most_Recent_Message,Total_Messages_Sent,Recent_Group
1240,~ Abhirup,2023-03-29 10:16:35,2023-03-29 10:16:35,1,ai_policy
1558,~ Apoorv Saxena,2023-03-29 10:33:08,2023-03-29 10:33:08,1,ai_policy
655,Dipin Chopra,2023-03-29 14:02:48,2023-03-29 14:02:48,1,ai_policy
261,~ Priti,2023-03-29 17:33:44,2023-03-29 17:33:44,1,ai_policy
1603,~ Santhosh Guru,2023-03-29 18:36:13,2023-03-29 18:36:13,1,ai_policy
...,...,...,...,...,...
201,Rasagy Sharma,2023-05-09 18:03:23,2024-01-01 15:55:49,9,main
730,~ Krishaay,2023-06-07 17:49:39,2023-11-13 10:49:36,9,deepmedia
611,~ Harsh Goel,2023-10-28 14:54:37,2023-11-19 19:27:33,9,main
567,~ Shiraz,2023-11-05 22:37:09,2023-11-29 09:44:34,9,deepmedia


In [24]:
inactive_users.to_csv("inactive_users_master.csv", index=False)