In [1]:
%load_ext autoreload
%autoreload 2

import enum
import os
import re
from datetime import datetime
from getpass import getpass
from pathlib import Path
from typing import Union

import instructor
import pandas as pd
from openai import OpenAI
from pandarallel import pandarallel
from pydantic import BaseModel
from whatsapp_parser import cleanup

os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [2]:
def parse_chat_line(line):
    patterns = [
        r"\[(.*?)\] (.*?): (.*)",  # Default pattern
        r"\[(.*?)\] ~\u202f(.*?): (.*)",  # Pattern with ~ and non-breaking space
        r"(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}) - (.*)",  # Pattern for system messages
    ]

    for pattern in patterns:
        match = re.match(pattern, line)
        if match:
            if len(match.groups()) == 3:
                date_time_str, sender, message = match.groups()
            elif len(match.groups()) == 2:
                date_time_str, message = match.groups()
                sender = "System"
            try:
                date_time = datetime.strptime(date_time_str, "%Y-%m-%d, %H:%M:%S")
            except ValueError:
                try:
                    date_time = datetime.strptime(
                        date_time_str, "%d/%m/%y, %I:%M:%S\u202f%p"
                    )
                except ValueError:
                    try:
                        date_time = datetime.strptime(date_time_str, "%d/%m/%Y, %H:%M")
                    except ValueError:
                        continue
            return date_time, sender.strip(), message.strip()
    return None


def parse_chat(file_path: str) -> pd.DataFrame:
    """
    Parses a WhatsApp chat log into a DataFrame.

    Parameters:
        file_path (str): Path to the chat log file.

    Returns:
        pd.DataFrame: DataFrame containing the parsed chat with columns 'Sender', 'Datetime', 'Message'.
    """
    parsed_data = []
    with open(file_path, "r") as file:
        for _, line in enumerate(file):
            parsed_line = parse_chat_line(line)
            if parsed_line:
                parsed_data.append(parsed_line)

    # Creating a DataFrame
    df = pd.DataFrame(parsed_data, columns=["Datetime", "Sender", "Message"])
    return df


def chat_to_df(
    file_path: Path,
    previous_df_path: Union[Path, None] = None,
    group_name: Union[str, None] = None,
) -> pd.DataFrame:
    file_path = Path(file_path)  # Path to your exported WhatsApp chat
    assert file_path.exists()

    df = parse_chat(file_path=file_path)
    df = cleanup(df)

    if previous_df_path:
        previous_df = pd.read_csv(previous_df_path, sep="|")
        previous_df["Datetime"] = pd.to_datetime(previous_df["Datetime"])
        df = pd.concat([df, previous_df], ignore_index=True)
        df = cleanup(df)

    if group_name:
        print(f"Adding group name {group_name} to the chat")
        df["Group"] = group_name
    return df

In [3]:
source_directory = Path(".")

chat_files = list(source_directory.glob("*.txt"))
group_list = []
for chat_file in chat_files:
    group_name = chat_file.stem
    chat_df = chat_to_df(chat_file, group_name=group_name)
    assert len(chat_df) > 0, f"Chat file {chat_file} is empty"
    # chat_df.to_csv(f"{group_name}.csv", sep="|", index=False)
    group_list.append(chat_df)

Adding group name community to the chat
Adding group name deepmedia to the chat
Adding group name ai_policy to the chat
Adding group name main to the chat
Adding group name gpu to the chat
Adding group name creatives to the chat


In [4]:
group_list[0].head()

Unnamed: 0,Datetime,Sender,Message,Group
0,2023-03-08 11:12:05,Generative AI,‎Messages and calls are end-to-end encrypted. ...,community
1,2023-03-08 11:12:05,Generative AI,‎Welcome to your community!,community
2,2023-03-08 11:12:07,Generative AI,"‎You added Abhilash, Abhishek Jatram Samsung, ...",community
3,2023-03-08 11:13:32,Zainab Bawa,‎Zainab Bawa is now a community admin,community
4,2023-03-08 11:13:48,Soumyadeep Mukherjee,‎Soumyadeep Mukherjee is now a community admin,community


In [5]:
# # group_list = [main, startup, ai_policy, deepmedia]
# group_list = [main, startup, deepmedia]

In [21]:
# For each group, using the message string, find the users who've been removed or left
# and add them to a list of users who've left the group
left_users = []
for group in group_list:
    left_users.extend(group.loc[group["Message"].str.contains("left|removed")]["Sender"].unique())

In [24]:
# Create user list
users = []
for group in group_list:
    users.extend(group["Sender"].unique())
users = list(set(users))

# For each user, find their oldest message Datetime, most recent message Datetime, and total number of messages
user_stats = []
for user in users:
    user_df = pd.concat(
        [group[group["Sender"] == user] for group in group_list], ignore_index=True
    )
    # Filter out users who've left the group
    user_df = user_df[~user_df["Sender"].isin(left_users)]
    # Find the most recent message and which group it was in
    user_df = user_df.sort_values("Datetime", ascending=False)
    try:
        recent_group = list(user_df["Group"])[0]
    except IndexError:
        recent_group = None
    user_stats.append(
        {
            "User": user,
            "Joining_Date": user_df["Datetime"].min(),
            "Most_Recent_Message": user_df["Datetime"].max(),
            "Total_Messages_Sent": len(user_df),
            "Recent_Group": recent_group,
        }
    )

user_stats_df = pd.DataFrame(user_stats)

In [25]:
sorted_df = user_stats_df.sort_values(
    by=["Total_Messages_Sent", "Joining_Date", "User"], ascending=[True, True, False]
)

In [26]:
# Give me the users who are inactive and can be removed
inactive_users = user_stats_df[
    (user_stats_df["Total_Messages_Sent"] < 3)
    & (user_stats_df["Most_Recent_Message"] < pd.to_datetime("2024-02-01"))
]

In [27]:
inactive_users

Unnamed: 0,User,Joining_Date,Most_Recent_Message,Total_Messages_Sent,Recent_Group
12,‪+91 88009 40288‬,2024-01-14 09:29:29,2024-01-14 09:29:29,1,gpu
16,Karthik CRED,2023-04-22 16:37:17,2023-04-22 16:37:17,1,deepmedia
17,‪+91 83749 99651‬,2023-04-26 14:31:23,2023-04-26 14:31:23,1,community
19,~ PR,2023-08-15 10:28:57,2023-08-15 10:28:57,1,community
21,~ Kartikeya Bhardwaj,2023-11-26 22:44:10,2023-11-26 22:44:10,1,gpu
...,...,...,...,...,...
2622,‪+91 82105 79249‬,2023-04-26 14:16:02,2023-04-26 14:16:02,1,community
2629,~ Akshat,2023-03-19 14:05:18,2023-07-04 10:41:03,2,ai_policy
2645,Revant Bhate,2023-04-20 13:06:14,2023-04-20 13:06:14,1,community
2653,~ Tapish Rathore,2023-05-25 14:44:41,2023-05-25 14:44:41,1,community


In [28]:
inactive_users = inactive_users.sort_values(
    by=["Total_Messages_Sent", "Joining_Date", "User", "Recent_Group"],
    ascending=[True, True, True, True],
)

In [29]:
class Labels(str, enum.Enum):
    """Enumeration for single-label text classification."""

    MALE = "male"
    FEMALE = "female"
    UNDECIDED = "undecided"


class SinglePrediction(BaseModel):
    """
    Class for a single class label prediction.
    """

    class_label: Labels
    class_probability: float

In [30]:
# Apply the patch to the OpenAI client
# enables response_model keyword
client = instructor.from_openai(OpenAI())


def classify(data: str) -> SinglePrediction:
    """For the following name, guess whether it's male, female or undecided."""
    return client.chat.completions.create(
        model="gpt-3.5-turbo-0613",
        response_model=SinglePrediction,
        messages=[
            {
                "role": "user",
                "content": f"Assign gender to name: {data}",
            },
        ],
    )  # type: ignore

In [31]:
# Initialization
pandarallel.initialize()

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [35]:
%%time
genders = inactive_users.parallel_apply(
    lambda x: str(classify(x["User"]).class_label), axis=1
)

CPU times: user 17.4 ms, sys: 38 ms, total: 55.4 ms
Wall time: 39.7 s


In [36]:
inactive_users["genders"] = genders

In [43]:
inactive_users.describe()

Unnamed: 0,Joining_Date,Most_Recent_Message,Total_Messages_Sent
count,512,512,512.0
mean,2023-07-04 18:38:09.630859264,2023-07-13 07:34:54.603515648,1.240234
min,2023-03-10 18:53:02,2023-03-10 18:53:02,1.0
25%,2023-04-25 11:37:16.249999872,2023-04-26 14:20:23,1.0
50%,2023-06-04 05:55:25.500000,2023-06-13 05:26:05,1.0
75%,2023-08-21 12:20:29,2023-09-21 11:24:46.750000128,1.0
max,2024-01-30 18:49:06,2024-01-31 17:32:16,2.0
std,,,0.427644


In [45]:
# Remove rows with female users
inactive_users = inactive_users[inactive_users['genders'] != "Labels.FEMALE"]

In [46]:
len(inactive_users)

421

In [48]:
# Drop the genders column
inactive_users = inactive_users.drop(columns=["genders"])

In [49]:
inactive_users.to_csv("inactive_users.csv", index=False)