In [1]:
%load_ext autoreload
%autoreload 2

import re
import pandas as pd
from typing import Tuple, Optional
from pathlib import Path
from datetime import timedelta, datetime
from whatsapp_parser import extract_messages, cleanup, WhatsAppGroupAnalysis

In [3]:
file_path = Path("_chat 4.txt") # Path to your exported WhatsApp chat
assert file_path.exists()

In [13]:
previous_df = pd.read_csv("../20231215_Messages.csv", sep="|")
previous_df["Datetime"] = pd.to_datetime(previous_df["Datetime"])

0        False
1        False
2        False
3        False
4        False
         ...  
35701    False
35702    False
35703    False
35704    False
35705    False
Name: Datetime, Length: 35706, dtype: bool

In [46]:
def parse_chat_line(line):
    match = re.match(r"\[(.*?)\] (.*?): (.*)", line)
    if match:
        date_time_str, sender, message = match.groups()
        try:
            date_time = datetime.strptime(date_time_str, '%Y-%m-%d, %H:%M:%S')
        except ValueError:
            date_time = datetime.strptime(date_time_str, '%m/%d/%y, %H:%M:%S')
        return date_time, sender, message
    return None

def parse_chat(file_path: str) -> pd.DataFrame:
    """
    Parses a WhatsApp chat log into a DataFrame.

    Parameters:
        file_path (str): Path to the chat log file.
        lines_to_read (int): Number of lines to read from the file.

    Returns:
        pd.DataFrame: DataFrame containing the parsed chat with columns 'Sender', 'Datetime', 'Message'.
    """
    parsed_data = []
    with open(file_path, 'r') as file:
        for _, line in enumerate(file):
            parsed_line = parse_chat_line(line)
            if parsed_line:
                parsed_data.append(parsed_line)

    # Creating a DataFrame
    df = pd.DataFrame(parsed_data, columns=['Datetime', 'Sender', 'Message'])
    # df['Datetime'] = pd.to_datetime(df['Datetime'], format="%Y-%m-%d, %H:%M:%S")
    return df

df = parse_chat(file_path=file_path)
df = cleanup(df)
# Remove duplicate rows
len(df)

7054

In [52]:
df = pd.concat([df, previous_df], ignore_index=True)
df.Datetime.min(), df.Datetime.max()
# Find messages from 2023-12-16 00:00:00 onwards
# df[df["Sender"] >= datetime(2023, 12, 16)][60:80]
# Find messages from Sender which has "Surbhi" in it
df[df["Sender"].str.contains("Prakhar")]

Unnamed: 0,Datetime,Sender,Message
14143,2023-03-27 16:54:38,~ Prakhar,‎~ Prakhar joined using this group's invite link
49849,2023-03-27 16:54:38,~ Prakhar,‎~ Prakhar joined using this group's invite link


In [48]:
# Find latest date from df
latest_date = df['Datetime'].max().strftime("%Y%m%d")
file_name = f"../{latest_date}_Messages.csv"
df.Datetime.min(), df.Datetime.max()
# Save the dataframe as a csv file with {latest_date}_Messages.csv as the filename.
df.to_csv(file_name, index=False, encoding='utf-8', header=True, sep="|")
# df = pd.read_csv(file_name, sep="|")
# df.Datetime.min(), df.Datetime.max()

In [49]:
# Create an instance of the class using the uploaded CSV file
analysis = WhatsAppGroupAnalysis(df)

# Test the methods
current_users_df = analysis.get_current_users()
message_count_in_window_df = analysis.get_message_count_in_window(10)
# message_count_in_window_df

In [53]:
inactive_users_to_remove = analysis.get_inactive_users(exclude_contacts=False)
inactive_users_to_remove

Unnamed: 0,User,Message_Count_In_Window,Joining_Date,Total_Messages_Sent,Most_Recent_Message_Date
0,~ Harmandeep Singh Matharu,0.0,2023-10-25 07:42:35,3,2023-10-25 07:42:35
1,~ Raunak Kalani,0.0,2023-10-26 15:30:31,6,2023-10-26 15:30:31
2,~ Vivek Nayak,0.0,2023-10-26 16:34:43,15,2023-10-30 22:06:06
3,~ Mudit Tyagi,0.0,2023-10-27 14:23:04,12,2023-10-30 10:18:34
4,~ $@!,0.0,2023-10-28 13:19:52,6,2023-10-28 13:19:52
...,...,...,...,...,...
413,Karthik S Delivery,0.0,2023-10-17 19:17:00,1,2023-10-17 19:17:00
414,~ Yash Vijaykar,0.0,2023-10-17 19:17:00,1,2023-10-17 19:17:00
415,Siddharth Bhatia 2011P NUS,0.0,2023-10-19 10:58:45,2,2023-10-19 10:58:45
416,~ prahaladbelavadi,0.0,2023-10-28 13:19:54,2,2023-10-28 13:19:54


In [54]:
# Sort by total messages sent and then by joining date
inactive_users = inactive_users_to_remove.sort_values(
    by=['Total_Messages_Sent', 'Joining_Date'], ascending=[True, True])
# Display the DataFrame
inactive_users[:100]

Unnamed: 0,User,Message_Count_In_Window,Joining_Date,Total_Messages_Sent,Most_Recent_Message_Date
43,Anirudh Singla Pepper,0.0,2023-03-02 08:41:31,1,2023-03-02 08:41:31
48,Abhilash,0.0,2023-03-02 12:58:49,1,2023-03-02 12:58:49
49,Arjun Gandhi NexusVP,0.0,2023-03-03 13:40:26,1,2023-03-03 13:40:26
71,Rahul Seth,0.0,2023-03-20 23:25:34,1,2023-03-20 23:25:34
85,~ Prakhar,0.0,2023-03-27 16:54:38,1,2023-03-27 16:54:38
...,...,...,...,...,...
190,~ Rakesh,0.0,2023-04-28 00:37:58,2,2023-05-22 16:22:50
196,Prado Garv's Friend,0.0,2023-05-04 20:10:07,2,2023-09-04 14:41:33
213,Aditya Kothari Covid19,0.0,2023-05-15 16:01:31,2,2023-05-15 16:01:49
217,~ Akshay Jain,0.0,2023-05-20 16:43:35,2,2023-05-21 12:44:58


In [41]:
inactive_users.to_csv('100_inactive_users.csv', index=False)