In [5]:
%load_ext autoreload
%autoreload 2

import re
from typing import Tuple, Optional
from pathlib import Path
from datetime import timedelta, datetime

import pandas as pd
# Location of the "_chat 2.txt" export, relative to the notebook's working directory.
# NOTE(review): the later parse_chat(...) cell spells this same literal out again
# instead of reading file_path — confirm whether this variable is still needed.
file_path = Path("_chat 2.txt")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
from whatsapp_parser import extract_messages, cleanup, WhatsAppGroupAnalysis

# Header of a standard message line: "[YYYY-MM-DD, HH:MM:SS] sender: message".
# Compiled once so every line of the export reuses the same pattern object.
_CHAT_LINE_PATTERN = re.compile(
    r"\[(\d{4}-\d{2}-\d{2}, \d{2}:\d{2}:\d{2})\] ([^:]+): (.+)"
)


def parse_chat_line(line: str) -> Optional[Tuple[datetime, str, str]]:
    """
    Parses a single line of the chat log.

    A line is accepted only when it starts with a bracketed timestamp followed by
    "sender: message". The sender is everything up to the first colon after the
    timestamp; the message is the rest of the line (without the trailing newline).

    Parameters:
        line (str): A line from the chat log.

    Returns:
        Optional[Tuple[datetime, str, str]]: A tuple containing the date-time, sender, and message,
                                             or None if the line doesn't represent a standard message
                                             (no header match, or a timestamp that is not a real
                                             calendar date/time).
    """
    match = _CHAT_LINE_PATTERN.match(line)
    if match is None:
        return None

    date_time_str, sender, message = match.groups()
    try:
        date_time = datetime.strptime(date_time_str, '%Y-%m-%d, %H:%M:%S')
    except ValueError:
        # The pattern only checks digit counts, so values such as month 13 or
        # hour 25 reach strptime; treat them as "not a standard message"
        # instead of aborting the whole parse.
        return None
    return date_time, sender, message

def parse_chat(file_path: str) -> pd.DataFrame:
    """
    Parses a WhatsApp chat log into a DataFrame.

    Every line of the file is handed to ``parse_chat_line``; lines for which it
    returns None (anything that does not start with the bracketed timestamp
    header) are skipped, so they do not produce rows.

    Parameters:
        file_path (str): Path to the chat log file. Anything accepted by the
                         built-in ``open`` works, including ``pathlib.Path``.

    Returns:
        pd.DataFrame: DataFrame containing the parsed chat with columns
                      'Datetime', 'Sender', 'Message' (in that order).
    """
    # Decode explicitly instead of relying on the platform's default encoding.
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stop the first line from matching the header pattern.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        parsed_data = [parsed for parsed in map(parse_chat_line, file) if parsed]

    return pd.DataFrame(parsed_data, columns=['Datetime', 'Sender', 'Message'])

# Load "_chat.txt" with the project parser, then print the row count on either
# side of cleanup() so the effect of the cleanup step is visible in the output.
raw_messages = extract_messages(file_path=Path("_chat.txt"))
print(f"Before cleanup: {len(raw_messages)}")

df = cleanup(raw_messages)
print(f"After cleanup: {len(df)}")

df

Before cleanup: 26244
After cleanup: 25687


Unnamed: 0,Sender,Datetime,Message
0,The GenerativeAI Group,2023-03-01 14:09:55,‎Messages and calls are end-to-end encrypted. ...
1,The GenerativeAI Group,2023-03-01 14:09:55,‎You created group “GenerativeAI/DeepMedia”
2,The GenerativeAI Group,2023-03-01 14:09:55,‎‎‎Disappearing messages were turned on. ‎New ...
3,Kaushik Bokka,2023-03-01 14:11:29,‎Kaushik Bokka joined using this group's invit...
4,Ravi Theja,2023-03-01 14:11:31,‎Ravi Theja joined using this group's invite link
...,...,...,...
26238,Jayanth Generative AI WhatsApp Group,2023-10-19 10:44:04,Thanks a lot🙌
26239,~ Sandeep,2023-10-19 10:44:20,"afaik, yes. increased accuracy & efficiency to..."
26240,Nirant K,2023-10-19 10:47:18,"You can also finetune on text to SQL ""task"" wi..."
26241,~ Sumit,2023-10-19 10:53:02,‎~ Sumit requested to join


In [24]:
# Parse "_chat 2.txt" with the notebook-local parser, make sure the Datetime
# column is a pandas datetime, and show the earliest/latest timestamp it holds.
chat_2_df = parse_chat(file_path=Path("_chat 2.txt"))
chat_2_df["Datetime"] = pd.to_datetime(chat_2_df["Datetime"])
(chat_2_df["Datetime"].min(), chat_2_df["Datetime"].max())

(Timestamp('2023-10-25 07:42:35'), Timestamp('2023-11-20 23:49:00'))

In [25]:
# Stack the rows of chat_2_df under df with a fresh 0..n-1 index.
# Caution: df is overwritten with the combined frame, so running this cell a
# second time appends chat_2_df again.
combined_frames = [df, chat_2_df]
df = pd.concat(combined_frames, ignore_index=True)
chat_2_df["Datetime"].max()


Timestamp('2023-11-20 23:49:00')

In [26]:
# Convert the 'Datetime' column of the combined frame with pd.to_datetime so the
# .max() / .strftime() call in the export cell operates on pandas timestamps.
df['Datetime'] = pd.to_datetime(df['Datetime'])

In [27]:
# The export is named after the newest message timestamp, formatted as YYYYMMDD,
# and written one directory above the notebook's working directory.
newest_timestamp = df['Datetime'].max()
latest_date = newest_timestamp.strftime("%Y%m%d")
file_name = f"../{latest_date}_Messages.csv"

# Write a pipe-separated UTF-8 CSV (header row, no index column), then read it
# straight back so the cell output shows what was actually stored on disk.
df.to_csv(file_name, sep="|", index=False, header=True, encoding='utf-8')
pd.read_csv(file_name, sep="|")

Unnamed: 0,Sender,Datetime,Message
0,The GenerativeAI Group,2023-03-01 14:09:55,‎Messages and calls are end-to-end encrypted. ...
1,The GenerativeAI Group,2023-03-01 14:09:55,‎You created group “GenerativeAI/DeepMedia”
2,The GenerativeAI Group,2023-03-01 14:09:55,‎‎‎Disappearing messages were turned on. ‎New ...
3,Kaushik Bokka,2023-03-01 14:11:29,‎Kaushik Bokka joined using this group's invit...
4,Ravi Theja,2023-03-01 14:11:31,‎Ravi Theja joined using this group's invite link
...,...,...,...
29570,Digvijay GenAI Group,2023-11-20 22:58:24,Thinking salient part of the text would be whe...
29571,Digvijay GenAI Group,2023-11-20 23:04:25,"tested this with chatgpt generated content , s..."
29572,~ NC,2023-11-20 23:24:41,Anyone else noticed a sharp increase in latenc...
29573,~ NC,2023-11-20 23:25:37,I’ve seen reports of people saying adding in m...


In [28]:
# Build the analysis helper from the combined in-memory DataFrame (df); the CSV
# written earlier in the notebook is not re-read here.
analysis = WhatsAppGroupAnalysis(df)

# Call the helper's query methods and keep their results for inspection.
# NOTE(review): 60 is presumably the window length in days — confirm against
# WhatsAppGroupAnalysis.get_message_count_in_window.
current_users_df = analysis.get_current_users()
message_count_in_window_df = analysis.get_message_count_in_window(60)

In [33]:
# Ask the analysis helper for the users it classifies as inactive.
# NOTE(review): exclude_contacts=False presumably leaves saved contacts in the
# result — confirm against WhatsAppGroupAnalysis.get_inactive_users.
inactive_users_to_remove = analysis.get_inactive_users(exclude_contacts=False)

In [34]:
# Order inactive users from fewest to most messages sent; ties are broken by
# joining date, earliest first.
sort_columns = ['Total_Messages_Sent', 'Joining_Date']
inactive_users = inactive_users_to_remove.sort_values(by=sort_columns, ascending=True)

# Show the sorted table as the cell output
inactive_users


Unnamed: 0,User,Message_Count_In_Window,Joining_Date,Total_Messages_Sent,Most_Recent_Message_Date
2,Soumya Shah Acton Garv's Bae,0.0,2023-03-01 14:18:30,1,2023-03-01 14:18:30
3,Kush Gupta 2014,0.0,2023-03-01 14:30:28,1,2023-03-01 14:30:28
6,Saurabh Kumar 2012,0.0,2023-03-01 14:40:37,1,2023-03-01 14:40:37
13,Vandit Gandotra 2014,0.0,2023-03-01 15:42:25,1,2023-03-01 15:42:25
14,Abhishek Jatram Samsung,0.0,2023-03-01 15:44:14,1,2023-03-01 15:44:14
...,...,...,...,...,...
154,Ashfakh GenerativeAI WA Group,0.0,2023-04-18 19:28:06,44,2023-08-20 13:12:46
209,Samanyou WriteSonic,0.0,2023-05-09 16:58:54,45,2023-08-14 18:26:30
31,Shivendu Kumar,0.0,2023-03-02 00:36:30,71,2023-09-06 18:32:06
15,Soumendra Dhanee,0.0,2023-03-01 15:44:33,102,2023-07-06 21:12:35


In [35]:
# Persist the sorted inactive-user table as 'inactive_users.csv' in the current
# working directory; index=False leaves the DataFrame index out of the file.
inactive_users.to_csv('inactive_users.csv', index=False)