In [2]:
%load_ext autoreload
%autoreload 2

import re
import pandas as pd
from typing import Tuple, Optional
from pathlib import Path
from datetime import timedelta, datetime
from whatsapp_parser import extract_messages, cleanup, WhatsAppGroupAnalysis

In [3]:
# Location of the exported chat text file; the relative path is resolved
# against the notebook's current working directory.
file_path = Path("../_chat 4.txt") # Path to your exported WhatsApp chat
# Fail fast in this cell if the export is missing, rather than later inside the parser.
assert file_path.exists()

In [7]:
# Load an earlier message table. The file is pipe-delimited, hence sep="|".
# NOTE(review): presumably this is a CSV written by a previous run of this
# notebook (same "<date>_Messages.csv" naming) — confirm.
previous_df = pd.read_csv("../20240101_Messages.csv", sep="|")
# read_csv is called without parse_dates, so the timestamps arrive as text;
# convert them so they can be compared with the freshly parsed datetimes.
previous_df["Datetime"] = pd.to_datetime(previous_df["Datetime"])
# Alternative kept for reference: start from an empty history instead.
# previous_df = pd.DataFrame()

In [8]:
def parse_chat_line(line: str) -> Optional[Tuple[datetime, str, str]]:
    """
    Parses a single line of an exported chat into timestamp, sender and message.

    A line is recognised when it has the shape ``[<timestamp>] <sender>: <message>``
    and the timestamp matches one of the two supported formats
    (``YYYY-MM-DD, HH:MM:SS`` or ``MM/DD/YY, HH:MM:SS``).

    Parameters:
        line (str): One raw line from the chat export (a trailing newline is fine).

    Returns:
        Optional[Tuple[datetime, str, str]]: ``(date_time, sender, message)`` for a
        recognised line, otherwise ``None``. ``None`` is also returned when the
        bracketed prefix is not a supported timestamp, so a stray line such as
        ``[note] someone: text`` is skipped instead of aborting the whole parse.
    """
    match = re.match(r"\[(.*?)\] (.*?): (.*)", line)
    if not match:
        return None

    date_time_str, sender, message = match.groups()
    # Try the ISO-like format first, then the US-style short date.
    for timestamp_format in ('%Y-%m-%d, %H:%M:%S', '%m/%d/%y, %H:%M:%S'):
        try:
            date_time = datetime.strptime(date_time_str, timestamp_format)
        except ValueError:
            continue
        return date_time, sender, message

    # The brackets held something that is not a timestamp: treat the line like
    # any other unrecognised line rather than raising.
    return None

def parse_chat(file_path: str) -> pd.DataFrame:
    """
    Parses a WhatsApp chat log into a DataFrame.

    Every line of the file is passed to ``parse_chat_line``; lines it does not
    recognise (it returns ``None`` for them) are skipped, so they do not appear
    in the result.

    Parameters:
        file_path (str): Path to the chat log file (anything ``open`` accepts).

    Returns:
        pd.DataFrame: One row per recognised line, with columns
        'Datetime', 'Sender', 'Message' (in that order). The frame is empty,
        but still has these columns, when no line is recognised.
    """
    # The chat contains non-ASCII sender names and messages, so decode
    # explicitly instead of relying on the platform's locale default.
    # Assumes the export is UTF-8 encoded.
    with open(file_path, 'r', encoding='utf-8') as file:
        parsed_data = [
            parsed_line
            for parsed_line in map(parse_chat_line, file)
            if parsed_line
        ]

    return pd.DataFrame(parsed_data, columns=['Datetime', 'Sender', 'Message'])

# Parse the export into a DataFrame, then run it through the project's
# cleanup helper (imported from whatsapp_parser).
df = parse_chat(file_path=file_path)
df = cleanup(df)
# NOTE(review): nothing in this cell removes duplicate rows, and what cleanup()
# does is not visible from this notebook — confirm whether it de-duplicates.
# Display the number of rows left after cleanup.
len(df)

15548

In [9]:
# Merge the freshly parsed export with the previously saved history.
# The two sources can cover the same period, and re-running this cell would
# otherwise append previous_df a second time; collapsing rows that share the
# same (Datetime, Sender, Message) keeps the cell idempotent and avoids
# double-counting messages downstream.
df = pd.concat([df, previous_df], ignore_index=True).drop_duplicates(
    subset=["Datetime", "Sender", "Message"], ignore_index=True
)
# Spot-check: rows whose sender name contains "Prakhar".
# na=False treats a missing Sender as "no match" instead of yielding a NaN
# entry in the mask, which boolean indexing would reject.
df[df["Sender"].str.contains("Prakhar", na=False)]

Unnamed: 0,Datetime,Sender,Message
29691,2023-03-27 16:54:38,~ Prakhar,‎~ Prakhar joined using this group's invite link


In [10]:
# Name the output file after the most recent message date (YYYYMMDD).
latest_date = df['Datetime'].max().strftime("%Y%m%d")
file_name = f"../{latest_date}_Messages.csv"
# This tuple is not the last expression of the cell, so it is evaluated but
# never displayed.
df.Datetime.min(), df.Datetime.max()
# Write the combined table as a pipe-delimited UTF-8 CSV with a header row and
# without the index column.
df.to_csv(file_name, index=False, encoding='utf-8', header=True, sep="|")
# Optional round-trip check, kept for reference: reload the file and look at
# the date range again.
# df = pd.read_csv(file_name, sep="|")
# df.Datetime.min(), df.Datetime.max()

In [11]:
# Build the analysis object from the in-memory message table (df), not from a file.
analysis = WhatsAppGroupAnalysis(df)

# Call the analysis methods; WhatsAppGroupAnalysis comes from whatsapp_parser,
# so what they compute is not visible in this notebook.
current_users_df = analysis.get_current_users()
# NOTE(review): 10 is presumably the window length (in days?) — confirm against
# get_message_count_in_window.
message_count_in_window_df = analysis.get_message_count_in_window(10)
# message_count_in_window_df

In [12]:
# Ask the analysis object for the users it considers inactive.
# NOTE(review): exclude_contacts=False presumably keeps saved contacts in the
# result — confirm against get_inactive_users.
inactive_users_to_remove = analysis.get_inactive_users(exclude_contacts=False)
# Last expression of the cell, so the table is displayed.
inactive_users_to_remove

Unnamed: 0,User,Message_Count_In_Window,Joining_Date,Total_Messages_Sent,Most_Recent_Message_Date
0,~ Aniket Singh,0.0,2023-10-25 10:33:49,34,2024-01-17 13:08:04
1,~ Varun Khandelwal,0.0,2023-10-26 14:46:51,10,2023-12-24 22:03:58
2,~ Raunak Kalani,0.0,2023-10-26 15:30:31,8,2023-10-26 15:30:31
3,~ viv,0.0,2023-10-26 16:34:43,7,2024-02-18 07:36:33
4,~ Mudit Tyagi,0.0,2023-10-27 14:23:04,18,2024-02-19 05:50:54
...,...,...,...,...,...
542,Karthik S Delivery,0.0,2023-10-17 19:17:00,1,2023-10-17 19:17:00
543,~ Shikhil Kumar Gupta,0.0,2023-10-26 16:34:46,23,2023-11-20 21:22:04
544,~ prahaladbelavadi,0.0,2023-10-28 13:19:54,2,2023-10-28 13:19:54
545,~ Nithin Vasishta,0.0,2023-10-28 13:20:42,6,2023-11-03 14:14:35


In [13]:
# Order the candidates from fewest to most messages sent; users with the same
# count are ordered by joining date, earliest first (ascending is the default).
inactive_users = inactive_users_to_remove.sort_values(
    ['Total_Messages_Sent', 'Joining_Date']
)
# To preview the first rows:
# inactive_users[:100]

Unnamed: 0,User,Message_Count_In_Window,Joining_Date,Total_Messages_Sent,Most_Recent_Message_Date
246,~ Prakhar,0.0,2023-03-27 16:54:38,1,2023-03-27 16:54:38
250,Sanket Nadhani,0.0,2023-03-29 10:03:25,1,2023-03-29 10:03:25
289,~ Madhur,0.0,2023-04-14 14:36:18,1,2023-04-14 14:36:18
292,~ ∆,0.0,2023-04-14 20:29:57,1,2023-04-14 20:29:57
305,~ ι,0.0,2023-04-18 10:14:33,1,2023-04-18 10:14:33
...,...,...,...,...,...
493,~ Enrique Ferrao,0.0,2023-09-01 10:11:52,2,2023-09-01 10:11:52
494,~ Pranav Reddy | Xylem AI,0.0,2023-09-01 11:10:53,2,2023-09-01 11:10:53
496,~ Sameera Poduri,0.0,2023-09-04 07:56:33,2,2023-09-10 19:34:31
507,~ Mihir Kulkarni,0.0,2023-09-08 13:06:52,2,2023-09-08 13:06:52


In [14]:
# Write the sorted candidate list to the notebook's working directory,
# leaving out the index column.
inactive_users.to_csv('inactive_users.csv', index=False)