In [None]:
from pathlib import Path
# Load the raw chat export in one call; read_text() opens, reads and closes the
# file itself, so no handle is left dangling as with a bare open().read().
# The encoding is pinned so decoding does not depend on the platform's locale default.
# NOTE(review): assumes the export file is UTF-8 encoded — confirm for your export source.
text = Path("../_chat.txt").read_text(encoding="utf-8")

In [None]:
import re
import pandas as pd
from datetime import datetime 
from typing import List, Tuple

In [None]:
def extract_messages(input_text: str) -> List[Tuple[str, datetime, str]]:
    """Parse chat-export lines into ``(sender, timestamp, message)`` tuples.

    A line is accepted when it starts with a header of the form
    ``[M/D/YY, H:MM:SS] Sender: message`` (the year may also be written with
    four digits). Dates are interpreted in month/day/year order.

    Lines that do not start with such a header are skipped; this includes the
    continuation lines of multi-line messages, so only the first line of a
    multi-line message is kept. Lines containing the
    "joined using this group's invite link" notice are skipped as well.

    Args:
        input_text: Full text of the chat export, one message header per line.

    Returns:
        A list of ``(sender, datetime, message)`` tuples in input order.

    Raises:
        ValueError: If a header matches the pattern but its date/time cannot
            be parsed (for example a day/month order mismatch or a 3-digit year).
    """
    pattern = re.compile(
        r'\[(?P<date>\d{1,2}\/\d{1,2}\/\d{2,4}), (?P<time>\d{1,2}:\d{2}:\d{2})\] '
        r'(?P<sender>[^:]+): (?P<message>.+)'
    )
    join_notice = "joined using this group's invite link"

    messages = []
    for raw_line in input_text.split('\n'):
        # Drop a trailing carriage return so CRLF input does not leak '\r' into the message.
        line = raw_line.rstrip('\r')
        match = pattern.match(line)
        if match is None or join_notice in line:
            continue

        date, time = match['date'], match['time']
        # The header pattern admits both 2- and 4-digit years, but %y only parses
        # two digits; choose the directive that fits the year actually present.
        year_directive = "%Y" if len(date.rsplit('/', 1)[1]) == 4 else "%y"
        dt = datetime.strptime(f"{date} {time}", f"%m/%d/{year_directive} %H:%M:%S")
        messages.append((match['sender'], dt, match['message']))

    return messages

# Parse the raw export into (sender, timestamp, message) records.
messages = extract_messages(text)

# Tabulate the records: one row per parsed message, one column per tuple field.
df = pd.DataFrame(
    data=messages,
    columns=['Sender', 'Datetime', 'Message'],
)

print(df)

In [None]:
# E-mail addresses: local part, '@', dotted domain, alphabetic TLD of 2+ letters.
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# Phone-like runs: optional '+', then 9 or more digits/hyphens that start and end
# with a digit. Numbers written with spaces or parentheses are not matched.
_PHONE_PATTERN = re.compile(r'\+?\d[\d-]{7,}\d')


def remove_pii(text):
    """Replace e-mail addresses and phone-like digit runs with placeholders.

    E-mail addresses become ``[EMAIL REMOVED]`` and phone-like runs become
    ``[PHONE REMOVED]``.

    Args:
        text: The message text to scrub.

    Returns:
        The text with every match replaced by its placeholder.
    """
    # E-mails are scrubbed first: a long digit run inside an address's local part
    # would otherwise be rewritten to '[PHONE REMOVED]', after which the address
    # no longer matches the e-mail pattern and its domain stays in the output.
    no_emails = _EMAIL_PATTERN.sub('[EMAIL REMOVED]', text)
    return _PHONE_PATTERN.sub('[PHONE REMOVED]', no_emails)


def cleanup(df):
    """Return a copy of ``df`` reduced to scrubbed, user-written messages.

    Steps, in order:
      1. Drop the ``Sender`` column when present.
      2. Drop rows containing any missing value.
      3. Drop rows whose ``Message`` contains any of the noise markers below
         (deleted-message placeholders, group-administration notices, polls,
         and lines carrying invisible directional marks).
      4. Pass every remaining message through ``remove_pii``.

    Args:
        df: DataFrame with a ``Message`` column of strings.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Substrings that flag a row as noise. They are matched literally, not as regexes.
    # The two escape sequences are the invisible characters LEFT-TO-RIGHT MARK
    # (U+200E) and LEFT-TO-RIGHT EMBEDDING (U+202A), written as escapes so they are
    # visible in the source.
    # NOTE(review): presumably the export tags system/media lines with these marks — confirm.
    noise_markers = (
        'deleted this message',
        'message was deleted',
        '\u200e\u202a',
        'changed the subject to',
        '\u200e',
        'You added',
        'changed the group description',
        'POLL:',
        "reset this group's invite link",
        "changed this group's icon",
        'changed the subject from',
        "changed this group's settings",
    )

    # Drop the Sender column
    if "Sender" in df.columns:
        df = df.drop(columns=['Sender'])
    # Drop the rows with no message
    df = df.dropna()

    # Accumulate one boolean mask instead of re-filtering the frame per marker.
    is_noise = pd.Series(False, index=df.index)
    for marker in noise_markers:
        is_noise = is_noise | df['Message'].str.contains(marker, regex=False)

    # Explicit copy: assigning a column into a boolean-filtered frame otherwise
    # triggers SettingWithCopyWarning on pandas versions without copy-on-write.
    df = df[~is_noise].copy()
    df["Message"] = df["Message"].apply(remove_pii)
    return df

# Report the row count on either side of cleanup() so the number of removed rows is visible.
print(f"Before cleanup: {len(df)}")
# Rebinds df to the cleaned frame; re-running this cell feeds the already-cleaned frame back in.
df = cleanup(df)
print(f"After cleanup: {len(df)}")

In [None]:
# Stamp the export file name with the current local date (YYYYMMDD).
today = datetime.today().strftime('%Y%m%d')
output_path = f'../{today}_Messages.csv'

# Write the frame without its index column.
df.to_csv(output_path, index=False)