In [3]:
import re
import unicodedata

# Regex for WhatsApp message lines
LINE_REGEX = re.compile(
    r'^(\d{1,2}/\d{1,2}/\d{2,4}),\s(\d{1,2}:\d{2}\s(?:AM|PM))\s-\s([^:]+):\s(.+)$'
)

# System messages to ignore (keywords)
SYSTEM_KEYWORDS = ["end-to-end encrypted","changed the group","added","removed","left","joined","created group"]

MEDIA_MARKERS = ["<Media omitted>","image omitted","video omitted","audio omitted","document omitted"]


def remove_emojis(text):
    return ''.join(
        c for c in text
        if unicodedata.category(c) != 'So'
    )

def is_system_message(text):
    text_lower = text.lower()
    return any(keyword in text_lower for keyword in SYSTEM_KEYWORDS)

def is_media_message(text):
    return any(marker.lower() in text.lower() for marker in MEDIA_MARKERS)

def parse_whatsapp_txt(file_path):
    messages = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            match = LINE_REGEX.match(line)

            if not match:
                continue

            date, time, sender, message = match.groups()
            message = remove_emojis(message).strip()

            if not message:
                continue
            if is_system_message(message):
                continue
            if is_media_message(message):
                continue

            messages.append({
                "timestamp": f"{date} {time}",
                "sender": sender.strip(),
                "message_text": message
            })

    return messages

# Example usage
if __name__ == "__main__":
    data = parse_whatsapp_txt("/content/Goodness.txt")
    for msg in data[:5]:
        print(msg)

{'timestamp': '10/7/23 4:06\u202fPM', 'sender': 'Olanle', 'message_text': '️️️'}
{'timestamp': '10/7/23 4:20\u202fPM', 'sender': 'Mejor amiga', 'message_text': 'Pele'}
{'timestamp': '10/7/23 4:20\u202fPM', 'sender': 'Mejor amiga', 'message_text': "There's no last message shq"}
{'timestamp': '10/7/23 4:34\u202fPM', 'sender': 'Olanle', 'message_text': 'Alright thanks'}
{'timestamp': '10/7/23 4:34\u202fPM', 'sender': 'Olanle', 'message_text': 'Na me do my sef o'}
