<a href="https://colab.research.google.com/github/Sujoy-004/Chat-Analyzer-Pro/blob/main/01_data_parsin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Cell 1: Setup and Imports

In [12]:
import pandas as pd
import re
from datetime import datetime
import os

# Make sure the output folders used by later cells exist; exist_ok keeps
# this cell safe to re-run.
for folder in ('data/processed', 'data/sample_chats'):
    os.makedirs(folder, exist_ok=True)

print("✅ Directories created")
print("📁 Structure ready for WhatsApp parser")

✅ Directories created
📁 Structure ready for WhatsApp parser


## Cell 2: Fetch and Parse GitHub Data

In [13]:
import requests

# Fetch data from GitHub
url = "https://raw.githubusercontent.com/Sujoy-004/Chat-Analyzer-Pro/refs/heads/main/data/sample_chats/whatsapp_sample.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx: otherwise the body of an error page would be
# handed to the parser as if it were chat text.
response.raise_for_status()
whatsapp_data = response.text

print(f"✅ Fetched {len(whatsapp_data)} characters from GitHub")
print(f"📄 Preview:\n{whatsapp_data[:200]}...")

✅ Fetched 1759 characters from GitHub
📄 Preview:
12/25/23, 9:30 AM - Alice: Merry Christmas! 🎄🎅
12/25/23, 9:32 AM - Bob: Merry Christmas to you too! Hope you're having a great day
12/25/23, 9:35 AM - Alice: Thanks! Opening presents with family right...


In [14]:
def parse_whatsapp_chat(content):
    """Parse exported WhatsApp chat text into a structured DataFrame.

    A line of the form ``<m/d/y>, <h:mm AM|PM> - <sender>: <text>`` starts a
    new message. Any other non-blank line is treated as a continuation and is
    appended, separated by a single space, to the message before it.
    Continuation lines that appear before the first message are dropped.

    Args:
        content: The full chat export as a single string.

    Returns:
        pandas.DataFrame with one row per message and the columns
        ``datetime``, ``sender``, ``message``, ``date``, ``time``, ``hour``
        and ``message_length``. The columns are present even when no
        message was found, so column access on an empty result is safe.

    Raises:
        ValueError: If a line matches the message pattern but its timestamp
            is valid in neither the two-digit-year nor the four-digit-year
            format.
    """
    columns = ['datetime', 'sender', 'message', 'date', 'time', 'hour', 'message_length']

    # WhatsApp message pattern - compiled once instead of once per line
    pattern = re.compile(
        r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2} (?:AM|PM)) - ([^:]+): (.*)'
    )

    messages = []
    current_message = None

    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue

        match = pattern.match(line)
        if match is None:
            # Continuation of the previous message (multi-line text)
            if current_message is not None:
                current_message['message'] += ' ' + line
                current_message['message_length'] = len(current_message['message'])
            continue

        # A new message starts here, so the previous one is complete
        if current_message is not None:
            messages.append(current_message)

        date_str, time_str, sender, message = match.groups()
        datetime_str = f"{date_str} {time_str}"

        # Try the two-digit year first and fall back to the four-digit year.
        # Only ValueError is caught so unrelated errors are not masked.
        try:
            dt = datetime.strptime(datetime_str, '%m/%d/%y %I:%M %p')
        except ValueError:
            dt = datetime.strptime(datetime_str, '%m/%d/%Y %I:%M %p')

        text = message.strip()
        current_message = {
            'datetime': dt,
            'sender': sender.strip(),
            'message': text,
            'date': dt.date(),
            'time': dt.time(),
            'hour': dt.hour,
            'message_length': len(text)
        }

    # Add the last message
    if current_message is not None:
        messages.append(current_message)

    # Passing columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(messages, columns=columns)


In [15]:
# Parse the data
df = parse_whatsapp_chat(whatsapp_data)
# Guard the column lookup: a frame built from zero parsed messages may have
# no 'sender' column at all.
senders = list(df['sender'].unique()) if 'sender' in df.columns else []
print(f"\n✅ Parsed {len(df)} messages")
print(f"👥 Senders: {senders}")
print(f"📊 DataFrame shape: {df.shape}")
display(df.head())

# Save the parsed data to a CSV file
csv_filepath = 'data/processed/example_parsed.csv'
df.to_csv(csv_filepath, index=False)
print(f"\n✅ Parsed data saved to {csv_filepath}")

# Provide a download link, but only when running inside Google Colab.
# The google.colab package is missing in other Jupyter environments, where
# an unconditional import would abort the cell after the CSV was written.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download(csv_filepath)


✅ Parsed 27 messages
👥 Senders: ['Alice', 'Bob']
📊 DataFrame shape: (27, 7)


Unnamed: 0,datetime,sender,message,date,time,hour,message_length
0,2023-12-25 09:30:00,Alice,Merry Christmas! 🎄🎅,2023-12-25,09:30:00,9,19
1,2023-12-25 09:32:00,Bob,Merry Christmas to you too! Hope you're having...,2023-12-25,09:32:00,9,58
2,2023-12-25 09:35:00,Alice,Thanks! Opening presents with family right now...,2023-12-25,09:35:00,9,73
3,2023-12-25 10:15:00,Bob,That sounds wonderful! I'm cooking dinner for ...,2023-12-25,10:15:00,10,60
4,2023-12-25 10:16:00,Alice,<Media omitted>,2023-12-25,10:16:00,10,15



✅ Parsed data saved to data/processed/example_parsed.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Cell 3: Telegram Parser Setup

In [16]:
import json
from datetime import datetime
import pandas as pd
import requests

# Check if we need to fetch Telegram sample data from GitHub
telegram_url = "https://raw.githubusercontent.com/Sujoy-004/Chat-Analyzer-Pro/refs/heads/main/data/sample_chats/telegram_sample.json"

# Try to fetch from GitHub first. On any failure telegram_data is set to
# None; this cell does not build replacement data itself.
try:
    # A timeout keeps the cell from hanging on a stalled connection.
    response = requests.get(telegram_url, timeout=30)
    if response.status_code == 200:
        telegram_data = response.json()
        print("✅ Fetched Telegram data from GitHub")
        print(f"📄 Data type: {type(telegram_data)}")
        print(f"📊 Keys available: {list(telegram_data.keys()) if isinstance(telegram_data, dict) else 'List format'}")
    else:
        print("⚠️ GitHub data not found, will create sample data")
        telegram_data = None
except (requests.RequestException, ValueError) as exc:
    # RequestException covers connection and timeout failures; ValueError
    # covers a response body that is not valid JSON. Anything else is a
    # programming error and should surface instead of being swallowed.
    print("⚠️ Could not fetch from GitHub, will create sample data")
    print(f"   Reason: {exc}")
    telegram_data = None

print("\n📝 Ready to implement Telegram parser...")

✅ Fetched Telegram data from GitHub
📄 Data type: <class 'dict'>
📊 Keys available: ['name', 'type', 'id', 'messages']

📝 Ready to implement Telegram parser...


## Cell 4: Examine Telegram Data Structure

In [17]:
# Basic info
# telegram_data is None when the fetch cell fell back after a failed
# download, and may be a list for list-format exports. Use an empty dict in
# those cases so every field prints 'N/A' instead of raising AttributeError.
chat_info = telegram_data if isinstance(telegram_data, dict) else {}
print(f"Chat Name: {chat_info.get('name', 'N/A')}")
print(f"Chat Type: {chat_info.get('type', 'N/A')}")
print(f"Chat ID: {chat_info.get('id', 'N/A')}")

Chat Name: Telegram Chat with Project Team
Chat Type: personal_chat
Chat ID: 123456789


In [18]:
# Messages analysis
# telegram_data is None when the fetch cell fell back after a failed
# download, and may be a list for list-format exports; treat both as
# "no messages" instead of raising AttributeError on .get.
messages = telegram_data.get('messages', []) if isinstance(telegram_data, dict) else []
print(f"Total Messages: {len(messages)}")

if messages:
    # Examine the structure of the first message: each key, value and type
    print("\n📋 SAMPLE MESSAGE STRUCTURE:")
    sample_msg = messages[0]
    for key, value in sample_msg.items():
        print(f"{key}: {value} ({type(value).__name__})")

    # Count message types over a small sample only
    msg_types = {}
    for msg in messages[:10]:  # Check first 10
        msg_type = msg.get('type', 'unknown')
        msg_types[msg_type] = msg_types.get(msg_type, 0) + 1

    print("\n📊 MESSAGE TYPES (first 10):")
    for msg_type, count in msg_types.items():
        print(f"{msg_type}: {count}")

    # Show one raw date so the parser's expected format is visible
    if 'date' in sample_msg:
        print(f"\n📅 DATE FORMAT EXAMPLE: {sample_msg['date']}")

print("\n✅ Data structure analyzed, ready to build parser")

Total Messages: 5

📋 SAMPLE MESSAGE STRUCTURE:
id: 1 (int)
type: message (str)
date: 2025-09-15T09:45:00 (str)
from: Sujoy (str)
text: Hey team, did you check the new repo update? (str)

📊 MESSAGE TYPES (first 10):
message: 5

📅 DATE FORMAT EXAMPLE: 2025-09-15T09:45:00

✅ Data structure analyzed, ready to build parser


## Cell 5: Implement Telegram Parser


In [19]:
def parse_telegram_chat(source):
    """Parse a Telegram JSON export into a structured DataFrame.

    Args:
        source: Either an ``http://``/``https://`` URL or a local file path.
            Both must yield a JSON object; its ``messages`` list is parsed.

    Returns:
        pandas.DataFrame with one row per entry whose ``type`` is
        ``'message'`` and whose ``date`` is a parseable ISO-format string.
        Columns: ``datetime``, ``sender``, ``message``, ``date``, ``time``,
        ``hour``, ``message_length``, ``message_id`` and ``type``. The
        columns are present even when no entry qualified.

    Raises:
        requests.HTTPError: If a URL source answers with a 4xx/5xx status.
        OSError: If a local file cannot be opened.
        ValueError: If the content is not valid JSON.
    """
    columns = [
        'datetime', 'sender', 'message', 'date', 'time', 'hour',
        'message_length', 'message_id', 'type',
    ]

    # Load data. Only a real URL scheme selects the network branch, so a
    # local file whose name merely starts with "http" is still opened from disk.
    if isinstance(source, str) and source.startswith(('http://', 'https://')):
        response = requests.get(source, timeout=30)
        # Surface HTTP errors directly instead of a confusing JSON decode error
        response.raise_for_status()
        data = response.json()
    else:
        with open(source, 'r', encoding='utf-8') as f:
            data = json.load(f)

    parsed_messages = []

    for msg in data.get('messages', []):
        # Skip non-message types
        if msg.get('type') != 'message':
            continue

        # Parse datetime (ISO format: 2025-09-15T09:45:00); a trailing 'Z'
        # is rewritten as an explicit UTC offset before parsing.
        date_str = msg.get('date', '')
        if not isinstance(date_str, str):
            continue  # Skip messages whose date is missing or not text
        try:
            dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except ValueError:
            continue  # Skip messages with invalid dates

        # Extract message text
        text = msg.get('text', '')
        if isinstance(text, list):
            # Formatted text arrives as a list mixing plain strings and
            # dicts carrying a 'text' field; concatenate the pieces in order.
            text_parts = []
            for part in text:
                if isinstance(part, str):
                    text_parts.append(part)
                elif isinstance(part, dict) and 'text' in part:
                    text_parts.append(part['text'])
            text = ''.join(text_parts)
        elif text is None:
            # An explicit null would otherwise break len() below
            text = ''
        elif not isinstance(text, str):
            text = str(text)

        # Handle media messages
        if not text and ('photo' in msg or 'video' in msg or 'document' in msg):
            text = '<Media omitted>'

        parsed_messages.append({
            'datetime': dt,
            # 'or' also maps a present-but-null sender to 'Unknown'
            'sender': msg.get('from') or 'Unknown',
            'message': text,
            'date': dt.date(),
            'time': dt.time(),
            'hour': dt.hour,
            'message_length': len(text),
            'message_id': msg.get('id'),
            'type': msg.get('type', 'message')
        })

    # Passing columns explicitly keeps the schema stable for empty results
    return pd.DataFrame(parsed_messages, columns=columns)

In [20]:
# Run the Telegram parser against the sample export
print("🔄 PARSING TELEGRAM DATA...")
telegram_df = parse_telegram_chat(telegram_url)

# Summary: row count, distinct senders and frame dimensions
print(
    f"✅ Parsed {len(telegram_df)} Telegram messages",
    f"👥 Senders: {list(telegram_df['sender'].unique())}",
    f"📊 DataFrame shape: {telegram_df.shape}",
    sep="\n",
)

# Last expression of the cell: rendered as a rich table
telegram_df.head()

🔄 PARSING TELEGRAM DATA...
✅ Parsed 5 Telegram messages
👥 Senders: ['Sujoy', 'Ravi', 'Ananya']
📊 DataFrame shape: (5, 9)


Unnamed: 0,datetime,sender,message,date,time,hour,message_length,message_id,type
0,2025-09-15 09:45:00,Sujoy,"Hey team, did you check the new repo update?",2025-09-15,09:45:00,9,44,1,message
1,2025-09-15 09:46:12,Ravi,"Yes, I saw the commit. Looks good.",2025-09-15,09:46:12,9,34,2,message
2,2025-09-15 09:47:35,Ananya,We should add more test cases before merging.,2025-09-15,09:47:35,9,45,3,message
3,2025-09-15 09:48:00,Sujoy,Agree. I'll push test cases tonight.,2025-09-15,09:48:00,9,36,4,message
4,2025-09-15 10:00:00,Ravi,Cool. Let’s finalize by tomorrow.,2025-09-15,10:00:00,10,33,5,message


## Cell 6: Save Data and Create Unified Parser

In [21]:
# Fetch and use the existing telegram_parser.py from GitHub
github_parser_url = "https://raw.githubusercontent.com/Sujoy-004/Chat-Analyzer-Pro/refs/heads/main/src/parser/telegram_parser.py"
response = requests.get(github_parser_url, timeout=30)
# Stop on a 4xx/5xx response: otherwise the body of an error page would be
# passed to exec() below as if it were Python source.
response.raise_for_status()
parser_code = response.text

print("✅ Fetched telegram_parser.py from GitHub")

# Execute the parser code to make functions available
# SECURITY NOTE(review): exec() runs whatever the URL serves at fetch time,
# with the full privileges of this kernel. The URL tracks a branch head, so
# the executed code can change between runs. It presumably also rebinds
# parse_telegram_chat over the notebook's own definition - confirm that is
# intended. Consider pinning the URL to a commit hash, or importing the
# module from a checked-out copy of the repository instead.
exec(parser_code)

# Test the GitHub version
telegram_df_github = parse_telegram_chat(telegram_url)
print(f"✅ GitHub parser works: {len(telegram_df_github)} messages parsed")

# Save parsed data
telegram_df_github.to_csv('data/processed/telegram_parsed.csv', index=False)
print("✅ Saved Telegram data to data/processed/telegram_parsed.csv")

# Final comparison
print("\n📊 PARSER COMPARISON:")
print(f"WhatsApp messages: {len(df)}")
print(f"Telegram messages: {len(telegram_df_github)}")
print(f"Total messages processed: {len(df) + len(telegram_df_github)}")

✅ Fetched telegram_parser.py from GitHub
✅ GitHub parser works: 5 messages parsed
✅ Saved Telegram data to data/processed/telegram_parsed.csv

📊 PARSER COMPARISON:
WhatsApp messages: 27
Telegram messages: 5
Total messages processed: 32
