In [1]:
# Install necessary libraries
!pip install nltk

import re
from collections import Counter
from datetime import datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon used by SentimentIntensityAnalyzer (see
# sentiment_analysis below); this runs at import/cell-execution time.
nltk.download('vader_lexicon')

# Function to parse WhatsApp chat
def parse_chat(file_path):
    """Parse an exported WhatsApp chat text file into message records.

    A message starts with a header line of the form
    ``M/D/YYYY, H:MM AM - Sender: text``. Lines that do not start with a
    timestamp are continuation lines of the preceding message (this is how
    multi-line messages appear in an export) and are appended to that
    message's text, joined with a newline. Timestamped lines that carry no
    ``Sender: `` part (system notices) are skipped, along with any
    continuation lines that follow them.

    Args:
        file_path: Path to the UTF-8 encoded chat export.

    Returns:
        A list of dicts with the keys ``'timestamp'``, ``'sender'`` and
        ``'message'`` (all strings), in file order.
    """
    # Compiled once per call instead of re-parsing the pattern for every line.
    header_re = re.compile(r'(\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{2} [AP]M) - (.*)')
    sender_re = re.compile(r'([^:]+): (.*)')

    messages = []
    current = None  # message that continuation lines should be attached to
    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream the file line by line rather than loading it all at once.
        for raw_line in file:
            line = raw_line.rstrip('\n')
            header = header_re.match(line)
            if header is None:
                # No timestamp: this line continues the previous message.
                if current is not None:
                    current['message'] += '\n' + line
                continue

            body = sender_re.match(header.group(2))
            if body is None:
                # Timestamped line without "Sender: " -> system notice.
                # Reset so its continuation lines are not glued to a user
                # message.
                current = None
                continue

            current = {
                'timestamp': header.group(1),
                'sender': body.group(1),
                'message': body.group(2),
            }
            messages.append(current)
    return messages

# Function for message frequency analysis
def message_frequency(messages):
    """Count how many messages each sender wrote.

    Args:
        messages: Iterable of message dicts that each have a ``'sender'`` key.

    Returns:
        A ``Counter`` mapping sender -> number of messages.
    """
    tally = Counter()
    for entry in messages:
        tally[entry['sender']] += 1
    return tally

# Function for sentiment analysis
def sentiment_analysis(messages):
    """Attach VADER polarity scores to every message, in place.

    Each message dict gains a ``'sentiment'`` key holding whatever
    ``SentimentIntensityAnalyzer.polarity_scores`` returns for its
    ``'message'`` text.

    Args:
        messages: List of message dicts that each have a ``'message'`` key.

    Returns:
        The same ``messages`` object that was passed in.
    """
    analyzer = SentimentIntensityAnalyzer()
    for entry in messages:
        entry['sentiment'] = analyzer.polarity_scores(entry['message'])
    return messages

# Function for word frequency analysis
def word_frequency(messages, top_n=10):
    """Return the most frequent words across all message texts.

    Text is lower-cased and split into ``\\w+`` runs, so punctuation is
    dropped and contractions such as "it's" are counted as "it" and "s".

    Args:
        messages: Iterable of message dicts that each have a ``'message'`` key.
        top_n: How many entries to return (default 10). ``None`` returns
            every word, most frequent first.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count; ties
        keep first-seen order.
    """
    word_re = re.compile(r'\b\w+\b')
    counts = Counter()
    for msg in messages:
        # Count per message instead of accumulating one huge word list.
        counts.update(word_re.findall(msg['message'].lower()))
    return counts.most_common(top_n)

# Function for media type analysis
def media_analysis(messages):
    """Tally messages whose text mentions an image, video or document.

    Matching is a case-insensitive substring test on the message text. Each
    message is counted at most once, under the first keyword that matches in
    the order image, video, document.

    Args:
        messages: Iterable of message dicts that each have a ``'message'`` key.

    Returns:
        A dict with the keys ``'images'``, ``'videos'`` and ``'documents'``
        mapped to their counts.
    """
    # (keyword searched for, bucket it is counted under), in priority order.
    keyword_buckets = (
        ('image', 'images'),
        ('video', 'videos'),
        ('document', 'documents'),
    )
    tally = {bucket: 0 for _, bucket in keyword_buckets}
    for entry in messages:
        text = entry['message'].lower()
        for keyword, bucket in keyword_buckets:
            if keyword in text:
                tally[bucket] += 1
                break
    return tally

# Function to build forensic timeline
def build_timeline(messages, timestamp_format='%m/%d/%Y, %I:%M %p'):
    """Convert message timestamps to ``datetime`` and sort chronologically.

    The list and its dicts are modified in place: every ``'timestamp'``
    string is replaced by the parsed ``datetime`` and the list is sorted by
    it. Timestamps that are already ``datetime`` objects are left untouched,
    so calling this again on an already-built timeline is safe.

    Args:
        messages: List of message dicts that each have a ``'timestamp'`` key.
        timestamp_format: ``strptime`` format of the timestamp strings.
            Defaults to month-first, 12-hour time; pass e.g.
            ``'%d/%m/%Y, %I:%M %p'`` for day-first exports.

    Returns:
        The same ``messages`` list, sorted by timestamp.

    Raises:
        ValueError: If a timestamp string does not match ``timestamp_format``.
    """
    for msg in messages:
        stamp = msg['timestamp']
        # Skip values converted by an earlier call; strptime would raise
        # TypeError on a datetime.
        if not isinstance(stamp, datetime):
            msg['timestamp'] = datetime.strptime(stamp, timestamp_format)
    messages.sort(key=lambda entry: entry['timestamp'])
    return messages

# Main function
def whatsapp_chat_analyzer(file_path):
    """Run every analysis on a chat export and print the results.

    Steps, in order: parse the file, print per-sender message counts, score
    sentiment and print the first five scored messages, print the top words,
    print the media tally, then build the timeline and print its first five
    entries. Nothing is returned.

    Args:
        file_path: Path to the exported WhatsApp chat text file.
    """
    parsed = parse_chat(file_path)

    # Per-sender message counts.
    print("Message Frequency:", message_frequency(parsed))

    # Sentiment scores are attached to the message dicts themselves.
    scored = sentiment_analysis(parsed)
    print("Sentiment Analysis (Sample):", scored[:5])

    # Most common words and the media-mention tally.
    print("Top Words:", word_frequency(scored))
    print("Media Analysis:", media_analysis(scored))

    # Timeline last: it replaces the timestamp strings with datetime objects.
    ordered = build_timeline(scored)
    print("Forensic Timeline (Sample):", ordered[:5])

# Upload file using Google Colab (upload WhatsApp exported chat file)
from google.colab import files
uploaded = files.upload()

# Analyze the first uploaded file, whatever it is called: the filename is
# taken from the keys of `uploaded`, not assumed to be a fixed name.
# NOTE: if nothing was uploaded, the [0] index below raises IndexError.
file_path = list(uploaded.keys())[0]  # Get the filename
whatsapp_chat_analyzer(file_path)




[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Saving chat.txt to chat.txt
Message Frequency: Counter({'Alice': 10, 'Bob': 10})
Sentiment Analysis (Sample): [{'timestamp': '12/25/2024, 10:30 AM', 'sender': 'Alice', 'message': "Hey! How's your day going?", 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, {'timestamp': '12/25/2024, 10:31 AM', 'sender': 'Bob', 'message': "It's been great, thanks for asking! What about you?", 'sentiment': {'neg': 0.0, 'neu': 0.49, 'pos': 0.51, 'compound': 0.807}}, {'timestamp': '12/25/2024, 10:32 AM', 'sender': 'Alice', 'message': "I'm good, just finishing some work. Have you seen the latest movie?", 'sentiment': {'neg': 0.0, 'neu': 0.791, 'pos': 0.209, 'compound': 0.4404}}, {'timestamp': '12/25/2024, 10:33 AM', 'sender': 'Bob', 'message': 'Yes, I watched it last night! It was amazing.', 'sentiment': {'neg': 0.0, 'neu': 0.469, 'pos': 0.531, 'compound': 0.7777}}, {'timestamp': '12/25/2024, 10:34 AM', 'sender': 'Alice', 'message': 'I loved the plot twist! Totally unexpected.', 'sentim