In [None]:
import os
import json
import pandas as pd
import ftfy
import csv
import matplotlib 
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk import ngrams
import calendar
from textblob import TextBlob

In [None]:
# Fetch the NLTK 'punkt' and 'stopwords' resources via nltk.download.
# NOTE(review): presumably these back the `word_tokenize` and `stopwords`
# imports at the top of the notebook — confirm before removing either call.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
class Message:
    """One chat message plus the chat it belongs to.

    Attributes:
        sender_name: Name of the sender.
        receivers: Iterable of receiver names (joined with ', ' by ``to_dict``).
        content: Message text.
        timestamp_ms: Time value of the message, stored exactly as passed in.
        chat_name: Name of the chat the message was read from.
    """

    def __init__(self, sender_name, receivers, content, timestamp_ms, chat_name):
        """Store every argument unchanged on the instance."""
        self.sender_name, self.receivers = sender_name, receivers
        self.content, self.timestamp_ms = content, timestamp_ms
        self.chat_name = chat_name

    def to_dict(self):
        """Return the message as a flat dict; receivers become one ', '-joined string."""
        flat = dict(
            sender_name=self.sender_name,
            receivers=', '.join(self.receivers),
            content=self.content,
            timestamp_ms=self.timestamp_ms,
            chat_name=self.chat_name,
        )
        return flat

In [None]:
def load_env(filename):
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines and lines whose first non-space character is ``#`` are skipped.
    Every other line is split on the *first* ``=`` only, so values may contain
    ``=`` themselves. Whitespace around the key and the value is stripped.

    Args:
        filename: Path of the file to read.

    Raises:
        ValueError: If a non-blank, non-comment line has no ``=`` or an empty key.
        OSError: If the file cannot be opened.
    """
    with open(filename) as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            # partition() splits once, unlike split('=') which breaks on values such as "a=b".
            key, separator, value = line.partition('=')
            key = key.strip()
            if not separator or not key:
                raise ValueError(
                    f"{filename}:{line_number}: expected KEY=VALUE, got {line!r}"
                )
            os.environ[key] = value.strip()

def load_json_file(file_path):
    """Read the UTF-8 encoded file at ``file_path`` and return its decoded JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        parsed = json.loads(source.read())
    return parsed

def process_message(message, participants, chat_name):
    """Build a ``Message`` from one raw message dict.

    Args:
        message: Dict for a single message. ``sender_name`` and ``content`` are
            optional (default ``''``); ``timestamp_ms`` is required.
        participants: Iterable of participant names; every name different from
            the text-fixed sender becomes a receiver.
        chat_name: Name of the chat, passed through unchanged.

    Returns:
        A ``Message`` whose sender, receivers and content went through
        ``ftfy.fix_text`` and whose timestamp is the result of
        ``pd.to_datetime(..., unit='ms')``.

    Raises:
        KeyError: If ``message`` has no ``timestamp_ms`` key.
    """
    fix = ftfy.fix_text
    sender = fix(message.get('sender_name', ''))

    others = []
    for name in participants:
        if name == sender:
            continue
        others.append(fix(name))

    text = fix(message.get('content', ''))
    sent_at = pd.to_datetime(message['timestamp_ms'], unit='ms')
    return Message(sender, others, text, sent_at, chat_name)

def process_chat_folder(folder_path):
    """Read every JSON file of one chat folder and convert its messages.

    Participants are taken from ``message_1.json`` when that file exists,
    otherwise from the first JSON file in sorted order. A folder without any
    JSON file yields no messages instead of raising ``FileNotFoundError``.
    Files are processed in sorted name order so the result does not depend on
    the arbitrary order returned by ``os.listdir``.

    Args:
        folder_path: Path of the chat folder. The chat name is the part of the
            folder's base name before the first underscore.

    Returns:
        A tuple ``(messages, participant_count)``: the list of ``Message``
        objects for all messages not flagged ``is_taken_down``, and the number
        of participants found.
    """
    json_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )
    if not json_names:
        return [], 0

    primary_name = 'message_1.json' if 'message_1.json' in json_names else json_names[0]
    primary_data = load_json_file(os.path.join(folder_path, primary_name))
    participants = [ftfy.fix_text(p['name']) for p in primary_data.get('participants', [])]

    # normpath drops a trailing separator, which would otherwise make basename() return ''.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    chat_name = folder_name.split("_")[0]

    messages = []
    for file_name in json_names:
        if file_name == primary_name:
            data = primary_data  # already parsed above; avoid reading it twice
        else:
            data = load_json_file(os.path.join(folder_path, file_name))
        for msg in data.get('messages', []):
            if not msg.get('is_taken_down'):
                messages.append(process_message(msg, participants, chat_name))

    return messages, len(participants)

def process_folder(folder_path, message_threshold=20, participant_limit=10):
    """Collect the messages of every qualifying chat folder below ``folder_path``.

    A chat qualifies when it has strictly more than ``message_threshold``
    messages and strictly fewer than ``participant_limit`` participants.
    Sub-folders are visited in sorted name order so the output order is
    deterministic; entries that are not directories are ignored.

    Args:
        folder_path: Directory whose sub-directories are chat folders.
        message_threshold: Chats with this many messages or fewer are dropped.
        participant_limit: Chats with this many participants or more are dropped.

    Returns:
        A flat list with the messages of all qualifying chats.
    """
    all_messages = []
    for chat_folder in sorted(os.listdir(folder_path)):
        chat_path = os.path.join(folder_path, chat_folder)
        if not os.path.isdir(chat_path):
            continue
        chat_messages, participant_count = process_chat_folder(chat_path)
        if len(chat_messages) > message_threshold and participant_count < participant_limit:
            all_messages.extend(chat_messages)
    return all_messages

def save_messages_csv(messages, filename):
    """Write ``messages`` to ``filename`` as CSV (no index column) and print a summary.

    Each element of ``messages`` must provide ``to_dict()``; the returned dicts
    become the rows of the file.
    """
    records = []
    for message in messages:
        records.append(message.to_dict())

    frame = pd.DataFrame(records)
    frame.to_csv(filename, index=False)
    print(f"Saved {len(messages)} messages to {filename}")

def load_messages_csv(filename):
    """Rebuild the list of ``Message`` objects from a CSV written by ``save_messages_csv``.

    Text columns are read as plain strings with pandas' default NA detection
    switched off. Without that, an empty ``content`` comes back as float NaN and
    message texts such as "NA" or "null" are silently turned into NaN, which
    differs from what freshly processed messages contain.

    Args:
        filename: Path of the CSV file. It must contain the columns
            ``sender_name``, ``receivers``, ``content``, ``timestamp_ms`` and
            ``chat_name``.

    Returns:
        A list of ``Message`` objects, one per CSV row, in file order.
        ``receivers`` is split on ', ' (an empty cell gives ``[]``) and
        ``timestamp_ms`` is parsed as a datetime.
    """
    text_columns = ['sender_name', 'receivers', 'content', 'chat_name']
    df = pd.read_csv(
        filename,
        parse_dates=['timestamp_ms'],
        dtype={column: str for column in text_columns},
        keep_default_na=False,
    )

    messages = []
    # Column-wise zip avoids the per-row Series construction of iterrows().
    rows = zip(
        df['sender_name'], df['receivers'], df['content'],
        df['timestamp_ms'], df['chat_name'],
    )
    for sender_name, joined_receivers, content, timestamp, chat_name in rows:
        # ''.split(', ') would give [''], so an empty cell is mapped to [] explicitly.
        receivers = joined_receivers.split(', ') if joined_receivers else []
        messages.append(Message(sender_name, receivers, content, timestamp, chat_name))
    return messages

In [None]:
def _require_env(name):
    """Return the non-empty value of environment variable ``name`` or raise RuntimeError."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} is not set; define it in .env")
    return value


def main():
    """Load all messages, from the CSV cache if present, otherwise from the raw exports.

    Reads ``.env`` and then uses ``PROCESSED_FILE`` as the cache path. When the
    cache file does not exist, the folders named by ``INBOX_FOLDER`` and
    ``E2EE_FOLDER`` are processed and the result is written to the cache.

    Returns:
        A tuple ``(df, all_messages)``: a DataFrame built from the messages'
        ``to_dict()`` output and the list of ``Message`` objects.

    Raises:
        RuntimeError: If a required environment variable is missing or empty.
            The folder variables are only required when the cache is absent.
    """
    load_env('.env')

    # Validate first: os.path.exists(None) would fail with an unhelpful TypeError.
    processed_file = _require_env('PROCESSED_FILE')

    if os.path.exists(processed_file):
        print("Loading processed messages from CSV file...")
        all_messages = load_messages_csv(processed_file)
    else:
        # os.listdir(None) lists the current directory, so an unset folder
        # variable must be rejected before any processing starts.
        inbox_folder = _require_env('INBOX_FOLDER')
        e2ee_folder = _require_env('E2EE_FOLDER')

        print("Processing messages...")
        inbox_messages = process_folder(inbox_folder)
        e2ee_messages = process_folder(e2ee_folder)
        all_messages = inbox_messages + e2ee_messages
        save_messages_csv(all_messages, processed_file)

    print(f"Total messages: {len(all_messages)}")

    df = pd.DataFrame([msg.to_dict() for msg in all_messages])
    return df, all_messages

In [None]:
# Run the pipeline when this code executes as the main module; the DataFrame
# and the message list returned by main() are bound to the module-level names
# `df` and `all_messages`.
if __name__ == "__main__":
    df, all_messages = main()