In [None]:
import calendar
import csv
import json
import os
import re
from collections import Counter, defaultdict
from datetime import datetime

import ftfy
import matplotlib
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from wordcloud import WordCloud

In [None]:
# Fetch the NLTK data used further down: tokenizer models for word_tokenize
# and the English stop-word list. Recent NLTK releases load the tokenizer
# from 'punkt_tab' rather than 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
class Message:
    """One chat message plus the chat it was read from.

    Attributes:
        sender_name: Name of the sender.
        receivers: List of the other participants' names.
        content: Message text.
        timestamp_ms: Timestamp value exactly as supplied by the caller.
        chat_name: Name of the chat.
    """

    def __init__(self, sender_name, receivers, content, timestamp_ms, chat_name):
        self.sender_name, self.receivers = sender_name, receivers
        self.content, self.timestamp_ms = content, timestamp_ms
        self.chat_name = chat_name

    def to_dict(self):
        """Return a flat dict of the fields; receivers are joined with ', '."""
        record = dict(
            sender_name=self.sender_name,
            receivers=', '.join(self.receivers),
            content=self.content,
            timestamp_ms=self.timestamp_ms,
            chat_name=self.chat_name,
        )
        return record

In [None]:
def process_message(message_data, participants, chat_name):
    """Build a Message from one raw message record.

    Args:
        message_data: Mapping for a single message; must contain
            'timestamp_ms' and may contain 'sender_name' and 'content'.
        participants: Names of everyone in the chat.
        chat_name: Chat identifier; only the part before the first '_' is kept.

    Returns:
        Message whose text fields went through ftfy.fix_text and whose
        timestamp was converted from milliseconds with pd.to_datetime.

    Raises:
        KeyError: If 'timestamp_ms' is missing from message_data.
    """
    fixed_sender = ftfy.fix_text(message_data.get('sender_name', ''))

    # Everyone except the sender counts as a receiver.
    others = []
    for person in participants:
        if person != fixed_sender:
            others.append(ftfy.fix_text(person))

    fixed_content = ftfy.fix_text(message_data.get('content', ''))
    sent_at = pd.to_datetime(message_data['timestamp_ms'], unit='ms')
    short_name = chat_name.partition("_")[0]
    return Message(fixed_sender, others, fixed_content, sent_at, short_name)

def load_json_file(file_path):
    """Read the UTF-8 encoded JSON file at file_path and return the parsed object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

def process_chat_folder(folder_path):
    """Load every message stored in one chat folder.

    All '*.json' files in the folder are parsed once, in sorted file-name
    order so the result is reproducible. Participants come from
    'message_1.json' when it exists, otherwise from the first JSON file
    found. Messages flagged 'is_taken_down' are skipped.

    Args:
        folder_path: Directory holding the chat's JSON files.

    Returns:
        Tuple (messages, participant_count). A folder without any JSON
        file yields ([], 0) instead of raising.
    """
    json_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )
    if not json_names:
        return [], 0

    # Parse each file exactly once; the participant list and the messages
    # are both read from these payloads.
    payloads = {
        name: load_json_file(os.path.join(folder_path, name))
        for name in json_names
    }

    primary = payloads.get('message_1.json', payloads[json_names[0]])
    participants = [ftfy.fix_text(p['name']) for p in primary.get('participants', [])]
    chat_name = os.path.basename(folder_path)

    messages = []
    for name in json_names:
        for msg in payloads[name].get('messages', []):
            if not msg.get('is_taken_down'):
                messages.append(process_message(msg, participants, chat_name))

    return messages, len(participants)

def process_folder(folder_path, min_messages=20, max_participants=10):
    """Load all chats found directly under folder_path.

    Each sub-directory is handed to process_chat_folder. A chat is kept only
    when it has more than min_messages messages and fewer than
    max_participants participants.

    Args:
        folder_path: Directory whose sub-directories are chat folders.
        min_messages: Chats need strictly more messages than this.
        max_participants: Chats need strictly fewer participants than this.

    Returns:
        Dict mapping chat folder name to its list of messages.
    """
    all_messages = {}

    # Sorted so the resulting dict order does not depend on the filesystem.
    for chat_folder in sorted(os.listdir(folder_path)):
        chat_path = os.path.join(folder_path, chat_folder)
        if not os.path.isdir(chat_path):
            continue

        chat_messages, participant_count = process_chat_folder(chat_path)
        if len(chat_messages) > min_messages and participant_count < max_participants:
            all_messages[chat_folder] = chat_messages

    return all_messages

def save_messages_csv(messages, filename):
    """Write every message of every chat to filename as a single CSV table.

    Args:
        messages: Mapping of chat name to a list of objects exposing to_dict().
        filename: Destination CSV path.
    """
    rows = []
    for chat_messages in messages.values():
        rows.extend(msg.to_dict() for msg in chat_messages)

    pd.DataFrame(rows).to_csv(filename, index=False)
    print(f"Saved {len(rows)} messages to {filename}")

def load_messages_csv(filename):
    """Rebuild the chat -> messages mapping from a CSV written by save_messages_csv.

    Text is read verbatim: pandas' default NA parsing is disabled so that a
    message whose text is literally "NA", "null", "nan", ... is not turned
    into NaN, and an empty content/receivers cell comes back as '' / [].

    Args:
        filename: Path of the CSV file.

    Returns:
        defaultdict(list) mapping chat_name to a list of Message objects.
    """
    df = pd.read_csv(filename, parse_dates=['timestamp_ms'], keep_default_na=False)
    all_messages = defaultdict(list)

    columns = ('sender_name', 'receivers', 'content', 'timestamp_ms', 'chat_name')
    # Zipping the columns avoids the per-row Series that iterrows() builds.
    for sender, receivers, content, sent_at, chat in zip(*(df[c] for c in columns)):
        has_receivers = isinstance(receivers, str) and receivers != ''
        receiver_list = receivers.split(', ') if has_receivers else []
        all_messages[chat].append(Message(sender, receiver_list, content, sent_at, chat))

    return all_messages

In [None]:
def main():
    """Load all messages, from the CSV cache if present, else from the raw export.

    Configuration is read from '.env' / the environment:
        PROCESSED_FILE: CSV cache path (always required).
        INBOX_FOLDER, E2EE_FOLDER: raw export folders (required only when
            the cache does not exist yet).

    Returns:
        Tuple (df, all_messages): df has one row per message and
        all_messages maps chat name to its list of Message objects.

    Raises:
        ValueError: If a required environment variable is not set.
    """
    load_dotenv('.env')

    inbox_folder = os.getenv('INBOX_FOLDER')
    e2ee_folder = os.getenv('E2EE_FOLDER')
    processed_file = os.getenv('PROCESSED_FILE')

    if not processed_file:
        raise ValueError("Environment variable PROCESSED_FILE is not set")

    if os.path.exists(processed_file):
        print("Loading processed messages from CSV file...")
        all_messages = load_messages_csv(processed_file)
    else:
        missing = [
            name
            for name, value in (('INBOX_FOLDER', inbox_folder), ('E2EE_FOLDER', e2ee_folder))
            if not value
        ]
        if missing:
            raise ValueError(f"Environment variable(s) not set: {', '.join(missing)}")

        print("Processing messages...")
        all_messages = dict(process_folder(inbox_folder))
        # Concatenate instead of overwriting when both folders contain a
        # chat with the same key, so no inbox messages are dropped.
        for chat, chat_messages in process_folder(e2ee_folder).items():
            all_messages[chat] = all_messages.get(chat, []) + chat_messages

        save_messages_csv(all_messages, processed_file)

    records = [msg.to_dict() for chat_messages in all_messages.values() for msg in chat_messages]
    # len(all_messages) is the number of chats, so count the rows instead.
    print(f"Total messages: {len(records)} in {len(all_messages)} chats")

    df = pd.DataFrame(records)
    return df, all_messages

In [None]:
# Run the loading pipeline. main() returns df (one row per message) and
# all_messages (chat name -> list of Message); the cells below read df.
if __name__ == "__main__":
    df, all_messages = main()

In [None]:
def top_contacts(df, top_n=10):
    """Plot and return the chats with the most messages.

    Args:
        df: DataFrame with a 'chat_name' column, one row per message.
        top_n: Number of chats to keep.

    Returns:
        Series of message counts indexed by chat name, largest first.
    """
    ranking = df['chat_name'].value_counts()
    contact_counts = ranking.iloc[:top_n]

    names, totals = contact_counts.index, contact_counts.values
    plt.figure(figsize=(12, 6))
    sns.barplot(x=totals, y=names)
    plt.title(f'Top {top_n} Contacts by Number of Messages')
    plt.xlabel('Number of Messages')
    plt.ylabel('Contact')
    plt.show()

    return contact_counts


# Plot the 20 chats with the most messages and print the counts.
top_contacts_result = top_contacts(df, top_n=20)
print(top_contacts_result)

In [None]:
def message_activity(df, chat_name):
    """Plot and return the number of messages per calendar day for one chat.

    Args:
        df: DataFrame with 'chat_name' and datetime 'timestamp_ms' columns.
        chat_name: Value of 'chat_name' to select.

    Returns:
        DataFrame with columns 'date' and 'count', one row per active day.
    """
    # assign() builds a new frame, so the 'date' column is never written
    # onto a filtered slice of df (which triggers SettingWithCopyWarning).
    chat_df = df[df['chat_name'] == chat_name].assign(
        date=lambda frame: frame['timestamp_ms'].dt.date
    )
    daily_counts = chat_df.groupby('date').size().reset_index(name='count')

    plt.figure(figsize=(15, 6))
    plt.plot(daily_counts['date'], daily_counts['count'])
    plt.title(f'Message Activity with {chat_name}')
    plt.xlabel('Date')
    plt.ylabel('Number of Messages')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return daily_counts

# "John_Doe" looks like a sample value. NOTE(review): process_message keeps
# only the text before the first "_" of the folder name as chat_name, so a
# name containing "_" may match no rows -- confirm against df['chat_name'].
message_activity_result = message_activity(df, "John_Doe")

In [None]:
def top_words(df, top_n=20, exclude_stopwords=True):
    """Plot and return the most frequent alphanumeric words across all messages.

    Args:
        df: DataFrame with a 'content' column.
        top_n: Number of words to keep.
        exclude_stopwords: Drop NLTK English stop words when True.

    Returns:
        List of (word, count) tuples, most frequent first. The list is empty
        (and nothing is plotted) when no word survives the filtering.
    """
    # astype(str) guards the join against non-string cells.
    text = ' '.join(df['content'].dropna().astype(str)).lower()
    tokens = [token for token in word_tokenize(text) if token.isalnum()]

    if exclude_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

    most_common = Counter(tokens).most_common(top_n)
    if not most_common:
        # zip(*[]) cannot be unpacked into two names; there is nothing to plot.
        return most_common

    words, counts = zip(*most_common)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=list(counts), y=list(words))
    plt.title(f'Top {top_n} Words Used')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.show()

    return most_common

# Most frequent words over all chats, using the function's defaults.
top_words_result = top_words(df)
print(top_words_result)

In [None]:
def generate_wordcloud(df):
    """Render a word cloud from the lower-cased text of all non-null messages.

    Args:
        df: DataFrame with a 'content' column.
    """
    lowered_text = ' '.join(df['content'].dropna()).lower()
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate(lowered_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Messages')
    plt.show()

# Word cloud over the text of every loaded message.
generate_wordcloud(df)

In [None]:
def daily_messaging_pattern(df):
    """Plot and return the number of messages sent in each hour of the day.

    The caller's DataFrame is left untouched. All 24 hours are present in
    the result (0 for hours without messages), matching how
    weekly_messaging_pattern fills missing weekdays, so bar positions line
    up with the 0-23 tick marks.

    Args:
        df: DataFrame with a datetime 'timestamp_ms' column.

    Returns:
        DataFrame with columns 'hour' (0-23) and 'count'.
    """
    hourly_counts = (
        df[['timestamp_ms']]
        .assign(hour=lambda frame: frame['timestamp_ms'].dt.hour)
        .groupby('hour')
        .size()
        .reindex(range(24), fill_value=0)
        .rename_axis('hour')
        .reset_index(name='count')
    )

    plt.figure(figsize=(12, 6))
    sns.barplot(x='hour', y='count', data=hourly_counts)
    plt.title('Daily Messaging Pattern')
    plt.xlabel('Hour of Day')
    plt.ylabel('Number of Messages')
    plt.xticks(range(0, 24))
    plt.show()

    return hourly_counts

# Message counts by hour of day across all chats.
daily_pattern = daily_messaging_pattern(df)
print(daily_pattern)

In [None]:
def weekly_messaging_pattern(df):
    """Plot and return the number of messages sent on each day of the week.

    The caller's DataFrame is left untouched, and counts stay integers
    (missing weekdays are filled with 0 at reindex time rather than through
    NaN, which would turn the counts into floats).

    Args:
        df: DataFrame with a datetime 'timestamp_ms' column.

    Returns:
        Series of counts indexed 0-6 by 'day_of_week' (0 = Monday).
    """
    daily_counts = (
        df[['timestamp_ms']]
        .assign(day_of_week=lambda frame: frame['timestamp_ms'].dt.dayofweek)
        .groupby('day_of_week')
        .size()
        .reindex(range(7), fill_value=0)
        .rename_axis('day_of_week')
    )

    plt.figure(figsize=(12, 6))
    sns.barplot(x=daily_counts.index, y=daily_counts.values)
    plt.title('Weekly Messaging Pattern')
    plt.xlabel('Day of Week')
    plt.ylabel('Number of Messages')
    # calendar.day_abbr starts on Monday, same as pandas' dayofweek.
    plt.xticks(range(7), list(calendar.day_abbr))
    plt.show()

    return daily_counts

# Message counts by day of week across all chats.
weekly_pattern = weekly_messaging_pattern(df)
print(weekly_pattern)

In [None]:
def analyze_message_initiation(df, ME, top_n=10, hours_threshold=8):
    """Count who starts conversations and plot the most frequent initiators.

    Within each chat, a message starts a new conversation when it is the
    first message of the chat or arrives more than hours_threshold hours
    after the previous one. The caller's DataFrame is not modified.

    Args:
        df: DataFrame with 'chat_name', 'sender_name' and 'timestamp_ms'.
        ME: Sender name excluded from the ranking and counted separately.
        top_n: Number of initiators to plot and return.
        hours_threshold: Silence, in hours, that separates two conversations.

    Returns:
        Tuple (top_initiators, your_initiations): a Series of initiation
        counts for the top_n other senders, and the number of conversations
        started by ME.
    """
    ordered = (
        df[['chat_name', 'sender_name', 'timestamp_ms']]
        .assign(timestamp_ms=lambda frame: pd.to_datetime(frame['timestamp_ms']))
        .sort_values(['chat_name', 'timestamp_ms'])
    )

    gap = ordered.groupby('chat_name')['timestamp_ms'].diff()
    # The first message of a chat has no predecessor (NaT gap). NaT compares
    # False against any threshold, so it has to be included explicitly.
    starts = ordered[gap.isna() | (gap > pd.Timedelta(hours=hours_threshold))]

    started_by_me = starts['sender_name'] == ME
    initiator_counts = starts.loc[~started_by_me, 'sender_name'].value_counts()
    top_initiators = initiator_counts.nlargest(top_n)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_initiators.values, y=top_initiators.index)
    plt.title(f'Top {top_n} Conversation Initiators (Excluding You)')
    plt.xlabel('Number of Conversations Initiated')
    plt.ylabel('Sender')
    plt.tight_layout()
    plt.show()

    your_initiations = int(started_by_me.sum())

    return top_initiators, your_initiations


# Sender name treated as "you" by analyze_message_initiation
# (presumably the account owner's display name -- confirm).
ME = "Tiến Dũng Nguyễn"  
top_initiators, your_initiations = analyze_message_initiation(df, ME, top_n=10)

print(f"Number of conversations you initiated: {your_initiations}")
print(f"\nTop {len(top_initiators)} conversation initiators:")
print(top_initiators)
# NOTE(review): top_initiators is already truncated to top_n, so this is the
# number of rows printed above rather than a count of all initiators.
print("\nTotal initiators:", len(top_initiators))

In [None]:
def message_length_distribution(df):
    """Plot a histogram of message lengths and return summary statistics.

    Lengths are computed in a separate frame, so the caller's DataFrame
    does not gain a 'message_length' column.

    Args:
        df: DataFrame with a string 'content' column.

    Returns:
        Series produced by describe() on the character counts.
    """
    lengths = df['content'].str.len().rename('message_length')

    plt.figure(figsize=(12, 6))
    sns.histplot(data=lengths.to_frame(), x='message_length', bins=50, kde=True)
    plt.title('Distribution of Message Lengths')
    plt.xlabel('Message Length (characters)')
    plt.ylabel('Frequency')
    plt.show()

    return lengths.describe()

# As the last expression of the cell, the describe() summary is displayed.
message_length_distribution(df)

In [None]:
def sentiment_over_time(df):
    """Plot the mean TextBlob polarity of messages per calendar day.

    Scores are computed in a separate frame, so the caller's DataFrame does
    not gain 'sentiment' or 'date' columns.

    Args:
        df: DataFrame with 'content' and 'timestamp_ms' columns.
    """
    scored = df[['timestamp_ms', 'content']].assign(
        # str() lets TextBlob accept non-string cells such as NaN.
        sentiment=lambda frame: frame['content'].apply(
            lambda text: TextBlob(str(text)).sentiment.polarity
        ),
        date=lambda frame: pd.to_datetime(frame['timestamp_ms']).dt.date,
    )
    daily_sentiment = scored.groupby('date')['sentiment'].mean().reset_index()

    plt.figure(figsize=(15, 6))
    plt.plot(daily_sentiment['date'], daily_sentiment['sentiment'])
    plt.title('Average Sentiment Over Time')
    plt.xlabel('Date')
    plt.ylabel('Average Sentiment')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.tight_layout()
    plt.show()

# Scores every message with TextBlob and plots the daily mean polarity.
sentiment_over_time(df)

In [None]:
def message_length_vs_sentiment(df):
    """Scatter-plot message length (characters) against TextBlob polarity.

    Both series are kept local, so the caller's DataFrame does not gain
    'message_length' or 'sentiment' columns.

    Args:
        df: DataFrame with a 'content' column.
    """
    content = df['content']
    message_length = content.str.len()
    # str() lets TextBlob accept non-string cells such as NaN.
    sentiment = content.apply(lambda text: TextBlob(str(text)).sentiment.polarity)

    plt.figure(figsize=(12, 6))
    plt.scatter(message_length, sentiment, alpha=0.5)
    plt.title('Message Length vs. Sentiment')
    plt.xlabel('Message Length (characters)')
    plt.ylabel('Sentiment')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.tight_layout()
    plt.show()

# One point per message: length in characters against polarity.
message_length_vs_sentiment(df)

In [None]:
def top_ngrams(df, n=2, top_k=20):
    """Plot the most frequent word n-grams found inside individual messages.

    Each message is tokenized on its own, so an n-gram never joins the end
    of one message to the start of an unrelated one. Nothing is plotted
    when there are no n-grams at all.

    Args:
        df: DataFrame with a 'content' column.
        n: Size of the n-grams.
        top_k: Number of n-grams to plot.
    """
    n_gram_freq = Counter()
    for message in df['content'].dropna().astype(str):
        tokens = nltk.word_tokenize(message.lower())
        n_gram_freq.update(ngrams(tokens, n))

    top_n_grams = n_gram_freq.most_common(top_k)
    if not top_n_grams:
        # zip(*[]) cannot be unpacked into labels/values.
        return

    labels, values = zip(*top_n_grams)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(labels)), values)
    plt.xticks(range(len(labels)), [' '.join(label) for label in labels], rotation=45, ha='right')
    plt.title(f'Top {top_k} {n}-grams')
    plt.xlabel(f'{n}-gram')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

# n=2 selects bigrams; the 20 most frequent are plotted.
top_ngrams(df, n=2, top_k=20)  # For bigrams

In [None]:
def message_activity_heatmap(df):
    """Draw a weekday x hour heatmap of message counts.

    The grid always has 7 rows (Mon-Sun) and 24 columns (hours 0-23), with
    0 where no message was sent, so the weekday labels stay attached to the
    right rows even when a day or hour is absent from the data. The
    caller's DataFrame is not modified.

    Args:
        df: DataFrame with a 'timestamp_ms' column.
    """
    stamps = pd.to_datetime(df['timestamp_ms'])
    slots = pd.DataFrame({
        'day': stamps.dt.dayofweek.to_numpy(),
        'hour': stamps.dt.hour.to_numpy(),
    })

    activity = (
        slots.groupby(['day', 'hour'])
        .size()
        .unstack(fill_value=0)
        .reindex(index=range(7), columns=range(24), fill_value=0)
    )

    plt.figure(figsize=(12, 8))
    # Labels are passed to the heatmap so they sit on the row centres.
    sns.heatmap(
        activity,
        cmap='YlOrRd',
        yticklabels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
    )
    plt.title('Message Activity Heatmap')
    plt.xlabel('Hour of Day')
    plt.ylabel('Day of Week')
    plt.tight_layout()
    plt.show()

# Weekday-by-hour activity over all chats.
message_activity_heatmap(df)

In [None]:
def conversation_flow(df, chat_name):
    """Scatter-plot the gap between consecutive messages of one chat over time.

    Args:
        df: DataFrame with 'chat_name' and datetime 'timestamp_ms' columns.
        chat_name: Value of 'chat_name' to select.
    """
    in_chat = df['chat_name'] == chat_name
    ordered = df[in_chat].sort_values('timestamp_ms')
    gap_minutes = ordered['timestamp_ms'].diff().dt.total_seconds() / 60  # in minutes
    ordered = ordered.assign(time_diff=gap_minutes)

    plt.figure(figsize=(15, 6))
    plt.scatter(ordered['timestamp_ms'], ordered['time_diff'], alpha=0.5)
    plt.title(f'Conversation Flow with {chat_name}')
    plt.xlabel('Date')
    plt.ylabel('Time between messages (minutes)')
    plt.yscale('log')
    plt.tight_layout()
    plt.show()

# "John_Doe" looks like a sample value. NOTE(review): process_message keeps
# only the text before the first "_" of the folder name as chat_name, so a
# name containing "_" may match no rows -- confirm against df['chat_name'].
conversation_flow(df, "John_Doe") 

In [None]:
def message_length_over_time(df):
    """Plot the mean message length (characters) per calendar day.

    Derived columns live in a separate frame, so the caller's DataFrame
    does not gain 'date' or 'message_length' columns.

    Args:
        df: DataFrame with 'content' and 'timestamp_ms' columns.
    """
    sized = df[['timestamp_ms', 'content']].assign(
        date=lambda frame: pd.to_datetime(frame['timestamp_ms']).dt.date,
        message_length=lambda frame: frame['content'].str.len(),
    )
    daily_avg_length = sized.groupby('date')['message_length'].mean().reset_index()

    plt.figure(figsize=(15, 6))
    plt.plot(daily_avg_length['date'], daily_avg_length['message_length'])
    plt.title('Average Message Length Over Time')
    plt.xlabel('Date')
    plt.ylabel('Average Message Length (characters)')
    plt.tight_layout()
    plt.show()

# Daily mean message length across all chats.
message_length_over_time(df)

In [None]:
def most_active_time_periods(df, period='month'):
    """Bar-plot the number of messages per month, week or day.

    The period labels are computed in a local Series, so the caller's
    DataFrame does not gain a 'period' column.

    Args:
        df: DataFrame with a 'timestamp_ms' column.
        period: One of 'month', 'week' or 'day'.

    Raises:
        ValueError: If period is not one of the supported values.
    """
    # Validate before doing any work on the data.
    if period not in ('month', 'week', 'day'):
        raise ValueError("Period must be 'month', 'week', or 'day'")

    stamps = pd.to_datetime(df['timestamp_ms'])
    if period == 'month':
        periods = stamps.dt.to_period('M')
    elif period == 'week':
        periods = stamps.dt.to_period('W')
    else:
        periods = stamps.dt.date

    period_counts = periods.value_counts().sort_index()

    plt.figure(figsize=(15, 6))
    period_counts.plot(kind='bar')
    plt.title(f'Most Active {period.capitalize()} Periods')
    plt.xlabel(period.capitalize())
    plt.ylabel('Number of Messages')
    plt.tight_layout()
    plt.show()

# Uses the default period='month'.
most_active_time_periods(df)

In [None]:
def conversation_pace(df, chat_name):
    """Plot the distribution of gaps (minutes) between consecutive messages.

    Only gaps up to the 95th percentile are binned and shown, so the 50
    bins cover the visible range instead of being stretched over a few
    very long silences. A chat with fewer than two messages has no gaps;
    in that case a note is printed and nothing is plotted.

    Args:
        df: DataFrame with 'chat_name' and datetime 'timestamp_ms' columns.
        chat_name: Value of 'chat_name' to select.
    """
    ordered = df[df['chat_name'] == chat_name].sort_values('timestamp_ms')
    gaps = (ordered['timestamp_ms'].diff().dt.total_seconds() / 60).dropna()  # in minutes

    if gaps.empty:
        # quantile() of nothing is NaN and plt.xlim rejects NaN limits.
        print(f"Not enough messages with {chat_name} to measure conversation pace.")
        return

    upper = gaps.quantile(0.95)
    shown = gaps[gaps <= upper].to_frame('time_diff')

    plt.figure(figsize=(12, 6))
    sns.histplot(data=shown, x='time_diff', bins=50, kde=True)
    plt.title(f'Conversation Pace with {chat_name}')
    plt.xlabel('Time Between Messages (minutes)')
    plt.ylabel('Frequency')
    if upper > 0:
        plt.xlim(0, upper)  # Limit x-axis to 95th percentile
    plt.tight_layout()
    plt.show()

# "John_Doe" looks like a sample value. NOTE(review): process_message keeps
# only the text before the first "_" of the folder name as chat_name, so a
# name containing "_" may match no rows -- confirm against df['chat_name'].
conversation_pace(df, "John_Doe")

In [None]:
def word_usage_evolution(df, word, window_size='M'):
    """Plot how often a literal piece of text occurs in messages over time.

    The word is escaped before counting because Series.str.count treats
    its argument as a regular expression; text such as 'c++' or '?' is
    therefore matched literally. Matching is case-sensitive and counts
    substrings. The caller's DataFrame is not modified.

    Args:
        df: DataFrame with 'content' and 'timestamp_ms' columns.
        word: Text to count.
        window_size: pandas resampling frequency used to bucket the counts.
    """
    pattern = re.escape(word)
    counted = df[['timestamp_ms', 'content']].assign(
        date=lambda frame: pd.to_datetime(frame['timestamp_ms']),
        word_count=lambda frame: frame['content'].str.count(pattern),
    )
    word_usage = counted.set_index('date').resample(window_size)['word_count'].sum()

    plt.figure(figsize=(15, 6))
    word_usage.plot()
    plt.title(f'Usage of "{word}" Over Time')
    plt.xlabel('Date')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

# Occurrences of "đi" per period, using the default window_size.
word_usage_evolution(df, "đi")

In [None]:
def conversation_starter_words(df, top_n=10):
    """Plot the most frequent words in messages that open a conversation.

    Messages are ordered by time within each chat before gaps are measured;
    a message opens a conversation when it is the first of its chat or
    follows more than 8 hours of silence. Messages without text are
    ignored, and the caller's DataFrame is not modified.

    Args:
        df: DataFrame with 'chat_name', 'timestamp_ms' and 'content'.
        top_n: Number of words to plot.
    """
    ordered = (
        df[['chat_name', 'timestamp_ms', 'content']]
        .assign(timestamp_ms=lambda frame: pd.to_datetime(frame['timestamp_ms']))
        .sort_values(['chat_name', 'timestamp_ms'])
    )
    # Gaps are only meaningful on chronologically ordered rows.
    gap = ordered.groupby('chat_name')['timestamp_ms'].diff()
    is_new_conversation = gap.isna() | (gap > pd.Timedelta(hours=8))
    starter_messages = ordered.loc[is_new_conversation, 'content'].dropna().astype(str)

    words = [
        word.lower()
        for message in starter_messages
        for word in word_tokenize(message)
        if word.isalnum()
    ]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    top_starters = Counter(filtered_words).most_common(top_n)
    if not top_starters:
        print("No conversation starter words found.")
        return

    plt.figure(figsize=(12, 6))
    sns.barplot(x=[count for _, count in top_starters], y=[word for word, _ in top_starters])
    plt.title(f'Top {top_n} Conversation Starter Words')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.tight_layout()
    plt.show()

# Uses the default top_n=10.
conversation_starter_words(df)