# WhatsApp Chat Analysis - Exploratory Data Analysis (EDA)

This notebook contains the parsing and basic statistical analysis of the WhatsApp chat data.

## 1. Import Libraries & Load Data

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import sys
import os

# Make the project's `src` directory importable so the local `parser` module
# (the one that provides WhatsAppParser) can be found.
# The directory goes at the FRONT of sys.path: Python < 3.10 ships a
# standard-library module that is also named `parser`, and with `append` that
# module can be found first, which makes the import below fail.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:  # don't stack duplicate entries when the cell is re-run
    sys.path.insert(0, src_dir)
from parser import WhatsAppParser

# Load Data
# NOTE: both relative paths are resolved against the kernel's working directory.
file_path = '../data/WhatsApp Chat with gg bOys.txt'
parser = WhatsAppParser(file_path)
df = parser.parse()

# Quick sanity check: row count plus a preview of the first rows.
print(f"Total Messages: {len(df)}")
df.head()

## 2. Volume Analysis
Who sends the most messages?

In [None]:
# Tally messages per author. value_counts() returns the authors ordered from
# most to least frequent, so the bars come out in descending order.
user_counts = (
    df['Author']
    .value_counts()
    .rename_axis('User')
    .reset_index(name='Message Count')
)

# One bar per user, coloured by user.
fig_vol = px.bar(
    user_counts,
    x='User',
    y='Message Count',
    color='User',
    title='Total Messages per User',
    template='plotly_dark',
)
fig_vol.show()

## 3. Temporal Analysis
How has the conversation volume changed over time?

In [None]:
# Tag every message with its calendar month, stored as a string so Plotly
# treats the x axis as ordered categories.
month_period = df['DateTime'].dt.to_period('M')
df['YearMonth'] = month_period.astype(str)

# Number of messages in each month, sorted by the month label.
monthly_counts = (
    df['YearMonth']
    .value_counts()
    .sort_index()
    .rename_axis('YearMonth')
    .reset_index(name='Count')
)

fig_time = px.line(
    monthly_counts,
    x='YearMonth',
    y='Count',
    markers=True,
    title='Message Volume Over Time',
    template='plotly_dark',
)
fig_time.show()

## 4. Activity Heatmap
When is the group most active? (Day of Week vs. Hour of Day)

In [None]:
# Derive the two heatmap axes from each message's timestamp.
df['Hour'] = df['DateTime'].dt.hour
df['DayOfWeek'] = df['DateTime'].dt.day_name()

# Order days correctly (Monday first) instead of alphabetically.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One row per (day, hour) combination that occurs in the data, with its message count.
heatmap_data = df.groupby(['DayOfWeek', 'Hour']).size().reset_index(name='Count')

# density_heatmap is histogram-based and re-bins numeric axes automatically,
# which can merge several hours into a single cell. nbinsx=24 asks for one
# bin per hour of the day. The data is already aggregated, so the per-cell
# value is the sum of 'Count' (histfunc is spelled out to make that explicit).
fig_heatmap = px.density_heatmap(heatmap_data, x='Hour', y='DayOfWeek', z='Count',
                                 histfunc='sum', nbinsx=24,
                                 title='Activity Heatmap: Day vs Hour',
                                 category_orders={'DayOfWeek': days_order},
                                 color_continuous_scale='Viridis', template='plotly_dark')
fig_heatmap.show()

## 5. Interaction Dynamics
**Conversation Initiators**: Who sends the first message after a period of silence (here, a gap of more than 2 hours since the previous message)?

In [None]:
# A "New Conversation" starts with a message sent after more than this many
# hours of silence. Change the value here to tune the definition.
SILENCE_THRESHOLD_HOURS = 2

# Gap between each message and the one in the previous row, in hours.
# NOTE(review): this assumes df rows are in chronological order - confirm
# against the parser's output.
df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds() / 3600 # In Hours

# The first row has no predecessor, so its TimeDiff is NaN and the comparison
# below is False: the very first message is not counted as an initiation.
df['IsNewConv'] = df['TimeDiff'] > SILENCE_THRESHOLD_HOURS

# Count how many conversations each author started.
initiator_counts = df.loc[df['IsNewConv'], 'Author'].value_counts().reset_index()
initiator_counts.columns = ['User', 'Initiations']

fig_init = px.pie(initiator_counts, names='User', values='Initiations', title='Who Inputs New Topics? (Conversation Initiators)',
                  template='plotly_dark')
fig_init.show()