# WhatsApp Chat Analysis - NLP & Content Analysis

This notebook focuses on the content of the messages: what words are used, which emojis are popular, and the overall sentiment of the conversation.

## 1. Import Libraries & Load Data

In [None]:
import pandas as pd
import plotly.express as px
import sys
import os
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import emoji
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon only when it cannot be found locally
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Make ../src importable so the project's parser module can be loaded.
# The membership check keeps this cell idempotent: re-running it no longer
# appends the same directory to sys.path again and again.
# NOTE(review): the module name `parser` is also the name of a standard-library
# module that existed up to Python 3.9; since the directory is appended (lowest
# priority), this import presumably relies on Python >= 3.10 — confirm.
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
from parser import WhatsAppParser

# Load Data
file_path = '../data/WhatsApp Chat with gg bOys.txt'
parser = WhatsAppParser(file_path)
df = parser.parse()

# Filter out "Media omitted" messages.
# na=False makes missing messages evaluate to "does not contain", so after the
# negation those rows are kept in df_text.
df_text = df[~df['Message'].str.contains('<Media omitted>', case=False, na=False)].copy()
print(f"Messages for Text Analysis: {len(df_text)}")

## 2. Word Frequency & Word Cloud
What are the most common words? We need to remove stopwords to see meaningful content.

In [None]:
# Custom stopword list (including Hinglish/Urdu terms).
# Every entry is lowercase on purpose: tokens are lowercased before they are
# compared against this set, so an uppercase entry such as 'GIF' could never match.
STOPWORDS = {
    # Common English function words
    'the', 'is', 'in', 'to', 'and', 'a', 'of', 'for', 'it', 'i', 'you', 'my',
    'that', 'on', 'with', 'this', 'be', 'at',
    # WhatsApp placeholders and chat filler
    'media', 'omitted', 'image', 'video', 'sticker', 'gif', 'lol', 'ok', 'okay',
    'yeah', 'yes', 'no', 'haha', 'message', 'deleted',
    # Hinglish / Urdu function words
    'hai', 'ki', 'ke', 'ka', 'se', 'ko', 'aur', 'mai', 'bhi', 'tha', 'nahi',
    'kya', 'kar', 'ho', 'ab', 'wo',
}

def clean_text(text):
    """Normalise a message for word counting.

    The value is coerced to ``str`` and lowercased, after which every
    character that is neither an ASCII letter ``a``-``z`` nor whitespace is
    deleted (not replaced by a space).

    Returns the cleaned string.
    """
    return re.sub(r'[^a-z\s]', '', str(text).lower())

# Concatenate all cleaned messages, then keep words that are longer than two
# characters and are not stopwords.
all_text = ' '.join(df_text['Message'].apply(clean_text))
words = [word for word in all_text.split() if word not in STOPWORDS and len(word) > 2]

# Generate Word Cloud
# Prefer an explicit font for consistent rendering, but only when the file is
# actually present. A missing font file surfaces as OSError while drawing, not
# ValueError, so it is checked up front instead of being caught afterwards.
font_path = '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'
wordcloud_options = dict(width=800, height=400, background_color='black', colormap='viridis')
if os.path.isfile(font_path):
    wordcloud_options['font_path'] = font_path

if words:
    wordcloud = WordCloud(**wordcloud_options).generate(' '.join(words))

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Most Common Words in Chat')
    plt.show()
else:
    # Generating a cloud from zero words raises ValueError; report it instead.
    wordcloud = None
    print("No words left after cleaning and stopword removal; skipping word cloud.")

## 3. Emoji Analysis
Which emojis are used the most across the whole chat?

In [None]:
def extract_emojis(text):
    """Return every emoji found in ``text``, in order of appearance.

    Emojis are matched as whole sequences via ``emoji.emoji_list`` rather than
    one code point at a time, so skin-tone variants, ZWJ sequences and flags
    are each returned as a single item instead of being split into (or missed
    as) their component characters.

    Non-string values (e.g. a missing message) and empty strings yield an
    empty list instead of raising.
    """
    if not isinstance(text, str) or not text:
        return []
    return [match['emoji'] for match in emoji.emoji_list(text)]

# Attach the list of emojis found in each message as a new column
df['Emojis'] = df['Message'].apply(extract_emojis)

# Flatten the per-message lists into one long list
all_emojis = []
for message_emojis in df['Emojis']:
    all_emojis.extend(message_emojis)

# Ten most frequent emojis as (emoji, count) pairs, most common first
emoji_counts = Counter(all_emojis).most_common(10)
emoji_df = pd.DataFrame(emoji_counts, columns=['Emoji', 'Count'])

fig_emoji = px.bar(
    emoji_df,
    x='Emoji',
    y='Count',
    title='Top 10 Most Used Emojis',
    template='plotly_dark',
    color='Count',
    color_continuous_scale='Magma',
)
fig_emoji.show()

## 4. Sentiment Analysis (VADER)
VADER (Valence Aware Dictionary and sEntiment Reasoner) is specifically tuned for social media sentiment. Note that its lexicon is English, so Hinglish/Urdu messages will mostly score as neutral.

In [None]:
# One analyzer instance, created once and reused for every message
sia = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the VADER ``compound`` polarity score for ``text``.

    The value is coerced to ``str`` and scored as-is (no cleaning), because
    VADER works best on raw text that still has its emojis and capitalisation.
    """
    polarity = sia.polarity_scores(str(text))
    return polarity['compound']

# Score every text message and store the result alongside it
df_text['Sentiment'] = df_text['Message'].apply(get_sentiment)

# Average Sentiment per User, most positive author first
mean_by_author = df_text.groupby('Author')['Sentiment'].mean()
user_sentiment = mean_by_author.reset_index().sort_values('Sentiment', ascending=False)

fig_sent = px.bar(
    user_sentiment,
    x='Author',
    y='Sentiment',
    title='Average Sentiment Score per User (Positivity)',
    color='Sentiment',
    color_continuous_scale='RdBu',
    range_color=[-0.5, 0.5],
    template='plotly_dark',
)
fig_sent.show()

# Sentiment Over Time (Monthly Average)
# Each message is labelled with its calendar month as a string
month_periods = df_text['DateTime'].dt.to_period('M')
df_text['Month'] = month_periods.astype(str)
mean_by_month = df_text.groupby('Month')['Sentiment'].mean()
monthly_sentiment = mean_by_month.reset_index()

fig_sent_time = px.line(
    monthly_sentiment,
    x='Month',
    y='Sentiment',
    title='Sentiment Trend Over Time',
    markers=True,
    template='plotly_dark',
)
fig_sent_time.show()