# Depression Text Analysis: Word Frequency & Correlation Study

## Project Overview
This notebook analyzes linguistic patterns in psychologist-patient conversations to understand correlations between word usage and depression levels.

**Approach (following professor's recommendations):**
1. Start with simple word frequency distributions
2. Analyze correlations between word frequencies and depression levels  
3. Progressively move to advanced text processing techniques

**Dataset:** Interview transcripts from AVEC 2017 Depression Recognition Challenge
- Raw transcripts in `data/raw/`
- Depression labels (PHQ scores) in `data/labels/processed/`
- ~190 participants with binary depression classification

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter

# Import our custom modules
from data_loader import DataLoader
from text_preprocessing import TextPreprocessor, CustomStopwords
from frequency_analysis import WordFrequencyAnalyzer, CorrelationAnalyzer
from visualization import DataVisualizer
import config

print("✓ All libraries loaded successfully")

## Section 1: Load and Explore Data

Let's start by loading the raw transcripts and depression labels to understand our dataset.

In [None]:
# Create the loader, rooted at the relative '../data' directory
data_loader = DataLoader('../data')

# Fetch the label table (use_processed=True is forwarded to the loader)
labels = data_loader.load_labels(use_processed=True)

# Structural overview first, then a short preview and the per-column dtypes
print(
    "Depression Labels Dataset:",
    f"  Shape: {labels.shape}",
    f"  Columns: {list(labels.columns)}",
    sep="\n",
)
print("\n  First 5 rows:", labels.head(), sep="\n")
print("\n  Data types:", labels.dtypes, sep="\n")

In [None]:
# Join transcripts with labels; the loader hands back the frame plus a
# metadata object that is read by string key below
corpus_df, metadata = data_loader.create_corpus_with_labels()

# Headline counts and PHQ score summary taken from the metadata
print(
    "Combined Corpus Statistics:",
    f"  Total participants: {metadata['n_total_participants']}",
    f"  Depressed (PHQ_Binary=1): {metadata['n_depressed']}",
    f"  Non-Depressed (PHQ_Binary=0): {metadata['n_non_depressed']}",
    "\n  PHQ Score Statistics:",
    f"    Mean: {metadata['mean_phq_score']:.2f}",
    f"    Min: {metadata['min_phq_score']}",
    f"    Max: {metadata['max_phq_score']}",
    sep="\n",
)

print(
    f"\n  Corpus Shape: {corpus_df.shape}",
    f"  Columns: {list(corpus_df.columns)}",
    "\n  First 3 records:",
    sep="\n",
)

# Preview at most three rows, truncating each transcript to 80 characters
preview_count = min(3, len(corpus_df))
for idx in range(preview_count):
    row = corpus_df.iloc[idx]
    text_preview = f"{row['text'][:80]}..."
    print(
        f"\n  Participant {row['Participant_ID']}: PHQ={row['PHQ_Score']}, Binary={row['PHQ_Binary']}",
        f"    Text: {text_preview}",
        sep="\n",
    )

In [None]:
# Visualize depression distribution: PHQ score histogram (left) and
# binary class counts (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Continuous PHQ scores, with the sample mean drawn as a dashed reference line
mean_phq = corpus_df['PHQ_Score'].mean()
axes[0].hist(corpus_df['PHQ_Score'], bins=15, color='steelblue', edgecolor='black', alpha=0.7)
axes[0].set_xlabel('PHQ Score')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of PHQ Scores (Continuous)')
axes[0].axvline(mean_phq, color='red', linestyle='--',
               label=f'Mean: {mean_phq:.1f}')
axes[0].legend()

# Binary distribution
binary_counts = corpus_df['PHQ_Binary'].value_counts()
# value_counts() has no entry for a class with zero rows, so look each class
# up with a default of 0 instead of indexing (which would raise KeyError)
class_counts = [binary_counts.get(label, 0) for label in (0, 1)]
n_rows = len(corpus_df)

colors = ['lightgreen', 'lightcoral']
bars = axes[1].bar(['Non-Depressed (0)', 'Depressed (1)'],
                    class_counts,
                    color=colors)
axes[1].set_ylabel('Number of Participants')
axes[1].set_title('Binary Depression Classification')

# Add count and percentage labels on top of each bar
for bar, count in zip(bars, class_counts):
    height = bar.get_height()
    pct = 100 * count / n_rows
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                f'{int(count)}\n({pct:.1f}%)',
                ha='center', va='bottom')

plt.tight_layout()
plt.show()

print(f"\nClass distribution:")
print(f"  Non-Depressed: {class_counts[0]} ({100*class_counts[0]/n_rows:.1f}%)")
print(f"  Depressed: {class_counts[1]} ({100*class_counts[1]/n_rows:.1f}%)")

## Section 2: Text Preprocessing and Cleaning

Now let's clean and preprocess the text data. We'll:
- Convert to lowercase
- Remove special characters and punctuation
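- Remove common stopwords (when `config.REMOVE_STOPWORDS` is enabled)
- Tokenize into words
- Optionally lemmatize tokens (when `config.LEMMATIZE` is enabled)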

In [None]:
# Build the preprocessor from the switches defined in config
preprocessor = TextPreprocessor(
    remove_stopwords=config.REMOVE_STOPWORDS,
    lemmatize=config.LEMMATIZE,
)

# Walk the first transcript through each stage to show what it does
sample_text = corpus_df.iloc[0]['text']
print("PREPROCESSING EXAMPLE", "=" * 60, sep="\n")
print(f"\nOriginal text:\n  {sample_text[:150]}...")

# Stage 1: text cleaning only (first 150 characters shown)
cleaned = preprocessor.clean_text(sample_text)
print(f"\nAfter cleaning:\n  {cleaned[:150]}...")

# Stage 2: the full pipeline, which returns a sliceable sequence of tokens
tokens = preprocessor.process(sample_text)
print(
    f"\nTokenized and processed (first 20 tokens):\n  {tokens[:20]}",
    f"\nTotal tokens: {len(tokens)}",
    sep="\n",
)

In [None]:
# Run the whole corpus through the preprocessor in one batch call
print(f"\nProcessing {len(corpus_df)} texts...")
processed_tokens = preprocessor.process_batch(corpus_df['text'].values)

# Per-document token counts drive the summary statistics below
token_counts = list(map(len, processed_tokens))
print("\n✓ Preprocessing complete!")
print(
    "\nStatistics after preprocessing:",
    f"  Total documents: {len(processed_tokens)}",
    f"  Average tokens per document: {np.mean(token_counts):.1f}",
    f"  Median tokens per document: {np.median(token_counts):.1f}",
    f"  Max tokens in a document: {np.max(token_counts)}",
    f"  Min tokens in a document: {np.min(token_counts)}",
    sep="\n",
)

## Section 3: Word Frequency Analysis (Starting Point)

This is the simplest approach and the recommended starting point. We'll analyze:
1. Most frequent words across all participants
2. Word frequency distributions

In [None]:
# Frequency analyzer configured with the minimum-frequency setting from config
freq_analyzer = WordFrequencyAnalyzer(config.MIN_WORD_FREQUENCY)

# Corpus-wide word counts; the result is used as a word -> count mapping below
word_freq = freq_analyzer.compute_frequencies(processed_tokens)

print("WORD FREQUENCY ANALYSIS", "=" * 60, sep="\n")
print(
    f"\nTotal unique words: {len(word_freq)}",
    f"Total word occurrences: {sum(word_freq.values())}",
    sep="\n",
)

# Ranked listing of the 20 most frequent words
top_words = freq_analyzer.get_top_words(n=20)
print("\nTop 20 Most Frequent Words:")
for rank, (word, freq) in enumerate(top_words, start=1):
    print(f"  {rank:2d}. {word:15s} - {freq:4d} occurrences")

# Plot the same ranking
DataVisualizer.plot_top_words(top_words, title="Top 20 Most Frequent Words")
plt.show()

## Section 4: Correlate Word Frequencies with Depression Levels

Now let's find words that are associated with depression. We'll compute correlations between word frequencies and PHQ scores.

In [None]:
# Binary depression label per participant, aligned with processed_tokens
phq_binary = corpus_df['PHQ_Binary'].values

# Let the analyzer tally word counts separately for each label value
freq_analyzer.compute_frequencies_by_group(processed_tokens, phq_binary)

print("WORD FREQUENCY BY DEPRESSION STATUS", "=" * 60, sep="\n")

# Print the top-10 list for each group: heading first, then the lookup,
# then the ranked rows
top_by_group = {}
for group_id, heading in (
    (0, "\nTop 10 words in NON-DEPRESSED group (PHQ_Binary=0):"),
    (1, "\nTop 10 words in DEPRESSED group (PHQ_Binary=1):"),
):
    print(heading)
    top_by_group[group_id] = freq_analyzer.get_top_words(n=10, group=group_id)
    for rank, (word, freq) in enumerate(top_by_group[group_id], start=1):
        print(f"  {rank:2d}. {word:15s} - {freq:4d}")
top_non_dep = top_by_group[0]
top_dep = top_by_group[1]

# Side-by-side comparison of the overall top-15 words across the two groups
top_all = freq_analyzer.get_top_words(n=15)
words = [entry[0] for entry in top_all]
freq_group0 = [freq_analyzer.word_freq_by_group[0].get(term, 0) for term in words]
freq_group1 = [freq_analyzer.word_freq_by_group[1].get(term, 0) for term in words]

DataVisualizer.plot_word_frequency_comparison(
    words,
    freq_group0,
    freq_group1,
    group1_label="Non-Depressed (PHQ=0)",
    group2_label="Depressed (PHQ=1)",
)
plt.show()

In [None]:
# Continuous PHQ score per participant, aligned with processed_tokens
phq_scores = corpus_df['PHQ_Score'].values

# Build the word-frequency matrix, then correlate it with the scores using
# the method named in config
corr_analyzer = CorrelationAnalyzer(processed_tokens, phq_scores)
corr_analyzer.build_frequency_matrix(config.MIN_WORD_FREQUENCY)
corr_analyzer.compute_correlations(method=config.CORRELATION_METHOD, phq_binary=phq_binary)

print("\nCORRELATION ANALYSIS", "=" * 60, sep="\n")
print(
    f"Method: {config.CORRELATION_METHOD}",
    f"Vocabulary size: {corr_analyzer.word_freq_matrix.shape[1]} words",
    sep="\n",
)

# Strongest associations in each direction
top_positive = corr_analyzer.get_top_correlated_words(n=15, positive=True)
top_negative = corr_analyzer.get_top_correlated_words(n=15, positive=False)

for heading, hint, ranked in (
    (
        "\nTop 15 words POSITIVELY correlated with depression:",
        "(Higher frequency = more likely to be depressed)",
        top_positive,
    ),
    (
        "\nTop 15 words NEGATIVELY correlated with depression:",
        "(Higher frequency = less likely to be depressed)",
        top_negative,
    ),
):
    print(heading)
    print(hint)
    for rank, (word, corr) in enumerate(ranked, start=1):
        print(f"  {rank:2d}. {word:15s} - correlation: {corr:+.4f}")

# Plot both directions together: positive list first, then negative
combined = top_positive + top_negative
all_words = [term for term, _ in combined]
all_corr = [value for _, value in combined]

DataVisualizer.plot_correlations(
    all_words,
    all_corr,
    title=f"Word-Depression Correlations ({config.CORRELATION_METHOD})",
)
plt.tight_layout()
plt.show()

## Section 5: Visualize Frequency Distributions

Let's create additional visualizations to better understand the data patterns.

In [None]:
# Number of processed tokens in each document
text_lengths = list(map(len, processed_tokens))

# Length distribution, with the binary labels passed along to the plotter
DataVisualizer.plot_text_length_distribution(text_lengths, phq_binary)
plt.show()

print(
    "Text Length Statistics:",
    f"  Mean: {np.mean(text_lengths):.1f} words",
    f"  Median: {np.median(text_lengths):.1f} words",
    f"  Std Dev: {np.std(text_lengths):.1f} words",
    f"  Range: {np.min(text_lengths)} - {np.max(text_lengths)} words",
    sep="\n",
)

In [None]:
# Per-word counts, largest first
freq_values = sorted(word_freq.values(), reverse=True)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
linear_ax, log_ax = axes[0], axes[1]

# Left panel: plain histogram with the mean frequency marked
linear_ax.hist(freq_values, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
linear_ax.set_xlabel('Word Frequency')
linear_ax.set_ylabel('Count')
linear_ax.set_title('Word Frequency Distribution (Linear Scale)')
linear_ax.axvline(
    np.mean(freq_values),
    color='red',
    linestyle='--',
    label=f'Mean: {np.mean(freq_values):.1f}',
)
linear_ax.legend()

# Right panel: same histogram, but with a logarithmic y-axis
log_ax.hist(freq_values, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
log_ax.set_xlabel('Word Frequency')
log_ax.set_ylabel('Count (log scale)')
log_ax.set_yscale('log')
log_ax.set_title('Word Frequency Distribution (Log Scale)')

plt.tight_layout()
plt.show()

print(
    "Word Frequency Statistics:",
    f"  Mean frequency: {np.mean(freq_values):.2f}",
    f"  Median frequency: {np.median(freq_values):.2f}",
    f"  Max frequency: {np.max(freq_values)}",
    f"  Min frequency: {np.min(freq_values)}",
    sep="\n",
)

## Section 6: Advanced Text Processing Approaches (Foundation for Future Work)

Beyond simple word frequency, here are more advanced techniques you can explore:

In [None]:
# TF-IDF Analysis (Term Frequency - Inverse Document Frequency)
from sklearn.feature_extraction.text import TfidfVectorizer

print("ADVANCED TECHNIQUES PREVIEW")
print("=" * 60)

# Prepare texts: the vectorizer is fit on the raw transcript strings from
# corpus_df, not on processed_tokens
texts = corpus_df['text'].values

# TF-IDF, capped at 50 features and filtered by document frequency
tfidf = TfidfVectorizer(max_features=50, min_df=2, max_df=0.8)
tfidf_matrix = tfidf.fit_transform(texts)
feature_names = tfidf.get_feature_names_out()

# Rank terms by their mean TF-IDF weight over all documents.
# get_feature_names_out() lists the vocabulary in column order, so slicing its
# first 20 entries would not give the highest-weighted terms.
tfidf_overall = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
top_overall_tfidf = np.argsort(tfidf_overall)[-20:][::-1]

print(f"\n1. TF-IDF Analysis")
print(f"   - Considers both word frequency AND rarity across documents")
print(f"   - Matrix shape: {tfidf_matrix.shape}")
print(f"   - Top TF-IDF terms: {[feature_names[i] for i in top_overall_tfidf]}")

# TF-IDF importance for each class: mean weight per term within each group
print(f"\n   Top TF-IDF features by depression status:")
tfidf_depressed = np.asarray(tfidf_matrix[phq_binary == 1].mean(axis=0)).ravel()
tfidf_non_depressed = np.asarray(tfidf_matrix[phq_binary == 0].mean(axis=0)).ravel()

# argsort is ascending, so take the last five indices and reverse them
top_dep_tfidf = np.argsort(tfidf_depressed)[-5:][::-1]
top_non_dep_tfidf = np.argsort(tfidf_non_depressed)[-5:][::-1]

print(f"   Depressed: {[feature_names[i] for i in top_dep_tfidf]}")
print(f"   Non-Depressed: {[feature_names[i] for i in top_non_dep_tfidf]}")

In [None]:
# N-grams Analysis
from collections import Counter

def get_ngrams(tokens, n=2):
    """Extract contiguous n-grams from a sequence of tokens.

    Args:
        tokens: Sequence of string tokens; it is sliced and each slice is
            joined with single spaces.
        n: Number of tokens per n-gram. Must be at least 1.

    Returns:
        List of n-grams in order of appearance, each a space-joined string.
        The list is empty when ``tokens`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # A non-positive n makes the slices below empty or misaligned, which
        # would silently produce meaningless strings instead of n-grams
        raise ValueError(f"n must be >= 1, got {n}")
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

print(f"\n2. N-grams Analysis")
print(f"   - Captures word sequences and context")
print(f"   - Examples: bigrams (2 words), trigrams (3 words), etc.")

# Extract and report bigrams for each group with one shared loop instead of
# two copy-pasted stanzas. The depressed group (label 1) is processed first,
# then the non-depressed group (label 0).
bigrams_by_group = {}
for group_label, group_name in ((1, "DEPRESSED"), (0, "NON-DEPRESSED")):
    # Concatenate the bigrams of every document whose label matches the group
    group_bigrams = [
        bigram
        for idx, binary in enumerate(phq_binary)
        if binary == group_label
        for bigram in get_ngrams(processed_tokens[idx], n=2)
    ]
    group_bigram_freq = Counter(group_bigrams)

    print(f"\n   Top bigrams in {group_name} group:")
    for bigram, freq in group_bigram_freq.most_common(5):
        print(f"     '{bigram}' - {freq} occurrences")

    bigrams_by_group[group_label] = (group_bigrams, group_bigram_freq)

# Keep the per-group names available for any later cells
depressed_bigrams, depressed_bigram_freq = bigrams_by_group[1]
non_depressed_bigrams, non_depressed_bigram_freq = bigrams_by_group[0]

In [None]:
# Key Findings Summary: recap of values computed in the cells above
print("\n" + "=" * 60)
print("KEY FINDINGS SUMMARY")
print("=" * 60)

total_docs = len(corpus_df)
depressed_total = (phq_binary == 1).sum()
non_depressed_total = (phq_binary == 0).sum()

print(f"\n1. DATASET OVERVIEW:")
print(f"   - Total participants: {total_docs}")
print(f"   - Depressed: {depressed_total} ({100*depressed_total/total_docs:.1f}%)")
print(f"   - Non-depressed: {non_depressed_total} ({100*non_depressed_total/total_docs:.1f}%)")

print(f"\n2. WORD FREQUENCY INSIGHTS:")
print(f"   - Total unique words: {len(word_freq)}")
if top_words:
    # Guarded so an empty ranking does not raise IndexError
    print(f"   - Most common word: '{top_words[0][0]}' ({top_words[0][1]} occurrences)")
print(f"   - Average word frequency: {np.mean(list(word_freq.values())):.2f}")

print(f"\n3. DEPRESSION-RELATED PATTERNS:")
if top_positive:
    print(f"   - Words associated with depression:")
    for word, corr in top_positive[:3]:
        print(f"     • '{word}' (correlation: {corr:+.4f})")

print(f"\n4. TEXT CHARACTERISTICS:")
print(f"   - Average text length: {np.mean(text_lengths):.0f} words")
# Report the measured per-group averages rather than asserting, without
# computing it, that the groups have similar lengths
lengths_by_doc = np.asarray(text_lengths)
for group_label, group_name in ((0, "Non-depressed"), (1, "Depressed")):
    group_lengths = lengths_by_doc[phq_binary == group_label]
    if group_lengths.size:
        print(f"   - {group_name} average text length: {group_lengths.mean():.0f} words")

print(f"\n5. RECOMMENDATIONS FOR NEXT STEPS:")
print(f"   ✓ Current analysis: Simple word frequencies & correlations")
print(f"   → Next steps:")
print(f"      1. Explore TF-IDF for importance weighting")
print(f"      2. Analyze n-grams for contextual patterns")
print(f"      3. Apply machine learning (classification models)")
print(f"      4. Investigate semantic patterns (word embeddings)")
print(f"      5. Analyze linguistic features (sentiment, pronouns, etc.)")