# Data Exploration: DAIC-WOZ Depression Dataset

This notebook explores the DAIC-WOZ dataset and visualizes patterns in depression-related interview transcripts.

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from data.data_processor import load_sample_data, DAICWOZDataProcessor
from utils.visualization import plot_label_distribution, plot_word_cloud

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style plus a
# 12x6 default figure size (individual cells may still pass their own figsize).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries loaded!")

## Load and Process Data

In [None]:
# Transcripts and their labels, as returned by the project's sample loader
transcripts, labels = load_sample_data()

# Combine transcripts and labels into the structured dataset `df`
# that every later cell reads from
processor = DAICWOZDataProcessor('../data')
df = processor.create_dataset(transcripts, labels)

dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")

# Last expression of the cell: preview of the first rows
df.head()

## Basic Statistics

In [None]:
# How many rows carry each label value, and the mean of the label column
# reported as a percentage.
label_counts = df['label'].value_counts()
depression_rate = df['label'].mean()

print("Label Distribution:")
print(label_counts)
print(f"\nDepression rate: {depression_rate:.2%}")

In [None]:
# Descriptive statistics for the text_length and word_count columns
print("Text Length Statistics:")

# Keep the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump that print() would produce.
df[['text_length', 'word_count']].describe()

## Visualizations

In [None]:
# Hand the raw label values to the project's plotting helper
label_values = df['label'].values
plot_label_distribution(label_values)

In [None]:
# Word count distribution by label
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Depression cases
axes[0].hist(df[df['label']==1]['word_count'], bins=20, alpha=0.7, color='red')
axes[0].set_title('Word Count - Depression Cases')
axes[0].set_xlabel('Word Count')
axes[0].set_ylabel('Frequency')

# No depression cases
axes[1].hist(df[df['label']==0]['word_count'], bins=20, alpha=0.7, color='blue')
axes[1].set_title('Word Count - No Depression Cases')
axes[1].set_xlabel('Word Count')
axes[1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Word clouds
depression_texts = df[df['label']==1]['cleaned_transcript'].tolist()
no_depression_texts = df[df['label']==0]['cleaned_transcript'].tolist()

print("Word Cloud - Depression Cases:")
plot_word_cloud(depression_texts, title="Depression Cases")

print("\nWord Cloud - No Depression Cases:")
plot_word_cloud(no_depression_texts, title="No Depression Cases")

## Sample Analysis

In [None]:
# Print the first cleaned transcript of each label under a banner heading.
# A label with no rows gets a placeholder line instead of the IndexError that
# `.iloc[0]` raises on an empty selection (possible on a small sample).
separator = "=" * 80

# (label value, banner heading) in display order
sample_sections = [
    (1, "Sample Depression Case:"),
    (0, "Sample No Depression Case:"),
]

for position, (label_value, heading) in enumerate(sample_sections):
    class_transcripts = df.loc[df['label'] == label_value, 'cleaned_transcript']

    # Every banner after the first is preceded by a blank line
    print(("\n" if position else "") + separator)
    print(heading)
    print(separator)

    if class_transcripts.empty:
        print(f"(no transcripts with label {label_value} in this dataset)")
    else:
        print(class_transcripts.iloc[0])

## Key Observations

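- Depression cases tend to use more negative language than non-depression cases
- Certain keywords appear more frequently in depression cases than in non-depression cases (compare the two word clouds above)
- Text length varies between transcripts, but on its own it is not a strong discriminator between the two classes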