In [None]:
import os
import sys

# Allow imports from src/
sys.path.append(os.path.abspath(".."))

from src.download_daicwoz import extract_daicwoz_transcripts, download_phq_file
from src.context_chunker import  match_phq_transcripts, generate_dataset, build_text_representations, save_text_representations, match_all_phq_transcripts, generate_all_phq_dataset

In [None]:
# Input/output locations, written relative to the notebook's working directory
ZIP_DIR = "../data/raw/zips"
TRANSCRIPT_DIR = "../data/raw/transcripts"
PHQ_FILE_PATH = "../data/raw/phq/phq_scores.csv"
PROCESSED_DATA_DIR = "../data/processed/all_score"

# (label, folder) pairs; the PHQ entry is the folder holding the CSV, not the file itself
_labelled_dirs = [
    ("Download directory", ZIP_DIR),
    ("Transcript directory", TRANSCRIPT_DIR),
    ("PHQ file directory", os.path.dirname(PHQ_FILE_PATH)),
    ("Processed data directory", PROCESSED_DATA_DIR),
]

# Create every folder first (no error if it already exists)
for _label, _folder in _labelled_dirs:
    os.makedirs(_folder, exist_ok=True)

# Then report the resolved absolute location of each one
for _label, _folder in _labelled_dirs:
    print(f"{_label}: {os.path.abspath(_folder)}")


In [None]:
# Arguments for the transcript extraction step: read from ZIP_DIR, write to
# TRANSCRIPT_DIR, with ID bounds 300 and 492; remove_zip=True is passed through as-is.
extraction_kwargs = dict(
    zip_dir=ZIP_DIR,
    out_dir=TRANSCRIPT_DIR,
    start_id=300,
    end_id=492,
    remove_zip=True,
)
extract_daicwoz_transcripts(**extraction_kwargs)

In [None]:
# Request the PHQ-8 label file into the folder that PHQ_FILE_PATH lives in.
# The folder is derived from PHQ_FILE_PATH instead of repeating the path literal, so
# this cell cannot drift from the configuration cell (which creates that same folder).
# NOTE(review): the requested filename differs from the basename of PHQ_FILE_PATH
# ("phq_scores.csv"), which the matching cell reads via meta_csv — confirm that
# download_phq_file ends up writing to that path.
csv_path = download_phq_file(
    filename="Detailed_PHQ8_Labels.csv",
    output_dir=os.path.dirname(PHQ_FILE_PATH)
)

In [None]:
all_phq_dict = match_all_phq_transcripts(
    transcript_dir=TRANSCRIPT_DIR,
    meta_csv=PHQ_FILE_PATH
)
# Show the size and a short preview instead of dumping every participant's scores
# into the notebook output (the full mapping stays available in all_phq_dict).
print(f"Matched participants: {len(all_phq_dict)}")
print(dict(list(all_phq_dict.items())[:5]))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Display labels for the PHQ-8 items; position in this list is the item index (0-7)
PHQ8_ITEM_NAMES = [
    "Little interest / pleasure",
    "Feeling down / hopeless",
    "Sleep problems",
    "Low energy / tired",
    "Poor appetite / overeating",
    "Feeling bad / failure",
    "Trouble concentrating",
    "Motor agitation / retardation"
]

def analyze_phq_item_distribution(phq_dict, show_plots=True):
    """
    Analyze PHQ-8 item-level score distribution, labelled with item names.

    For every item the score counts are printed and, optionally, drawn as a
    bar chart.

    Args:
        phq_dict: mapping {pid: sequence of item scores}, expected to hold one
            score per entry of PHQ8_ITEM_NAMES (e.g. np.array of 8 scores).
            A sequence longer than PHQ8_ITEM_NAMES raises IndexError.
        show_plots: when True, draw one bar chart per item that has at least
            one recorded score.

    Returns:
        df: long-form DataFrame with columns Participant_ID, Item, ItemName, Score
        item_stats: dict[item_index] -> Series of score counts sorted by score
    """
    # Derive the item count from the label list so the two cannot disagree
    n_items = len(PHQ8_ITEM_NAMES)

    # One record per (participant, item) pair
    records = [
        (pid, item_idx, PHQ8_ITEM_NAMES[item_idx], score)
        for pid, vec in phq_dict.items()
        for item_idx, score in enumerate(vec)
    ]
    df = pd.DataFrame(records, columns=["Participant_ID", "Item", "ItemName", "Score"])

    # Collect stats per item
    item_stats = {}

    for item_idx in range(n_items):
        item_name = PHQ8_ITEM_NAMES[item_idx]
        counts = df.loc[df["Item"] == item_idx, "Score"].value_counts().sort_index()
        item_stats[item_idx] = counts

        print(f"\n=== Item {item_idx}: {item_name} ===")
        print(counts)

        # An empty Series cannot be plotted, so items without scores are skipped
        if show_plots and not counts.empty:
            fig, ax = plt.subplots(figsize=(6, 3))
            counts.plot(kind="bar", ax=ax)
            ax.set_title(f"PHQ-8 Item {item_idx}: {item_name}")
            # \u2013 is an en dash; written as an escape so the label survives
            # re-encoding of the notebook file
            ax.set_xlabel("Score (0\u20133)")
            ax.set_ylabel("Count")
            plt.show()
            # Release the figure so one-figure-per-item does not accumulate in memory
            plt.close(fig)

    return df, item_stats
# Build the long-form item table and the per-item score counts, with plots enabled
phq_item_df, phq_item_stats = analyze_phq_item_distribution(
    all_phq_dict,
    show_plots=True,
)

In [None]:
match_all_phq_transcripts = generate_all_phq_dataset(
    transcript_dir=TRANSCRIPT_DIR,
    phq_dict=all_phq_dict
)
print(match_all_phq_transcripts[0])

In [None]:
dataset_word, dataset_sentence, dataset_dialogue = build_text_representations(match_all_phq_transcripts, sequence_len=512, num_samples_per_pid=20)
print(dataset_word[0])
print(dataset_sentence[0])
print(dataset_dialogue[0])


In [None]:
# Hand the word-, sentence- and dialogue-level datasets to the save helper,
# with PROCESSED_DATA_DIR as its output_dir
text_representations = (dataset_word, dataset_sentence, dataset_dialogue)
save_text_representations(*text_representations, output_dir=PROCESSED_DATA_DIR)