In [None]:
import os
import sys

# Allow imports from src/
sys.path.append(os.path.abspath(".."))

from src.download_daicwoz import extract_daicwoz_transcripts, download_phq_file
from src.context_chunker import  match_phq_transcripts, generate_dataset, build_text_representations, save_text_representations

In [None]:
# Data locations. These are relative paths, so they resolve against the
# current working directory of the kernel.
ZIP_DIR = "../data/raw/zips"
TRANSCRIPT_DIR = "../data/raw/transcripts"
PHQ_FILE_PATH = "../data/raw/phq/phq_scores.csv"
PROCESSED_DATA_DIR = "../data/processed/"

# Create every folder up front (a no-op for folders that already exist).
# PHQ_FILE_PATH names a file, so only its parent folder is created.
for _required_dir in (
    ZIP_DIR,
    TRANSCRIPT_DIR,
    os.path.dirname(PHQ_FILE_PATH),
    PROCESSED_DATA_DIR,
):
    os.makedirs(_required_dir, exist_ok=True)

# Echo the resolved absolute paths so a wrong working directory is easy to spot.
print(
    f"Download directory: {os.path.abspath(ZIP_DIR)}",
    f"Transcript directory: {os.path.abspath(TRANSCRIPT_DIR)}",
    f"PHQ file directory: {os.path.abspath(os.path.dirname(PHQ_FILE_PATH))}",
    f"Processed data directory: {os.path.abspath(PROCESSED_DATA_DIR)}",
    sep="\n",
)


In [None]:
# ID range handed to the extractor, named here instead of being buried as bare
# numbers inside the call.
# NOTE(review): whether DAIC_END_ID is inclusive is decided inside
# extract_daicwoz_transcripts — confirm against its implementation.
DAIC_START_ID = 300
DAIC_END_ID = 492

# Extract transcripts from the archives in ZIP_DIR into TRANSCRIPT_DIR.
# NOTE(review): remove_zip=True presumably deletes each archive once it has been
# processed; if so, a later "Restart & Run All" needs the zips to be present
# again (or the extractor must tolerate missing archives) — verify.
extract_daicwoz_transcripts(
    zip_dir=ZIP_DIR,
    out_dir=TRANSCRIPT_DIR,
    start_id=DAIC_START_ID,
    end_id=DAIC_END_ID,
    remove_zip=True,
)

In [None]:
# Fetch the PHQ label file into the folder that PHQ_FILE_PATH lives in. The
# folder is derived from the config constant rather than repeating the path
# literal, so the two cannot drift apart.
# NOTE(review): the file requested here is "Detailed_PHQ8_Labels.csv", while
# PHQ_FILE_PATH (read by match_phq_transcripts below) ends in "phq_scores.csv",
# and the returned csv_path is never used. Confirm that download_phq_file saves
# under the name PHQ_FILE_PATH expects; otherwise pass csv_path downstream.
csv_path = download_phq_file(
    filename="Detailed_PHQ8_Labels.csv",
    output_dir=os.path.dirname(PHQ_FILE_PATH),
)

In [None]:
# Build the PHQ lookup used by the rest of the notebook from the label CSV and
# the extracted transcripts, then print it for a quick visual check.
phq_dict = match_phq_transcripts(meta_csv=PHQ_FILE_PATH, transcript_dir=TRANSCRIPT_DIR)
print(phq_dict)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def analyze_phq_distribution(phq_dict, show_plots=True):
    """
    Summarise how PHQ scores are distributed across participants.

    Args:
        phq_dict: Mapping {Participant_ID: PHQ_Score}. May be empty.
        show_plots: When True and at least one score is present, draw a bar
            chart of the score frequencies.

    Returns:
        df: DataFrame with columns Participant_ID and PHQ_Score
        count_table: Series, PHQ score -> count, ordered by score
        percent_table: Series, PHQ score -> percentage of rows, ordered by score
    """

    # --- Convert dict to dataframe ---
    df = pd.DataFrame(list(phq_dict.items()), columns=["Participant_ID", "PHQ_Score"])

    # --- Basic distribution ---
    scores = df["PHQ_Score"]
    count_table = scores.value_counts().sort_index()
    percent_table = scores.value_counts(normalize=True).sort_index() * 100

    print("\n=== PHQ Score Count ===")
    print(count_table)

    # --- Visualization ---
    # Bar-plotting an empty Series can raise inside pandas, so the chart is
    # skipped when there are no scores to draw.
    if show_plots and not count_table.empty:
        # Explicit figure/axes instead of relying on pyplot's "current axes".
        _fig, ax = plt.subplots(figsize=(10, 4))
        count_table.plot(kind="bar", ax=ax)
        ax.set_xlabel("PHQ Score")
        ax.set_ylabel("Count")
        ax.set_title("PHQ Score Frequency")
        plt.show()

    return df, count_table, percent_table
# Run the analysis on the matched scores (show_plots=True is also the default).
phq_df, phq_count_table, phq_percent_table = analyze_phq_distribution(
    phq_dict, show_plots=True
)

In [None]:
# Build the transcript dataset from the files in TRANSCRIPT_DIR and the PHQ lookup.
all_transcripts = generate_dataset(phq_dict=phq_dict, transcript_dir=TRANSCRIPT_DIR)
# Quick sanity check (disabled): print(all_transcripts[0])

In [None]:
# Derive the three text representations (word, sentence, dialogue) from the
# transcripts, then print the first entry of each, one per line.
(
    dataset_word,
    dataset_sentence,
    dataset_dialogue,
) = build_text_representations(all_transcripts)
print(dataset_word[0], dataset_sentence[0], dataset_dialogue[0], sep="\n")


In [None]:
# Show only the first few word-level entries: displaying the whole dataset
# floods the cell output and bloats the saved notebook.
# NOTE(review): relies on dataset_word supporting len() and integer indexing
# (integer indexing is already used when the first entry is printed) — confirm its type.
[dataset_word[i] for i in range(min(3, len(dataset_word)))]

In [None]:
# Persist all three representations under PROCESSED_DATA_DIR.
save_text_representations(
    dataset_word, dataset_sentence, dataset_dialogue, output_dir=PROCESSED_DATA_DIR
)

## Sun Bin - EDA

- Each dataset entry has the format `(dataset_length, raw_text, label)`.
- Preprocessing pipeline: tokenize → build vocab → convert text to token IDs.

### Previous implementation
- Bahdanau, Cho & Bengio (2015) — Neural Machine Translation by Jointly Learning to Align and Translate (additive attention)
- Yang et al. (2016) — Hierarchical Attention Networks for Document Classification (uses GRU + attention for words → sentences)

Uni-directional Attention

- WORDS → Word-GRU → Word-Attention → Sentence vector
- SENTENCE VECTORS → Sentence-GRU → Sentence-Attention → Document vector

**DOCUMENT VECTOR → Classifier**