## Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import shapiro, norm, spearmanr, stats
import sys
from pathlib import Path


# project_dir is the directory three levels above the current working
# directory (parents[0] is the direct parent, so parents[2] is the third).
# Its "code/" sub-folder is appended to the module search path.
working_dir = Path().resolve()
project_dir = working_dir.parents[2]
sys.path.append(str(project_dir / "code/"))

## Get Data

In [None]:

# %% ############################# Variables and data
data_dir = project_dir / "data/"
file_name = ""  # Excel workbook inside data_dir; must be filled in before running

# Load the workbook and replace every "." in the column names with "__"
# (presumably so the columns can be referenced in DataFrame.query strings).
data = pd.read_excel(data_dir / file_name).rename(
    columns=lambda column_name: column_name.replace(".", "__")
)

# Keep parent-reported rows only. A restriction to the first visit
# (visnr == 1) is currently not applied.
data = data.query("reporter == 'parent'")

# Optional restriction to one sex, chosen interactively. Any answer other than
# "1" or "2" leaves the data unfiltered (sex_filter then stays a string).
sex_filter = input("Filter by sex (1 = male, 2 = female): ").strip()
if sex_filter in ('1', '2'):
    sex_filter = int(sex_filter)
    data = data.query("sex12 == @sex_filter")

def _query_safe(names):
    """Return a copy of *names* in which every "." is replaced by "__".

    This is the same substitution that is applied to the DataFrame column
    names after loading, so the resulting strings match the renamed columns.
    """
    return [name.replace(".", "__") for name in names]


# SDQ outcome variable(s); later cells use the first entry only.
sdq_vars = _query_safe([
    'e_sdq.d00149_hyp_sum',
])
# Fixed label: this line reports the SDQ names, not the voice names.
print("Modified SDQ variable names:", sdq_vars)

# Voice measures: f0 and SPL for the five speech tasks, plus MPT, jitter, DSI.
voice_vars = _query_safe([
    'stimme.f0_sprech_1',
    'stimme.f0_sprech_2',
    'stimme.f0_sprech_3',
    'stimme.f0_sprech_4',
    'stimme.f0_sprech_5',
    'stimme.spl_sprech_1',
    'stimme.spl_sprech_2',
    'stimme.spl_sprech_3',
    'stimme.spl_sprech_4',
    'stimme.spl_sprech_5',
    'stimme.mpt',
    'stimme.jitter',
    'stimme.dsi',
])
print("Modified voice variable names:", voice_vars)

# Covariates carried along with the SDQ and voice variables.
covariates = _query_safe([
    'age',
    'sex12',
    'soz_winkler_2019.d00408_gesamt_score',
    'c_pub_stat.d00077_pub_status',
    'c_anthro_kh.d00040_bmi_sds',
])
print("Modified covariate names:", covariates)

# Complete-case filter: a row is kept only if none of the columns below is
# missing. stimme__f0_sprech_1 is used as the voice criterion because,
# according to the original author, it is the voice feature with the biggest
# overlap with the SDQ score.
# NOTE(review): the filter checks ..._gesamt_status, while the covariate list
# uses ..._gesamt_score — confirm that this is intended.
required_columns = [
    sdq_vars[0],
    "stimme__f0_sprech_1",
    "soz_winkler_2019__d00408_gesamt_status",
    "c_anthro_kh__d00040_bmi_sds",
]
data = data.dropna(subset=required_columns)

# Columns kept for the analysis: a fixed set of additional columns followed by
# the covariates, the SDQ variable(s) and the voice variables.
additional_columns = [
    "pseudosic", "sgroup", "visnr", "nvis", "sex", "jahr",
    "soz_winkler_2019__d00408_gesamt_status",
    "c_pub_stat__d00077_stimmbruch",
    "c_pub_stat__d00077_stimmbruch_wann",
]
selected_columns = additional_columns + covariates + sdq_vars + voice_vars

# Restrict to those columns and renumber the rows from 0.
data = data[selected_columns].reset_index(drop=True)

data.head()


In [None]:
# The SDQ column to compare against (only the first entry of sdq_vars is used).
sdq_var = sdq_vars[0]

# For each voice variable, count the rows in which both the SDQ value and the
# voice value are present.
sdq_present = data[sdq_var].notna()
overlap_counts = {
    voice_var: int((sdq_present & data[voice_var].notna()).sum())
    for voice_var in voice_vars
}

# One row per voice variable, largest overlap first.
overlap_df = pd.DataFrame.from_dict(
    overlap_counts, orient='index', columns=['n_overlap_with_SDQ']
).sort_values(by='n_overlap_with_SDQ', ascending=False)

# Show the table in the notebook.
print("Overlap counts between SDQ and each voice variable:")
display(overlap_df)

## Descriptive Statistics

### Descriptive analysis

In [None]:
# Show floats with two decimals in pandas output from here on.
pd.set_option('display.float_format', '{:.2f}'.format)

# describe() summary with one row per column of `data`.
desc_stats = data.describe(include='all').T

# Median and range are computed for the numeric columns only; rows belonging
# to other columns receive NaN through index alignment.
numeric_columns = data.select_dtypes(include=['number']).columns
numeric_data = data[numeric_columns]
desc_stats['median'] = numeric_data.median()
desc_stats['range'] = numeric_data.max() - numeric_data.min()

# First row of mode(): one modal value per column.
desc_stats['mode'] = data.mode().iloc[0]

# Number of missing values per column and their share of all rows in percent.
missing_counts = data.isnull().sum()
desc_stats['missing_values'] = missing_counts
desc_stats['missing_percentage'] = (missing_counts / len(data)) * 100

print("Descriptive Statistics:")
display(desc_stats)

# Excel export of the table is currently disabled:
#output_file = "descriptive_statistics_allvisits_female_02.07.2025.xlsx"
#desc_stats.to_excel(output_file, index=True)
#print(f"Descriptive statistics saved to {output_file}")


### T-Test (boys vs. girls)

In [None]:
# The `stats` name imported at the top of the file is the deprecated
# `scipy.stats.stats` module; take ttest_ind from scipy.stats directly instead.
from scipy.stats import ttest_ind

# Variables compared between the sexes: the SDQ score, all voice measures and
# the covariates other than sex itself.
variables = [
    'e_sdq__d00149_hyp_sum',
    'stimme__f0_sprech_1', 'stimme__f0_sprech_2', 'stimme__f0_sprech_3',
    'stimme__f0_sprech_4', 'stimme__f0_sprech_5',
    'stimme__spl_sprech_1', 'stimme__spl_sprech_2', 'stimme__spl_sprech_3',
    'stimme__spl_sprech_4', 'stimme__spl_sprech_5',
    'stimme__mpt', 'stimme__jitter', 'stimme__dsi',
    'soz_winkler_2019__d00408_gesamt_score', 'c_pub_stat__d00077_pub_status',
    'c_anthro_kh__d00040_bmi_sds', 'age',
]

# Split once by sex so the loop does not filter repeatedly.
males = data[data['sex'] == 'male']
females = data[data['sex'] == 'female']

# Fixed column layout, so the result table has its columns even when no test
# could be run (e.g. after restricting the data to one sex, which leaves the
# other group empty).
result_columns = [
    'Feature', 'T-Statistic', 'P-Value',
    'N-Male', 'N-Female', 'Mean-Male', 'Mean-Female',
]

# One Welch t-test (unequal variances) per variable.
results = []
for feature in variables:
    if feature not in data.columns:
        continue  # variable not present in this dataset

    male_values = males[feature].dropna()
    female_values = females[feature].dropna()
    if male_values.empty or female_values.empty:
        continue  # nothing to compare for this variable

    t_stat, p_value = ttest_ind(male_values, female_values, equal_var=False)

    results.append({
        'Feature': feature,
        'T-Statistic': t_stat,
        # Stored rounded to three decimals, so very small p-values appear as 0.0.
        'P-Value': round(p_value, 3),
        # Sample sizes and means are based on the non-missing values only.
        'N-Male': male_values.count(),
        'N-Female': female_values.count(),
        'Mean-Male': male_values.mean(),
        'Mean-Female': female_values.mean(),
    })

# Collect the results in a table with the fixed column layout.
results_df = pd.DataFrame(results, columns=result_columns)

# Save the table to Excel in the current working directory.
excel_filename = "t_test_results_AllVisits.xlsx"
results_df.to_excel(excel_filename, index=False)
print(f"Results saved to {excel_filename}.")

# Display results
print(results_df)


# Short display labels for the plot, keyed by dataset column name.
variable_name_mapping = {
    "e_sdq__d00149_hyp_sum": "SDQ_HI",
    "stimme__f0_sprech_1": "f0_quiet_I",
    "stimme__f0_sprech_2": "f0_conversation_II",
    "stimme__f0_sprech_3": "f0_presentation_III",
    "stimme__f0_sprech_4": "f0_loud_IV",
    "stimme__f0_sprech_5": "f0_quiet_V",
    "stimme__spl_sprech_1": "spl_quiet_I",
    "stimme__spl_sprech_2": "spl_conversation_II",
    "stimme__spl_sprech_3": "spl_presentation_III",
    "stimme__spl_sprech_4": "spl_loud_IV",
    "stimme__spl_sprech_5": "spl_quiet_V",
    "stimme__mpt": "MPT",
    "stimme__jitter": "Jitter",
    "stimme__dsi": "DSI",
    "age": "Age",
    "soz_winkler_2019__d00408_gesamt_score": "SES",
    "c_pub_stat__d00077_pub_status": "Pubertal status",
    "c_anthro_kh__d00040_bmi_sds": "BMI_SDS",
}


def _display_label(column_name):
    """Return the plot label for *column_name*, or the name itself if unmapped."""
    return variable_name_mapping.get(column_name, column_name)


# Relabel a copy for plotting; results_df itself keeps the column names.
plot_results_df = results_df.copy()
plot_results_df["Feature"] = plot_results_df["Feature"].map(_display_label)

# Horizontal bar chart of the p-values with the 0.05 threshold marked.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=plot_results_df,
    x="P-Value",
    y="Feature",
    palette="Blues_r",
    edgecolor="black",
    ax=ax,
)
ax.axvline(x=0.05, color="red", linestyle="--", label="Significance Threshold (0.05)")
ax.legend()
ax.set_title("T-Test Results: P-Values for Features (Males vs. Females)")
ax.set_xlabel("P-Value")
ax.set_ylabel("Feature")

# Write the figure to a PNG file at 300 DPI.
plot_filename = "t_test_p_values.png"
fig.savefig(plot_filename, dpi=300, bbox_inches="tight")