In [9]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Function to load and preprocess the dataset (English or German)
def preprocess_data(feature_file, report_file, n_components=3):
    """Load features and self-reports, reduce the features with PCA, and build model inputs.

    Every column of the feature sheet except ``Recording`` and ``Speaker`` is
    standardized and projected onto principal components. The component scores
    are merged with the self-report sheet on ``["Recording", "Speaker"]``
    (``pandas.merge`` default, i.e. an inner join) and standardized once more.

    Args:
        feature_file: Excel file readable by ``pandas.read_excel`` holding the
            ``Recording``/``Speaker`` identifiers plus the feature columns.
        report_file: Excel file readable by ``pandas.read_excel`` holding the
            identifiers plus ``Emotional Resilience`` and ``Cognitive Load``.
        n_components: Forwarded to ``sklearn.decomposition.PCA``. All components
            PCA produces are used as predictors (previously only PC1-PC3 were
            selected regardless of this value).

    Returns:
        Tuple ``(X_scaled, y_resilience, y_cognitive_load)``: the standardized
        component scores as returned by ``StandardScaler.fit_transform`` and the
        two target columns of the merged frame.

    Raises:
        ValueError: If the merge leaves no rows, i.e. the two sheets share no
            (Recording, Speaker) pair.

    Note:
        Scaling and PCA are fitted on all rows here, before any train/test
        split done by the caller, so held-out rows influence the preprocessing.
    """
    id_cols = ["Recording", "Speaker"]
    df_features = pd.read_excel(feature_file)
    df_self_reports = pd.read_excel(report_file)

    feature_cols = df_features.columns.difference(id_cols)
    df_scaled = StandardScaler().fit_transform(df_features[feature_cols])

    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)
    # Name the columns after what PCA actually returned so the selection below
    # always matches n_components instead of a fixed PC1..PC3 list.
    pc_cols = [f"PC{i + 1}" for i in range(principal_components.shape[1])]
    df_pca = pd.DataFrame(principal_components, columns=pc_cols)

    df_pca["Recording"] = df_features["Recording"]
    df_pca["Speaker"] = df_features["Speaker"]
    df_merged = pd.merge(df_pca, df_self_reports, on=id_cols)
    if df_merged.empty:
        raise ValueError(
            "No rows left after merging features with self-reports on "
            f"{id_cols}; check that both files use the same identifiers."
        )

    y_resilience = df_merged["Emotional Resilience"]
    y_cognitive_load = df_merged["Cognitive Load"]
    # A fresh scaler: the component scores of the merged subset are re-standardized.
    X_scaled = StandardScaler().fit_transform(df_merged[pc_cols])

    return X_scaled, y_resilience, y_cognitive_load

# Function for standard regression and evaluation
def train_and_evaluate(X_scaled, y_resilience, y_cognitive_load):
    """Fit one linear regression per target and score it on a held-out split.

    Each target is split together with ``X_scaled`` using ``test_size=0.2`` and
    ``random_state=42``; a ``LinearRegression`` is fitted on the training part
    and evaluated on the test part.

    Returns:
        Tuple ``(mse_resilience, r2_resilience, mse_cognitive, r2_cognitive)``.
    """

    def _holdout_scores(target):
        # Split, fit and score a single target; returns (mse, r2).
        features_fit, features_eval, target_fit, target_eval = train_test_split(
            X_scaled, target, test_size=0.2, random_state=42
        )
        model = LinearRegression()
        model.fit(features_fit, target_fit)
        predictions = model.predict(features_eval)
        return (
            mean_squared_error(target_eval, predictions),
            r2_score(target_eval, predictions),
        )

    resilience_scores = _holdout_scores(y_resilience)
    cognitive_scores = _holdout_scores(y_cognitive_load)
    return (*resilience_scores, *cognitive_scores)

# Bootstrapping function to estimate confidence intervals
def bootstrap_regression(X, y, n_iterations=1000, test_size=0.2, random_state=42):
    """Bootstrap the held-out MSE of a linear regression.

    In every round the rows are resampled with replacement (same size as the
    input), the resample is split into train/test parts, a ``LinearRegression``
    is fitted on the train part and its MSE on the test part is recorded.

    Args:
        X: Feature rows; any array-like accepted by ``numpy.asarray``
            (ndarray, DataFrame, nested list).
        y: Targets, same length as ``X``; ndarray, Series or list.
        n_iterations: Number of bootstrap rounds (at least 1).
        test_size: Forwarded to ``train_test_split`` for the per-round split.
        random_state: Seed of the ``numpy.random.RandomState`` that drives both
            the resampling and the per-round split seeds.

    Returns:
        ``(mse_mean, (ci_lower, ci_upper))`` where the bounds are the 2.5th and
        97.5th percentiles of the per-round MSE values.

    Raises:
        ValueError: If ``n_iterations`` is below 1 or ``X`` and ``y`` differ in
            length.

    NOTE(review): rows are drawn with replacement *before* the split, so copies
    of one original row can end up on both the train and the test side. That
    likely biases the reported MSE downwards; consider scoring on out-of-bag
    rows instead.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Plain arrays make the positional indexing below work for every input
    # type (the original required an ndarray X and a pandas y).
    X_values = np.asarray(X)
    y_values = np.asarray(y)
    n_samples = len(X_values)
    if len(y_values) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y_values)}"
        )

    rng = np.random.RandomState(random_state)
    mse_scores = []

    for _ in range(n_iterations):
        # Draw order (choice, then randint) is kept so seeded results stay reproducible.
        indices = rng.choice(n_samples, n_samples, replace=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X_values[indices],
            y_values[indices],
            test_size=test_size,
            random_state=rng.randint(10000),
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        mse_scores.append(mean_squared_error(y_test, model.predict(X_test)))

    mse_mean = np.mean(mse_scores)
    ci_lower, ci_upper = np.percentile(mse_scores, [2.5, 97.5])
    return mse_mean, (ci_lower, ci_upper)

# ------------------------------
# English Dataset
# ------------------------------
X_scaled_eng, y_res_eng, y_cog_eng = preprocess_data(
    feature_file="C:/Users/vyache/Downloads/Features_English.xlsx",
    report_file="C:/Users/vyache/Desktop/Questionnaire data_Eng.xlsx",
)

# The bootstrap runs and the single hold-out fit do not depend on each other.
mse_res_boot_eng, ci_res_eng = bootstrap_regression(X=X_scaled_eng, y=y_res_eng)
mse_cog_boot_eng, ci_cog_eng = bootstrap_regression(X=X_scaled_eng, y=y_cog_eng)
mse_res_eng, r2_res_eng, mse_cog_eng, r2_cog_eng = train_and_evaluate(
    X_scaled_eng, y_res_eng, y_cog_eng
)

# One print call, one report line per argument.
print(
    "\n--- English Dataset Results ---",
    f"Resilience Regression MSE: {mse_res_eng:.2f}, R²: {r2_res_eng:.2f}",
    f"Cognitive Load MSE: {mse_cog_eng:.2f}, R²: {r2_cog_eng:.2f}",
    f"Bootstrapped Resilience MSE: {mse_res_boot_eng:.2f} | 95% CI: [{ci_res_eng[0]:.2f}, {ci_res_eng[1]:.2f}]",
    f"Bootstrapped Cognitive Load MSE: {mse_cog_boot_eng:.2f} | 95% CI: [{ci_cog_eng[0]:.2f}, {ci_cog_eng[1]:.2f}]",
    sep="\n",
)

# ------------------------------
# German Dataset
# ------------------------------
X_scaled_ger, y_res_ger, y_cog_ger = preprocess_data(
    feature_file="C:/Users/vyache/Downloads/Features_German.xlsx",
    report_file="C:/Users/vyache/Desktop/Questionnaire data_Ger.xlsx",
)

mse_res_boot_ger, ci_res_ger = bootstrap_regression(X=X_scaled_ger, y=y_res_ger)
mse_cog_boot_ger, ci_cog_ger = bootstrap_regression(X=X_scaled_ger, y=y_cog_ger)
mse_res_ger, r2_res_ger, mse_cog_ger, r2_cog_ger = train_and_evaluate(
    X_scaled_ger, y_res_ger, y_cog_ger
)

print(
    "\n--- German Dataset Results ---",
    f"Resilience Regression MSE: {mse_res_ger:.2f}, R²: {r2_res_ger:.2f}",
    f"Cognitive Load MSE: {mse_cog_ger:.2f}, R²: {r2_cog_ger:.2f}",
    f"Bootstrapped Resilience MSE: {mse_res_boot_ger:.2f} | 95% CI: [{ci_res_ger[0]:.2f}, {ci_res_ger[1]:.2f}]",
    f"Bootstrapped Cognitive Load MSE: {mse_cog_boot_ger:.2f} | 95% CI: [{ci_cog_ger[0]:.2f}, {ci_cog_ger[1]:.2f}]",
    sep="\n",
)




--- English Dataset Results ---
Resilience Regression MSE: 35.58, R²: 0.01
Cognitive Load MSE: 26.01, R²: 0.01
Bootstrapped Resilience MSE: 41.76 | 95% CI: [17.63, 92.90]
Bootstrapped Cognitive Load MSE: 36.75 | 95% CI: [15.16, 74.23]

--- German Dataset Results ---
Resilience Regression MSE: 27.15, R²: -0.28
Cognitive Load MSE: 28.53, R²: -0.19
Bootstrapped Resilience MSE: 27.39 | 95% CI: [10.54, 51.68]
Bootstrapped Cognitive Load MSE: 23.74 | 95% CI: [6.85, 47.11]


In [11]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Function to load and preprocess the dataset (English or German)
def preprocess_data(feature_file, report_file, n_components=3, return_pca_info=False):
    """Load features and self-reports, reduce the features with PCA, and build model inputs.

    Every column of the feature sheet except ``Recording`` and ``Speaker`` is
    standardized and projected onto principal components. The component scores
    are merged with the self-report sheet on ``["Recording", "Speaker"]``
    (``pandas.merge`` default, i.e. an inner join) and standardized once more.

    Args:
        feature_file: Excel file readable by ``pandas.read_excel`` holding the
            ``Recording``/``Speaker`` identifiers plus the feature columns.
        report_file: Excel file readable by ``pandas.read_excel`` holding the
            identifiers plus ``Emotional Resilience`` and ``Cognitive Load``.
        n_components: Forwarded to ``sklearn.decomposition.PCA``. All components
            PCA produces are used as predictors (previously only PC1-PC3 were
            selected regardless of this value).
        return_pca_info: When true, also return the explained-variance ratios
            and the component weights.

    Returns:
        ``(X_scaled, y_resilience, y_cognitive_load)``, extended by
        ``(explained_variance, loadings)`` when ``return_pca_info`` is true.
        ``explained_variance`` is ``pca.explained_variance_ratio_`` and
        ``loadings`` is ``pca.components_`` transposed into a DataFrame indexed
        by feature name with one column per component.

    Raises:
        ValueError: If the merge leaves no rows, i.e. the two sheets share no
            (Recording, Speaker) pair.

    Note:
        Scaling and PCA are fitted on all rows here, before any train/test
        split done by the caller, so held-out rows influence the preprocessing.
    """
    id_cols = ["Recording", "Speaker"]
    df_features = pd.read_excel(feature_file)
    df_self_reports = pd.read_excel(report_file)

    feature_cols = df_features.columns.difference(id_cols)
    df_scaled = StandardScaler().fit_transform(df_features[feature_cols])

    # Apply PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)
    # Name the columns after what PCA actually returned so every later use
    # matches n_components instead of a fixed PC1..PC3 list.
    pc_cols = [f"PC{i + 1}" for i in range(principal_components.shape[1])]
    df_pca = pd.DataFrame(principal_components, columns=pc_cols)

    # Merge with IDs and self-reports
    df_pca["Recording"] = df_features["Recording"]
    df_pca["Speaker"] = df_features["Speaker"]
    df_merged = pd.merge(df_pca, df_self_reports, on=id_cols)
    if df_merged.empty:
        raise ValueError(
            "No rows left after merging features with self-reports on "
            f"{id_cols}; check that both files use the same identifiers."
        )

    y_resilience = df_merged["Emotional Resilience"]
    y_cognitive_load = df_merged["Cognitive Load"]
    # A fresh scaler: the component scores of the merged subset are re-standardized.
    X_scaled = StandardScaler().fit_transform(df_merged[pc_cols])

    if return_pca_info:
        explained_variance = pca.explained_variance_ratio_
        # Rows follow feature_cols, the column order the PCA was fitted on.
        loadings = pd.DataFrame(pca.components_.T, columns=pc_cols, index=feature_cols)
        return X_scaled, y_resilience, y_cognitive_load, explained_variance, loadings

    return X_scaled, y_resilience, y_cognitive_load

# Function for standard regression and evaluation
def train_and_evaluate(X_scaled, y_resilience, y_cognitive_load):
    """Fit one linear regression per target and score it on a held-out split.

    Each target is split together with ``X_scaled`` using ``test_size=0.2`` and
    ``random_state=42``; a ``LinearRegression`` is fitted on the training part
    and evaluated on the test part.

    Returns:
        Tuple ``(mse_resilience, r2_resilience, mse_cognitive, r2_cognitive)``.
    """

    def _holdout_scores(target):
        # Split, fit and score a single target; returns (mse, r2).
        features_fit, features_eval, target_fit, target_eval = train_test_split(
            X_scaled, target, test_size=0.2, random_state=42
        )
        model = LinearRegression()
        model.fit(features_fit, target_fit)
        predictions = model.predict(features_eval)
        return (
            mean_squared_error(target_eval, predictions),
            r2_score(target_eval, predictions),
        )

    resilience_scores = _holdout_scores(y_resilience)
    cognitive_scores = _holdout_scores(y_cognitive_load)
    return (*resilience_scores, *cognitive_scores)

# Bootstrapping function to estimate confidence intervals
def bootstrap_regression(X, y, n_iterations=1000, test_size=0.2, random_state=42):
    """Bootstrap the held-out MSE of a linear regression.

    In every round the rows are resampled with replacement (same size as the
    input), the resample is split into train/test parts, a ``LinearRegression``
    is fitted on the train part and its MSE on the test part is recorded.

    Args:
        X: Feature rows; any array-like accepted by ``numpy.asarray``
            (ndarray, DataFrame, nested list).
        y: Targets, same length as ``X``; ndarray, Series or list.
        n_iterations: Number of bootstrap rounds (at least 1).
        test_size: Forwarded to ``train_test_split`` for the per-round split.
        random_state: Seed of the ``numpy.random.RandomState`` that drives both
            the resampling and the per-round split seeds.

    Returns:
        ``(mse_mean, (ci_lower, ci_upper))`` where the bounds are the 2.5th and
        97.5th percentiles of the per-round MSE values.

    Raises:
        ValueError: If ``n_iterations`` is below 1 or ``X`` and ``y`` differ in
            length.

    NOTE(review): rows are drawn with replacement *before* the split, so copies
    of one original row can end up on both the train and the test side. That
    likely biases the reported MSE downwards; consider scoring on out-of-bag
    rows instead.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Plain arrays make the positional indexing below work for every input
    # type (the original required an ndarray X and a pandas y).
    X_values = np.asarray(X)
    y_values = np.asarray(y)
    n_samples = len(X_values)
    if len(y_values) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y_values)}"
        )

    rng = np.random.RandomState(random_state)
    mse_scores = []

    for _ in range(n_iterations):
        # Draw order (choice, then randint) is kept so seeded results stay reproducible.
        indices = rng.choice(n_samples, n_samples, replace=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X_values[indices],
            y_values[indices],
            test_size=test_size,
            random_state=rng.randint(10000),
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        mse_scores.append(mean_squared_error(y_test, model.predict(X_test)))

    mse_mean = np.mean(mse_scores)
    ci_lower, ci_upper = np.percentile(mse_scores, [2.5, 97.5])
    return mse_mean, (ci_lower, ci_upper)

# ------------------------------
# English Dataset
# ------------------------------
X_scaled_eng, y_res_eng, y_cog_eng, explained_var_eng, loadings_eng = preprocess_data(
    feature_file="C:/Users/vyache/Downloads/Features_English.xlsx",
    report_file="C:/Users/vyache/Desktop/Questionnaire data_Eng.xlsx",
    return_pca_info=True,
)

# The bootstrap runs and the single hold-out fit do not depend on each other.
mse_res_boot_eng, ci_res_eng = bootstrap_regression(X=X_scaled_eng, y=y_res_eng)
mse_cog_boot_eng, ci_cog_eng = bootstrap_regression(X=X_scaled_eng, y=y_cog_eng)
mse_res_eng, r2_res_eng, mse_cog_eng, r2_cog_eng = train_and_evaluate(
    X_scaled_eng, y_res_eng, y_cog_eng
)

# One print call, one report line per argument.
print(
    "\n--- English Dataset Results ---",
    f"Resilience Regression MSE: {mse_res_eng:.2f}, R²: {r2_res_eng:.2f}",
    f"Cognitive Load MSE: {mse_cog_eng:.2f}, R²: {r2_cog_eng:.2f}",
    f"Bootstrapped Resilience MSE: {mse_res_boot_eng:.2f} | 95% CI: [{ci_res_eng[0]:.2f}, {ci_res_eng[1]:.2f}]",
    f"Bootstrapped Cognitive Load MSE: {mse_cog_boot_eng:.2f} | 95% CI: [{ci_cog_eng[0]:.2f}, {ci_cog_eng[1]:.2f}]",
    sep="\n",
)

print("\n--- PCA Explained Variance (English) ---")
for component_number, variance_ratio in enumerate(explained_var_eng, start=1):
    print(f"PC{component_number}: {variance_ratio:.2%}")

print("\n--- Top 5 Loadings for PC1 (English) ---")
# Ranked by magnitude: the printed values are absolute, so signs are not shown.
print(
    loadings_eng["PC1"]
    .abs()
    .sort_values(ascending=False)
    .head(5)
)

# -------------------



--- English Dataset Results ---
Resilience Regression MSE: 35.58, R²: 0.01
Cognitive Load MSE: 26.01, R²: 0.01
Bootstrapped Resilience MSE: 41.76 | 95% CI: [17.63, 92.90]
Bootstrapped Cognitive Load MSE: 36.75 | 95% CI: [15.16, 74.23]

--- PCA Explained Variance (English) ---
PC1: 33.93%
PC2: 18.16%
PC3: 13.65%

--- Top 5 Loadings for PC1 (English) ---
pcm_loudness_sma_amean_mean    0.482997
F0_sma_de_skewness_mean        0.440076
pcm_zcr_sma_amean_mean         0.403097
voiceProb_sma_amean_mean       0.392124
F0_sma_de_amean_mean           0.370373
Name: PC1, dtype: float64


In [13]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Function to load and preprocess the dataset (English or German)
def preprocess_data(feature_file, report_file, n_components=3, return_pca_info=False):
    """Load features and self-reports, reduce the features with PCA, and build model inputs.

    Every column of the feature sheet except ``Recording`` and ``Speaker`` is
    standardized and projected onto principal components. The component scores
    are merged with the self-report sheet on ``["Recording", "Speaker"]``
    (``pandas.merge`` default, i.e. an inner join) and standardized once more.

    Args:
        feature_file: Excel file readable by ``pandas.read_excel`` holding the
            ``Recording``/``Speaker`` identifiers plus the feature columns.
        report_file: Excel file readable by ``pandas.read_excel`` holding the
            identifiers plus ``Emotional Resilience`` and ``Cognitive Load``.
        n_components: Forwarded to ``sklearn.decomposition.PCA``. All components
            PCA produces are used as predictors (previously only PC1-PC3 were
            selected regardless of this value).
        return_pca_info: When true, also return the explained-variance ratios
            and the component weights.

    Returns:
        ``(X_scaled, y_resilience, y_cognitive_load)``, extended by
        ``(explained_variance, loadings)`` when ``return_pca_info`` is true.
        ``explained_variance`` is ``pca.explained_variance_ratio_`` and
        ``loadings`` is ``pca.components_`` transposed into a DataFrame indexed
        by feature name with one column per component.

    Raises:
        ValueError: If the merge leaves no rows, i.e. the two sheets share no
            (Recording, Speaker) pair.

    Note:
        Scaling and PCA are fitted on all rows here, before any train/test
        split done by the caller, so held-out rows influence the preprocessing.
    """
    id_cols = ["Recording", "Speaker"]
    df_features = pd.read_excel(feature_file)
    df_self_reports = pd.read_excel(report_file)

    feature_cols = df_features.columns.difference(id_cols)
    df_scaled = StandardScaler().fit_transform(df_features[feature_cols])

    # Apply PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)
    # Name the columns after what PCA actually returned so every later use
    # matches n_components instead of a fixed PC1..PC3 list.
    pc_cols = [f"PC{i + 1}" for i in range(principal_components.shape[1])]
    df_pca = pd.DataFrame(principal_components, columns=pc_cols)

    # Merge with IDs and self-reports
    df_pca["Recording"] = df_features["Recording"]
    df_pca["Speaker"] = df_features["Speaker"]
    df_merged = pd.merge(df_pca, df_self_reports, on=id_cols)
    if df_merged.empty:
        raise ValueError(
            "No rows left after merging features with self-reports on "
            f"{id_cols}; check that both files use the same identifiers."
        )

    y_resilience = df_merged["Emotional Resilience"]
    y_cognitive_load = df_merged["Cognitive Load"]
    # A fresh scaler: the component scores of the merged subset are re-standardized.
    X_scaled = StandardScaler().fit_transform(df_merged[pc_cols])

    if return_pca_info:
        explained_variance = pca.explained_variance_ratio_
        # Rows follow feature_cols, the column order the PCA was fitted on.
        loadings = pd.DataFrame(pca.components_.T, columns=pc_cols, index=feature_cols)
        return X_scaled, y_resilience, y_cognitive_load, explained_variance, loadings

    return X_scaled, y_resilience, y_cognitive_load

# Function for standard regression and evaluation
def train_and_evaluate(X_scaled, y_resilience, y_cognitive_load):
    """Fit one linear regression per target and score it on a held-out split.

    Each target is split together with ``X_scaled`` using ``test_size=0.2`` and
    ``random_state=42``; a ``LinearRegression`` is fitted on the training part
    and evaluated on the test part.

    Returns:
        Tuple ``(mse_resilience, r2_resilience, mse_cognitive, r2_cognitive)``.
    """

    def _holdout_scores(target):
        # Split, fit and score a single target; returns (mse, r2).
        features_fit, features_eval, target_fit, target_eval = train_test_split(
            X_scaled, target, test_size=0.2, random_state=42
        )
        model = LinearRegression()
        model.fit(features_fit, target_fit)
        predictions = model.predict(features_eval)
        return (
            mean_squared_error(target_eval, predictions),
            r2_score(target_eval, predictions),
        )

    resilience_scores = _holdout_scores(y_resilience)
    cognitive_scores = _holdout_scores(y_cognitive_load)
    return (*resilience_scores, *cognitive_scores)

# Bootstrapping function to estimate confidence intervals
def bootstrap_regression(X, y, n_iterations=1000, test_size=0.2, random_state=42):
    """Bootstrap the held-out MSE of a linear regression.

    In every round the rows are resampled with replacement (same size as the
    input), the resample is split into train/test parts, a ``LinearRegression``
    is fitted on the train part and its MSE on the test part is recorded.

    Args:
        X: Feature rows; any array-like accepted by ``numpy.asarray``
            (ndarray, DataFrame, nested list).
        y: Targets, same length as ``X``; ndarray, Series or list.
        n_iterations: Number of bootstrap rounds (at least 1).
        test_size: Forwarded to ``train_test_split`` for the per-round split.
        random_state: Seed of the ``numpy.random.RandomState`` that drives both
            the resampling and the per-round split seeds.

    Returns:
        ``(mse_mean, (ci_lower, ci_upper))`` where the bounds are the 2.5th and
        97.5th percentiles of the per-round MSE values.

    Raises:
        ValueError: If ``n_iterations`` is below 1 or ``X`` and ``y`` differ in
            length.

    NOTE(review): rows are drawn with replacement *before* the split, so copies
    of one original row can end up on both the train and the test side. That
    likely biases the reported MSE downwards; consider scoring on out-of-bag
    rows instead.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Plain arrays make the positional indexing below work for every input
    # type (the original required an ndarray X and a pandas y).
    X_values = np.asarray(X)
    y_values = np.asarray(y)
    n_samples = len(X_values)
    if len(y_values) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y_values)}"
        )

    rng = np.random.RandomState(random_state)
    mse_scores = []

    for _ in range(n_iterations):
        # Draw order (choice, then randint) is kept so seeded results stay reproducible.
        indices = rng.choice(n_samples, n_samples, replace=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X_values[indices],
            y_values[indices],
            test_size=test_size,
            random_state=rng.randint(10000),
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        mse_scores.append(mean_squared_error(y_test, model.predict(X_test)))

    mse_mean = np.mean(mse_scores)
    ci_lower, ci_upper = np.percentile(mse_scores, [2.5, 97.5])
    return mse_mean, (ci_lower, ci_upper)

# ------------------------------
# German Dataset
# ------------------------------
# This cell reads the German feature/questionnaire files, so its results are
# kept in *_ger names and labelled "German". The copy-pasted *_eng names and
# "English" headers overwrote the English results and mislabelled the output.
X_scaled_ger, y_res_ger, y_cog_ger, explained_var_ger, loadings_ger = preprocess_data(
    "C:/Users/vyache/Downloads/Features_German.xlsx",
    "C:/Users/vyache/Desktop/Questionnaire data_Ger.xlsx",
    return_pca_info=True
)

mse_res_ger, r2_res_ger, mse_cog_ger, r2_cog_ger = train_and_evaluate(X_scaled_ger, y_res_ger, y_cog_ger)
mse_res_boot_ger, ci_res_ger = bootstrap_regression(X_scaled_ger, y_res_ger)
mse_cog_boot_ger, ci_cog_ger = bootstrap_regression(X_scaled_ger, y_cog_ger)

print("\n--- German Dataset Results ---")
print(f"Resilience Regression MSE: {mse_res_ger:.2f}, R²: {r2_res_ger:.2f}")
print(f"Cognitive Load MSE: {mse_cog_ger:.2f}, R²: {r2_cog_ger:.2f}")
print(f"Bootstrapped Resilience MSE: {mse_res_boot_ger:.2f} | 95% CI: [{ci_res_ger[0]:.2f}, {ci_res_ger[1]:.2f}]")
print(f"Bootstrapped Cognitive Load MSE: {mse_cog_boot_ger:.2f} | 95% CI: [{ci_cog_ger[0]:.2f}, {ci_cog_ger[1]:.2f}]")

print("\n--- PCA Explained Variance (German) ---")
for i, var in enumerate(explained_var_ger):
    print(f"PC{i+1}: {var:.2%}")

print("\n--- Top 5 Loadings for PC1 (German) ---")
# Ranked by magnitude: the printed values are absolute, so signs are not shown.
print(loadings_ger["PC1"].abs().sort_values(ascending=False).head(5))

# -------------------



--- English Dataset Results ---
Resilience Regression MSE: 27.15, R²: -0.28
Cognitive Load MSE: 28.53, R²: -0.19
Bootstrapped Resilience MSE: 27.39 | 95% CI: [10.54, 51.68]
Bootstrapped Cognitive Load MSE: 23.74 | 95% CI: [6.85, 47.11]

--- PCA Explained Variance (English) ---
PC1: 35.67%
PC2: 25.57%
PC3: 11.69%

--- Top 5 Loadings for PC1 (English) ---
mfcc_sma_de[1]_skewness_mean       0.443730
F0_sma_de_skewness_mean            0.434749
mfcc_sma_de[2]_skewness_mean       0.404697
pcm_intensity_sma_de_amean_mean    0.368075
F0_sma_de_amean_mean               0.318037
Name: PC1, dtype: float64
