In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Function to load and preprocess the dataset (English or German)
def preprocess_data(feature_file, report_file, n_components=3):
    """Build PCA-based predictors and the two regression targets for one dataset.

    The feature workbook is standardized and reduced with PCA, the resulting
    components are joined to the self-report workbook on the
    ``Recording``/``Speaker`` pair, and the component columns of the joined
    rows are standardized once more before being returned.

    Args:
        feature_file: Excel workbook (anything ``pd.read_excel`` accepts)
            containing ``Recording`` and ``Speaker`` columns; every other
            column is treated as a feature.
        report_file: Excel workbook containing ``Recording``, ``Speaker``,
            ``Emotional Resilience`` and ``Cognitive Load`` columns.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A tuple ``(X_scaled, y_resilience, y_cognitive_load)`` where
        ``X_scaled`` is the standardized principal-component matrix of the
        merged rows and the two ``y_*`` values are the corresponding
        ``Emotional Resilience`` and ``Cognitive Load`` columns.

    Raises:
        ValueError: If no ``Recording``/``Speaker`` pair is shared by the two
            workbooks, so there is nothing to model.
    """
    # Load the feature dataset and the self-report dataset
    df_features = pd.read_excel(feature_file)
    df_self_reports = pd.read_excel(report_file)

    # Everything except the identifier columns is treated as a feature
    id_cols = ["Recording", "Speaker"]
    feature_cols = df_features.columns.difference(id_cols)

    # Standardize the features before applying PCA.
    # NOTE(review): the scaler and PCA are fit on every row, i.e. before any
    # train/test split performed by the caller, so held-out rows influence the
    # transformation. Fixing that needs a pipeline fit on the training fold
    # only, which would change this function's interface.
    df_scaled = StandardScaler().fit_transform(df_features[feature_cols])

    # Apply PCA to reduce the dimensionality
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)

    # Name the columns after the components PCA actually produced, so the
    # names stay correct for any n_components value PCA accepts.
    pc_cols = [f"PC{i + 1}" for i in range(principal_components.shape[1])]
    df_pca = pd.DataFrame(
        principal_components, columns=pc_cols, index=df_features.index
    )

    # Add back identifiers (Recording, Speaker) for merging later
    df_pca["Recording"] = df_features["Recording"]
    df_pca["Speaker"] = df_features["Speaker"]

    # Merge PCA features with the self-report dataset on the identifiers
    df_merged = pd.merge(df_pca, df_self_reports, on=id_cols)
    if df_merged.empty:
        raise ValueError(
            "No rows share the same 'Recording' and 'Speaker' values in "
            f"{feature_file!r} and {report_file!r}."
        )

    # Define target variables
    y_resilience = df_merged["Emotional Resilience"]
    y_cognitive_load = df_merged["Cognitive Load"]

    # Use every principal component as a predictor (previously hard-coded to
    # PC1..PC3, which ignored the n_components argument).
    X = df_merged[pc_cols]

    # Standardize PCA features with a dedicated scaler (for consistent scaling)
    X_scaled = StandardScaler().fit_transform(X)

    return X_scaled, y_resilience, y_cognitive_load

# Function to train and evaluate regression models (Resilience and Cognitive Load)
def train_and_evaluate(X_scaled, y_resilience, y_cognitive_load):
    """Fit one linear regression per target and score it on a hold-out split.

    Each target is split 80/20 with ``random_state=42``, a
    ``LinearRegression`` is fit on the training part, and the test part is
    scored with mean squared error and R-squared.

    Args:
        X_scaled: Predictor matrix shared by both models.
        y_resilience: Target values for the resilience model.
        y_cognitive_load: Target values for the cognitive-load model.

    Returns:
        ``(mse_resilience, r2_resilience, mse_cognitive_load,
        r2_cognitive_load)``.
    """

    def _holdout_scores(target):
        # Split, fit and score a single target; the fixed seed makes the
        # partition reproducible from call to call.
        features_train, features_test, target_train, target_test = train_test_split(
            X_scaled, target, test_size=0.2, random_state=42
        )
        model = LinearRegression().fit(features_train, target_train)
        predicted = model.predict(features_test)
        return (
            mean_squared_error(target_test, predicted),
            r2_score(target_test, predicted),
        )

    # --- Resilience Regression ---
    mse_resilience, r2_resilience = _holdout_scores(y_resilience)

    # --- Cognitive Load Regression ---
    mse_cognitive_load, r2_cognitive_load = _holdout_scores(y_cognitive_load)

    # Return the evaluation results
    return mse_resilience, r2_resilience, mse_cognitive_load, r2_cognitive_load

# --- Process English Dataset ---
# Build the PCA predictors and both targets from the English workbooks.
X_scaled_eng, y_resilience_eng, y_cognitive_load_eng = preprocess_data(
    feature_file="C:/Users/vyache/Downloads/Features_English.xlsx",
    report_file="C:/Users/vyache/Desktop/Questionnaire data_Eng.xlsx",
)

# Fit and score both regressions on the English data.
(
    mse_resilience_eng,
    r2_resilience_eng,
    mse_cognitive_load_eng,
    r2_cognitive_load_eng,
) = train_and_evaluate(X_scaled_eng, y_resilience_eng, y_cognitive_load_eng)

# Print evaluation results for English dataset; the final "\n" entry leaves
# two blank lines before whatever is printed next.
print(
    "English Dataset Results:",
    f"Resilience Regression MSE: {mse_resilience_eng}",
    f"Resilience Regression R-squared: {r2_resilience_eng}",
    f"Cognitive Load Regression MSE: {mse_cognitive_load_eng}",
    f"Cognitive Load Regression R-squared: {r2_cognitive_load_eng}",
    "\n",
    sep="\n",
)

# --- Process German Dataset ---
# Build the PCA predictors and both targets from the German workbooks.
X_scaled_ger, y_resilience_ger, y_cognitive_load_ger = preprocess_data(
    feature_file="C:/Users/vyache/Downloads/Features_German.xlsx",
    report_file="C:/Users/vyache/Desktop/Questionnaire data_Ger.xlsx",
)

# Fit and score both regressions on the German data.
(
    mse_resilience_ger,
    r2_resilience_ger,
    mse_cognitive_load_ger,
    r2_cognitive_load_ger,
) = train_and_evaluate(X_scaled_ger, y_resilience_ger, y_cognitive_load_ger)

# Print evaluation results for German dataset
print(
    "German Dataset Results:",
    f"Resilience Regression MSE: {mse_resilience_ger}",
    f"Resilience Regression R-squared: {r2_resilience_ger}",
    f"Cognitive Load Regression MSE: {mse_cognitive_load_ger}",
    f"Cognitive Load Regression R-squared: {r2_cognitive_load_ger}",
    sep="\n",
)


English Dataset Results:
Resilience Regression MSE: 35.58286913018928
Resilience Regression R-squared: 0.008354564266017439
Cognitive Load Regression MSE: 26.010612165641465
Cognitive Load Regression R-squared: 0.011425250249034846


German Dataset Results:
Resilience Regression MSE: 27.146335296528306
Resilience Regression R-squared: -0.28220532842741286
Cognitive Load Regression MSE: 28.52858149430663
Cognitive Load Regression R-squared: -0.19339858231134177
