In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Inputs for the feature-reduction stage, kept in one place so they are easy to change.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
FEATURES_PATH = "C:/Users/vyache/Downloads/Features_English.xlsx"
ID_COLS = ["Recording", "Speaker"]  # identifier columns, never fed to the scaler/PCA
N_COMPONENTS = 3  # Adjust number of components as needed

# Load the feature dataset
df_eng = pd.read_excel(FEATURES_PATH)

# Every column except the identifiers is treated as a feature.
feature_cols = df_eng.columns.difference(ID_COLS)

# Standardize the features before applying PCA
# NOTE(review): scaler and PCA are fitted on all rows, i.e. before the later
# train/test split, so test rows influence the projection.
scaler = StandardScaler()
df_eng_scaled = scaler.fit_transform(df_eng[feature_cols])

# Apply PCA
pca = PCA(n_components=N_COMPONENTS)
principal_components = pca.fit_transform(df_eng_scaled)

# Convert PCA output to a DataFrame. Column names are derived from the actual
# number of components returned, and the frame reuses df_eng's index so the
# identifier columns below align row-for-row even if df_eng is ever filtered
# or re-indexed.
df_pca = pd.DataFrame(
    principal_components,
    columns=[f"PC{i + 1}" for i in range(principal_components.shape[1])],
    index=df_eng.index,
)

# Add back identifiers (Recording, Speaker) for merging
df_pca["Recording"] = df_eng["Recording"]
df_pca["Speaker"] = df_eng["Speaker"]

# Read the self-report (questionnaire) workbook.
df_self_reports = pd.read_excel("C:/Users/vyache/Desktop/Questionnaire data_Eng.xlsx")

# Attach the self-report columns to the principal components. This is an inner
# join on (Recording, Speaker): rows without a match in both tables are dropped.
df_merged = df_pca.merge(df_self_reports, how="inner", on=["Recording", "Speaker"])

# Cut points at the 33rd and 66th percentiles of each self-report scale.
# They are computed over every row of the questionnaire table, not only the
# rows that survived the merge.
tercile_cuts = [0.33, 0.66]

low_threshold_resilience, high_threshold_resilience = (
    df_self_reports["Emotional Resilience"].quantile(tercile_cuts).tolist()
)

low_threshold_load, high_threshold_load = (
    df_self_reports["Cognitive Load"].quantile(tercile_cuts).tolist()
)

# Categorization functions
def categorize_resilience(score):
    """Map an emotional-resilience score to an ordinal class label.

    The cut points are read from the module-level names
    ``low_threshold_resilience`` and ``high_threshold_resilience``.

    Returns:
        0 (Low) when ``score`` is below the low threshold,
        1 (Medium) when it is below the high threshold,
        2 (High) otherwise. A score that compares false against both
        thresholds (for example NaN) therefore also yields 2.
    """
    if score < low_threshold_resilience:
        return 0  # Low
    if score < high_threshold_resilience:
        return 1  # Medium
    return 2  # High

def categorize_cognitive_load(score):
    """Map a cognitive-load score to an ordinal class label.

    The cut points are read from the module-level names
    ``low_threshold_load`` and ``high_threshold_load``.

    Returns:
        0 (Low) below the low threshold, 1 (Medium) below the high threshold,
        and 2 (High) for everything else, including values such as NaN that
        compare false against both thresholds.
    """
    return (
        0 if score < low_threshold_load          # Low
        else 1 if score < high_threshold_load    # Medium
        else 2                                   # High
    )

# Derive the ordinal class labels (0 = Low, 1 = Medium, 2 = High) from the raw
# self-report scores: (new label column, source score column, categorizer).
for _label_col, _score_col, _categorize in (
    ("resilience_category", "Emotional Resilience", categorize_resilience),
    ("cognitive_load_category", "Cognitive Load", categorize_cognitive_load),
):
    df_merged[_label_col] = df_merged[_score_col].apply(_categorize)

# Sanity check: show how many rows landed in each class.
for _label_col in ("resilience_category", "cognitive_load_category"):
    print(df_merged[_label_col].value_counts())

# Select PCA features and the two classification targets
X = df_merged[["PC1", "PC2", "PC3"]]
y_resilience = df_merged["resilience_category"]
y_cognitive_load = df_merged["cognitive_load_category"]

# Standardize PCA features with a dedicated scaler. Re-fitting the shared
# `scaler` here would overwrite the statistics it learned from the raw feature
# columns, leaving it unable to transform raw features consistently with `pca`.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
pca_scaler = StandardScaler()
X_scaled = pca_scaler.fit_transform(X)

def fit_and_report_gbm(features, labels, target_name, test_size=0.3, random_state=42, **gbm_params):
    """Split the data, fit a gradient-boosting classifier and print its test metrics.

    Args:
        features: Feature matrix passed to ``train_test_split``.
        labels: Class labels aligned row-for-row with ``features``.
        target_name: Human-readable target name used in the printed headings.
        test_size: Fraction of rows held out for testing.
        random_state: Seed used for both the split and the classifier.
        **gbm_params: Extra keyword arguments forwarded to
            ``GradientBoostingClassifier``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, model, y_pred, accuracy)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    model = GradientBoostingClassifier(random_state=random_state, **gbm_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"{target_name} Classification Report:")
    # zero_division=0 reports 0.0 for a class that is never predicted (the same
    # number the default shows) without emitting UndefinedMetricWarning.
    print(classification_report(y_test, y_pred, zero_division=0))

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{target_name} Accuracy: {accuracy}")

    return X_train, X_test, y_train, y_test, model, y_pred, accuracy


# Resilience classification: split, fit, predict and report.
(
    X_train_resilience,
    X_test_resilience,
    y_train_resilience,
    y_test_resilience,
    gbm_resilience,
    y_pred_resilience,
    accuracy_resilience,
) = fit_and_report_gbm(X_scaled, y_resilience, "Resilience")

# Blank line between the two reports.
print()

# Cognitive Load classification: same procedure on the second target.
(
    X_train_load,
    X_test_load,
    y_train_load,
    y_test_load,
    gbm_load,
    y_pred_load,
    accuracy_load,
) = fit_and_report_gbm(X_scaled, y_cognitive_load, "Cognitive Load")


resilience_category
2    25
1    21
0    20
Name: count, dtype: int64
cognitive_load_category
2    26
0    22
1    18
Name: count, dtype: int64
Resilience Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.50      0.55         6
           1       0.00      0.00      0.00         7
           2       0.27      0.43      0.33         7

    accuracy                           0.30        20
   macro avg       0.29      0.31      0.29        20
weighted avg       0.28      0.30      0.28        20

Resilience Accuracy: 0.3

Cognitive Load Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.40      0.33         5
           1       0.29      0.29      0.29         7
           2       0.17      0.12      0.14         8

    accuracy                           0.25        20
   macro avg       0.25      0.27      0.25        20
weighted avg       0.24      0.25      0.24        

In [3]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# Exhaustive search with 5-fold cross-validation on the resilience training split.
gbm = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5).fit(
    X_train_resilience, y_train_resilience
)

print(f"Best parameters for Resilience classification: {grid_search.best_params_}")


Best parameters for Resilience classification: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Rebuild the 70/30 resilience split with a fixed seed so the partition is reproducible.
(
    X_train_resilience,
    X_test_resilience,
    y_train_resilience,
    y_test_resilience,
) = train_test_split(X_scaled, y_resilience, test_size=0.3, random_state=42)

# Hand-picked settings, deliberately more conservative than the grid-search
# result printed above (max_depth=5, n_estimators=100): shallower trees and
# fewer boosting stages. Even smaller depths such as 2 are worth trying.
# Note that no class-imbalance handling is configured on this estimator.
gbm_resilience = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_resilience, y_train_resilience)

# Predict the held-out rows.
y_pred_resilience = gbm_resilience.predict(X_test_resilience)

# Per-class precision/recall/F1 on the test set.
print("Resilience Classification Report:")
print(classification_report(y_test_resilience, y_pred_resilience))

# Overall test accuracy.
accuracy_resilience = accuracy_score(y_test_resilience, y_pred_resilience)
print(f"Resilience Accuracy: {accuracy_resilience}")


Resilience Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.33      0.36         6
           1       0.20      0.14      0.17         7
           2       0.40      0.57      0.47         7

    accuracy                           0.35        20
   macro avg       0.33      0.35      0.33        20
weighted avg       0.33      0.35      0.33        20

Resilience Accuracy: 0.35
