# Text sentiment emotion analysis

## Patient

In [None]:
#bond: questions 3,5,7,9
#goal: 1,4,8,11
#task: 2,6,10,12

In [1]:
import pandas as pd

# Load the patient-rated WAI scores together with the text features.
df = pd.read_csv('/content/emotiontext_incl_patWAI.csv', delimiter=',')

# Patient WAI item columns are named '1' .. '12'
columns_to_check = [str(item) for item in range(1, 13)]

# Convert the WAI item columns to numeric; unparsable entries become NaN
df[columns_to_check] = df[columns_to_check].apply(pd.to_numeric, errors='coerce')

# Flag the cells that hold no valid number after the conversion.
# (An element-wise isinstance(x, float) test cannot do this: the coerced
# values are NaN, and NaN is itself a float.)
non_float_mask = df[columns_to_check].isna()

# Rows with at least one invalid or missing WAI answer; kept for inspection.
# These are the rows that the dropna below discards.
rows_with_non_floats = non_float_mask.any(axis=1)

# Remove exact duplicate rows, then rows with an incomplete questionnaire
df = df.drop_duplicates()
df = df.dropna(subset=columns_to_check, how='any')

# Keep a single row per participant and WAI answer pattern
df = df.drop_duplicates(subset=['ppnr', *columns_to_check])

# Renumber the remaining rows 0..n-1
df.reset_index(drop=True, inplace=True)
# The WAI items are complete at this point, so this only fills gaps in the
# other columns (with 0).
df.fillna(0, inplace=True)

In [2]:
import pandas as pd


# Patient WAI subscales: item columns of `df` that belong to each subscale
bond_columns = ['3', '5', '7', '9']
goal_columns = ['1', '4', '8', '11']
task_columns = ['2', '6', '10', '12']

# One score per row and subscale: the mean over that subscale's items
bond_df, goal_df, task_df = (
    df[item_columns].mean(axis=1)
    for item_columns in (bond_columns, goal_columns, task_columns)
)



In [None]:
# Display the current DataFrame (author's note: 39 rows after cleaning).
df #39 rows

## RF

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

# Random-forest regression of each patient WAI item on the text features.
# `all_df` is an alias of the cleaned `df` (no copy is made), so the dtype
# conversion below is visible through both names.
all_df = df


# Convert numeric columns to float
numeric_columns = ['ppnr', 'session']
all_df[numeric_columns] = all_df[numeric_columns].astype(float)

# Group the rows by 'ppnr' and 'sessions' columns and calculate the mean for other columns
# df = all_df.groupby(['ppnr', 'session'], as_index=False).mean()
df = all_df

# Select features (predictor columns); the target is chosen per question below
features = df[['neutral', 'curiosity', 'sadness', 'admiration',
       'fear', 'disgust', 'amusement', 'confusion', 'approval', 'joy', 'love',
       'realization', 'desire', 'annoyance', 'disapproval', 'nervousness',
       'remorse', 'excitement', 'anger', 'disappointment', 'surprise',
       'caring', 'grief', 'embarrassment', 'gratitude', 'pride', 'optimism',
       'relief', 'nr_positive', 'nr_negative']]

# Handle missing values in the features. The predictor matrix is the same for
# every question, so the imputer is fitted once instead of once per iteration.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

for i in range(12):
    nr = str(i + 1)

    outcome = all_df[nr]
    print("WAI question:", nr)

    # Train the model. oob_score=True makes the forest keep, for every row,
    # the prediction of the trees that did not see that row during training.
    t = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                              oob_score=True, random_state=1)
    t.fit(features_imputed, outcome)

    # Genuine out-of-bag MSE. (1 - score(X, y) on the training data is
    # 1 - R^2 of the in-sample fit: neither out-of-bag nor an MSE.)
    oob_error = mean_squared_error(outcome, t.oob_prediction_)
    print(f"Out-of-Bag MSE for WAI question {nr}:", oob_error)

    # Impurity-based importances of the model fitted on all rows
    impOOB = t.feature_importances_

    # One figure per question; without a new figure all bar charts would be
    # drawn on top of each other on the same axes.
    plt.figure(figsize=(10, 4))
    plt.bar(range(len(impOOB)), impOOB)
    plt.title(f'Predictor Importance Estimates (WAI question {nr})')
    plt.xlabel('Predictor variable')
    plt.ylabel('Importance')
    plt.xticks(range(len(impOOB)), features.columns, rotation=90)
    plt.show()

    # Check by predicting held back data: repeated random 80/20 splits
    ncells = features.shape[0]
    perc = int(0.8 * ncells)

    nboots = 10
    Acc = np.empty(nboots)
    for b in range(nboots):
        # Uses NumPy's global random state, so the splits differ between
        # runs unless that state is seeded beforehand.
        shuf = np.random.permutation(ncells)
        incl = shuf[:perc]
        holdback = shuf[perc:]

        # Fresh estimator per split, so the full-data model `t` stays intact;
        # the imputed matrix is used here as well, matching the fit above.
        Mdl = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=1)
        Mdl.fit(features_imputed[incl], outcome.iloc[incl])
        label = Mdl.predict(features_imputed[holdback])
        Acc[b] = mean_squared_error(outcome.iloc[holdback], label)

    mean_mse = np.mean(Acc)
    print(f"Mean hold-out MSE for WAI question {nr}:", mean_mse)

    # Feature selection: drop the 7 least important predictors and refit.
    # NOTE: the predictors are ranked on the same rows that are scored below,
    # so this error estimate is optimistic.
    sorted_indices = np.argsort(impOOB)
    bestfeat = sorted_indices[7:]
    t = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                              oob_score=True, random_state=1)
    MdlBF = t.fit(features_imputed[:, bestfeat], outcome)
    oob_error_bf = mean_squared_error(outcome, MdlBF.oob_prediction_)
    print("Out-of-Bag MSE (Feature Selection):", oob_error_bf)

    # Print the selected features
    selected_features = features.columns[bestfeat]
    print("Selected Features:", selected_features)




## Pearson correlation

In [None]:
import numpy as np
from scipy.stats import pearsonr, linregress
from tabulate import tabulate

# Entries are [feature, [[question label, "r (slope)", p-value], ...]]
correlation_results = []

# Correlate every feature with every patient WAI item ('1' .. '12') and keep
# the pairs whose p-value is below 0.05
for feature in features:
    feature_values = df[feature].astype(float)
    correlated_wai_questions = []

    for wai_question in range(1, 13):
        item_scores = df[str(wai_question)].astype(float)

        # Pearson correlation coefficient and its p-value
        correlation_coefficient, p_value = pearsonr(feature_values, item_scores)
        if not p_value < 0.05:
            continue

        # Slope of the linear regression of the item score on the feature
        slope = linregress(feature_values, item_scores).slope
        correlated_wai_questions.append([
            f'WAI Question {wai_question}',
            f"{correlation_coefficient:.2f} ({slope:.2f})",
            f"{p_value:.4f}",
        ])

    if correlated_wai_questions:
        correlation_results.append([feature, correlated_wai_questions])

# Report: a text table on stdout plus the same table rendered into a PDF
if not correlation_results:
    print("No correlations with p-value < 0.05 found.")
else:
    headers = ["Feature", "WAI (Factors)","Correlation Coefficient (Slope)", "p-value"]

    # Flatten to one table row per significant (feature, question) pair
    table_data = [
        [feature_name, *corr_data]
        for feature_name, question_rows in correlation_results
        for corr_data in question_rows
    ]

    table = tabulate(table_data, headers=headers, tablefmt="pretty")

    # Draw the table on an axis-free figure and save it as a vector PDF
    plt.figure(figsize=(10, 6))
    plt.axis('off')
    plt.table(
        cellText=table_data,
        colLabels=headers,
        cellLoc='center',
        loc='center',
        colColours=['#f5f5f5']*len(headers),
    )
    plt.savefig('correlation_table_p_textfeat.pdf', format='pdf', bbox_inches='tight')
    plt.close()

    print("Correlation Results for Patient scores Text Features:")
    print(table)


## ANOVA

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA for every (patient WAI item, feature) pair: the rows are
# grouped by the score given to the item and the feature values of those
# groups are compared. Pairs with p < 0.05 are printed.
for i in range(12):
    nr = str(i + 1)
    wai_scores = df[nr].astype('category')
    categories = wai_scores.cat.categories

    # f_oneway raises when given fewer than two groups, which happens when
    # every row has the same score for this item.
    if len(categories) < 2:
        print("Question: ", nr, "skipped: fewer than two distinct scores")
        continue

    # The row masks depend only on the question, so build them once here
    # rather than once per feature.
    masks = [wai_scores == category for category in categories]

    for feature in features:
        groups = [df[feature][mask] for mask in masks]

        # Perform ANOVA test
        f_statistic, p_value = f_oneway(*groups)

        if p_value < 0.05:
            print("Question: ",nr)
            print("Feature:", feature)
            print("F-Statistic:", f_statistic)
            print("p-value:", p_value)


## heatmap for correlation between features

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Columns of `df` whose pairwise correlations are plotted.
# Note: this rebinds `features` to a plain list of column names.
features = [
    'neutral', 'curiosity', 'sadness', 'admiration', 'fear', 'disgust',
    'amusement', 'confusion', 'approval', 'joy', 'love', 'realization',
    'desire', 'annoyance', 'disapproval', 'nervousness', 'remorse',
    'excitement', 'anger', 'disappointment', 'surprise', 'caring', 'grief',
    'embarrassment', 'gratitude', 'pride', 'optimism', 'relief',
    'nr_positive', 'nr_negative',
]

# Pairwise correlation matrix of the selected columns
correlation_matrix = df.loc[:, features].corr()

# Annotated heatmap of the matrix (two decimals per cell)
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f')
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Matrix Heatmap")
plt.show()


# Therapist

In [None]:
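# Therapist (WAI items per subscale)
#bond: 2,5,7,9
#goal: 3,4,6,8
#task: 1,2,10
# NOTE(review): item 2 is listed under both bond and task, and the therapist
# subscale cell below puts item 6 under task rather than goal -- confirm the
# intended scoring key.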

# Patient + Observer
#bond: 3,5,7,9
#goal: 1,4,8,11
#task: 2,6,10,12

In [4]:
import pandas as pd

# Load the therapist-rated WAI scores together with the text features.
df = pd.read_csv('/content/emotiontext_incl_tWAI.csv', delimiter=',')

# Therapist WAI item columns are named 't1' .. 't10'
columns_to_check = [f't{item}' for item in range(1, 11)]

# Convert the WAI item columns to numeric; unparsable entries become NaN
df[columns_to_check] = df[columns_to_check].apply(pd.to_numeric, errors='coerce')

# Flag the cells that hold no valid number after the conversion.
# (An element-wise isinstance(x, float) test cannot do this: the coerced
# values are NaN, and NaN is itself a float.)
non_float_mask = df[columns_to_check].isna()

# Rows with at least one invalid or missing WAI answer; kept for inspection.
# These are the rows that the dropna below discards.
rows_with_non_floats = non_float_mask.any(axis=1)

# Remove exact duplicate rows, then rows with an incomplete questionnaire
df = df.drop_duplicates()
df = df.dropna(subset=columns_to_check, how='any')

# Keep a single row per participant and WAI answer pattern
df = df.drop_duplicates(subset=['ppnr', *columns_to_check])

# Renumber the remaining rows 0..n-1
df.reset_index(drop=True, inplace=True)
# The WAI items are complete at this point, so this only fills gaps in the
# other columns (with 0).
df.fillna(0, inplace=True)

In [5]:
import pandas as pd


# Therapist WAI subscales: item columns of `df` that belong to each subscale.
# NOTE(review): 't2' is used for both bond and task, and the item-mapping
# comment earlier in this notebook lists item 6 under goal whereas it is
# under task here -- confirm the intended therapist scoring key.
bond_columns = ['t2', 't5', 't7', 't9']
goal_columns = ['t3', 't4', 't8']
task_columns = ['t1', 't2', 't6', 't10']

# One score per row and subscale: the mean over that subscale's items
bond_df, goal_df, task_df = (
    df[item_columns].mean(axis=1)
    for item_columns in (bond_columns, goal_columns, task_columns)
)



In [None]:
# Display the current DataFrame (author's note: 48 rows after cleaning).
df #48 rows

## RF

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

# Random-forest regression of each therapist WAI item on the text features.
# `all_df` is an alias of the cleaned `df` (no copy is made), so the dtype
# conversion below is visible through both names.
all_df = df

# Convert numeric columns to float
numeric_columns = ['ppnr', 'session']
all_df[numeric_columns] = all_df[numeric_columns].astype(float)

df = all_df

# Select features (predictor columns); the target is chosen per question below
features = df[['neutral', 'curiosity', 'sadness', 'admiration',
       'fear', 'disgust', 'amusement', 'confusion', 'approval', 'joy', 'love',
       'realization', 'desire', 'annoyance', 'disapproval', 'nervousness',
       'remorse', 'excitement', 'anger', 'disappointment', 'surprise',
       'caring', 'grief', 'embarrassment', 'gratitude', 'pride', 'optimism',
       'relief', 'nr_positive', 'nr_negative']]

# Handle missing values in the features. The predictor matrix is the same for
# every question, so the imputer is fitted once instead of once per iteration.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

for i in range(10):
    nr = 't' + str(i + 1)

    outcome = all_df[nr]
    print("WAI question:", nr)

    # Train the model. oob_score=True makes the forest keep, for every row,
    # the prediction of the trees that did not see that row during training.
    t = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                              oob_score=True, random_state=1)
    t.fit(features_imputed, outcome)

    # Genuine out-of-bag MSE. (1 - score(X, y) on the training data is
    # 1 - R^2 of the in-sample fit: neither out-of-bag nor an MSE.)
    oob_error = mean_squared_error(outcome, t.oob_prediction_)
    print(f"Out-of-Bag MSE for WAI question {nr}:", oob_error)

    # Impurity-based importances of the model fitted on all rows
    impOOB = t.feature_importances_

    # One figure per question; without a new figure all bar charts would be
    # drawn on top of each other on the same axes.
    plt.figure(figsize=(10, 4))
    plt.bar(range(len(impOOB)), impOOB)
    plt.title(f'Predictor Importance Estimates (WAI question {nr})')
    plt.xlabel('Predictor variable')
    plt.ylabel('Importance')
    plt.xticks(range(len(impOOB)), features.columns, rotation=90)
    plt.show()

    # Check by predicting held back data: repeated random 80/20 splits
    ncells = features.shape[0]
    perc = int(0.8 * ncells)

    nboots = 10
    Acc = np.empty(nboots)
    for b in range(nboots):
        # Uses NumPy's global random state, so the splits differ between
        # runs unless that state is seeded beforehand.
        shuf = np.random.permutation(ncells)
        incl = shuf[:perc]
        holdback = shuf[perc:]

        # Fresh estimator per split, so the full-data model `t` stays intact;
        # the imputed matrix is used here as well, matching the fit above.
        Mdl = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=1)
        Mdl.fit(features_imputed[incl], outcome.iloc[incl])
        label = Mdl.predict(features_imputed[holdback])
        Acc[b] = mean_squared_error(outcome.iloc[holdback], label)

    mean_mse = np.mean(Acc)
    print(f"Mean hold-out MSE for WAI question {nr}:", mean_mse)

    # Feature selection: drop the 7 least important predictors and refit.
    # NOTE: the predictors are ranked on the same rows that are scored below,
    # so this error estimate is optimistic.
    sorted_indices = np.argsort(impOOB)
    bestfeat = sorted_indices[7:]
    t = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                              oob_score=True, random_state=1)
    MdlBF = t.fit(features_imputed[:, bestfeat], outcome)
    oob_error_bf = mean_squared_error(outcome, MdlBF.oob_prediction_)
    print("Out-of-Bag MSE (Feature Selection):", oob_error_bf)

    # Print the selected features
    selected_features = features.columns[bestfeat]
    print("Selected Features:", selected_features)

## Pearson Correlation

In [None]:
import numpy as np
from scipy.stats import pearsonr, linregress
from tabulate import tabulate

# Entries are [feature, [[question label, "r (slope)", p-value], ...]]
correlation_results = []

# Correlate every feature with every therapist WAI item ('t1' .. 't10') and
# keep the pairs whose p-value is below 0.05
for feature in features:
    feature_values = df[feature].astype(float)
    correlated_wai_questions = []

    for wai_question in range(1, 11):
        item_scores = df['t' + str(wai_question)].astype(float)

        # Pearson correlation coefficient and its p-value
        correlation_coefficient, p_value = pearsonr(feature_values, item_scores)
        if not p_value < 0.05:
            continue

        # Slope of the linear regression of the item score on the feature
        slope = linregress(feature_values, item_scores).slope
        correlated_wai_questions.append([
            f'WAI Question {wai_question}',
            f"{correlation_coefficient:.2f} ({slope:.2f})",
            f"{p_value:.4f}",
        ])

    if correlated_wai_questions:
        correlation_results.append([feature, correlated_wai_questions])

# Report: a text table on stdout plus the same table rendered into a PDF
if not correlation_results:
    print("No correlations with p-value < 0.05 found.")
else:
    headers = ["Feature", "WAI (Factors)","Correlation Coefficient (Slope)", "p-value"]

    # Flatten to one table row per significant (feature, question) pair
    table_data = [
        [feature_name, *corr_data]
        for feature_name, question_rows in correlation_results
        for corr_data in question_rows
    ]

    table = tabulate(table_data, headers=headers, tablefmt="pretty")

    # Draw the table on an axis-free figure and save it as a vector PDF
    plt.figure(figsize=(10, 6))
    plt.axis('off')
    plt.table(
        cellText=table_data,
        colLabels=headers,
        cellLoc='center',
        loc='center',
        colColours=['#f5f5f5']*len(headers),
    )
    plt.savefig('correlation_table_t_textfeat.pdf', format='pdf', bbox_inches='tight')
    plt.close()

    print("Correlation Results for Therapist scores Text Features:")
    print(table)

## ANOVA

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA for every (therapist WAI item, feature) pair: the rows are
# grouped by the score given to the item and the feature values of those
# groups are compared. Pairs with p < 0.05 are printed.
for i in range(10):
    nr = 't' + str(i + 1)
    wai_scores = df[nr].astype('category')
    categories = wai_scores.cat.categories

    # f_oneway raises when given fewer than two groups, which happens when
    # every row has the same score for this item.
    if len(categories) < 2:
        print("Question: ", nr, "skipped: fewer than two distinct scores")
        continue

    # The row masks depend only on the question, so build them once here
    # rather than once per feature.
    masks = [wai_scores == category for category in categories]

    for feature in features:
        groups = [df[feature][mask] for mask in masks]

        # Perform ANOVA test
        f_statistic, p_value = f_oneway(*groups)

        if p_value < 0.05:
            print("Question: ",nr)
            print("Feature:", feature)
            print("F-Statistic:", f_statistic)
            print("p-value:", p_value)


## heatmap for correlation between features

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Columns of `df` whose pairwise correlations are plotted.
# Note: this rebinds `features` to a plain list of column names.
# NOTE(review): these three columns are not among the emotion features used
# by the other cells of this section -- confirm they exist in this CSV.
features = [
    'valence',
    'arousal',
    'dominance',
]

# Pairwise correlation matrix of the selected columns
correlation_matrix = df.loc[:, features].corr()

# Annotated heatmap of the matrix (two decimals per cell)
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f')
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Matrix Heatmap")
plt.show()


# Observer scores

In [7]:
import pandas as pd

# Load the observer-rated WAI scores together with the text features.
df = pd.read_csv('/content/emotiontext_incl_oWAI.csv', delimiter=',')

# Prefix the WAI item columns: '1' .. '12' become 'o1' .. 'o12'
new_column_names = {str(i): f'o{i}' for i in range(1, 13)}
df.rename(columns=new_column_names, inplace=True)

# Observer WAI item columns (after renaming)
columns_to_check = [f'o{item}' for item in range(1, 13)]

# Convert the WAI item columns to numeric; unparsable entries become NaN
df[columns_to_check] = df[columns_to_check].apply(pd.to_numeric, errors='coerce')

# Flag the cells that hold no valid number after the conversion.
# (An element-wise isinstance(x, float) test cannot do this: the coerced
# values are NaN, and NaN is itself a float.)
non_float_mask = df[columns_to_check].isna()

# Rows with at least one invalid or missing WAI answer; kept for inspection.
# These are the rows that the dropna below discards.
rows_with_non_floats = non_float_mask.any(axis=1)

# Remove exact duplicate rows, then rows with an incomplete questionnaire
df = df.drop_duplicates()
df = df.dropna(subset=columns_to_check, how='any')

# Keep a single row per participant and WAI answer pattern
df = df.drop_duplicates(subset=['ppnr', *columns_to_check])

# Renumber the remaining rows 0..n-1
df.reset_index(drop=True, inplace=True)
# The WAI items are complete at this point, so this only fills gaps in the
# other columns (with 0).
df.fillna(0, inplace=True)

In [None]:
# Display the current DataFrame (author's note: 75 rows after cleaning).
df #75 rows

In [8]:
import pandas as pd


# Observer WAI subscales: item columns of `df` that belong to each subscale
bond_columns = ['o3', 'o5', 'o7','o9']
goal_columns = ['o1', 'o4', 'o8','o11']
task_columns = ['o2', 'o6', 'o10','o12']

# Calculate the row-wise average for the specified columns.
# (This computation used to appear twice in a row; once is enough.)
bond_df = df[bond_columns].mean(axis=1)
goal_df = df[goal_columns].mean(axis=1)
task_df = df[task_columns].mean(axis=1)



## RF

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

# Random-forest regression of each observer WAI item on the text features.
# `all_df` is an alias of the cleaned `df` (no copy is made), so the dtype
# conversion below is visible through both names.
all_df = df

# Convert numeric columns to float
numeric_columns = ['ppnr', 'session']
all_df[numeric_columns] = all_df[numeric_columns].astype(float)

df = all_df

# Select features (predictor columns); the target is chosen per question below
features = df[['neutral', 'curiosity', 'sadness', 'admiration',
       'fear', 'disgust', 'amusement', 'confusion', 'approval', 'joy', 'love',
       'realization', 'desire', 'annoyance', 'disapproval', 'nervousness',
       'remorse', 'excitement', 'anger', 'disappointment', 'surprise',
       'caring', 'grief', 'embarrassment', 'gratitude', 'pride', 'optimism',
       'relief', 'nr_positive', 'nr_negative']]

# Handle missing values in the features. The predictor matrix is the same for
# every question, so the imputer is fitted once instead of once per iteration.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

for i in range(12):
    nr = 'o' + str(i + 1)

    outcome = all_df[nr]
    print("WAI question:", nr)

    # Train the model. oob_score=True makes the forest keep, for every row,
    # the prediction of the trees that did not see that row during training.
    t = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                              oob_score=True, random_state=1)
    t.fit(features_imputed, outcome)

    # Genuine out-of-bag MSE. (1 - score(X, y) on the training data is
    # 1 - R^2 of the in-sample fit: neither out-of-bag nor an MSE.)
    oob_error = mean_squared_error(outcome, t.oob_prediction_)
    print(f"Out-of-Bag MSE for WAI question {nr}:", oob_error)

    # Impurity-based importances of the model fitted on all rows
    impOOB = t.feature_importances_

    # One figure per question; without a new figure all bar charts would be
    # drawn on top of each other on the same axes.
    plt.figure(figsize=(10, 4))
    plt.bar(range(len(impOOB)), impOOB)
    plt.title(f'Predictor Importance Estimates (WAI question {nr})')
    plt.xlabel('Predictor variable')
    plt.ylabel('Importance')
    plt.xticks(range(len(impOOB)), features.columns, rotation=90)
    plt.show()

    # Check by predicting held back data: repeated random 80/20 splits
    ncells = features.shape[0]
    perc = int(0.8 * ncells)

    nboots = 10
    Acc = np.empty(nboots)
    for b in range(nboots):
        # Uses NumPy's global random state, so the splits differ between
        # runs unless that state is seeded beforehand.
        shuf = np.random.permutation(ncells)
        incl = shuf[:perc]
        holdback = shuf[perc:]

        # Fresh estimator per split, so the full-data model `t` stays intact;
        # the imputed matrix is used here as well, matching the fit above.
        Mdl = RandomForestRegressor(n_estimators=100, max_features='sqrt', random_state=1)
        Mdl.fit(features_imputed[incl], outcome.iloc[incl])
        label = Mdl.predict(features_imputed[holdback])
        Acc[b] = mean_squared_error(outcome.iloc[holdback], label)

    mean_mse = np.mean(Acc)
    print(f"Mean hold-out MSE for WAI question {nr}:", mean_mse)

    # Feature selection: drop the 7 least important predictors and refit.
    # NOTE: the predictors are ranked on the same rows that are scored below,
    # so this error estimate is optimistic.
    sorted_indices = np.argsort(impOOB)
    bestfeat = sorted_indices[7:]
    t = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                              oob_score=True, random_state=1)
    MdlBF = t.fit(features_imputed[:, bestfeat], outcome)
    oob_error_bf = mean_squared_error(outcome, MdlBF.oob_prediction_)
    print("Out-of-Bag MSE (Feature Selection):", oob_error_bf)

    # Print the selected features
    selected_features = features.columns[bestfeat]
    print("Selected Features:", selected_features)



## Pearson Correlation

In [None]:
import numpy as np
from scipy.stats import pearsonr, linregress
from tabulate import tabulate

# Entries are [feature, [[question label, "r (slope)", p-value], ...]]
correlation_results = []

# Correlate every feature with every observer WAI item ('o1' .. 'o12') and
# keep the pairs whose p-value is below 0.05
for feature in features:
    feature_values = df[feature].astype(float)
    correlated_wai_questions = []

    for wai_question in range(1, 13):
        item_scores = df['o' + str(wai_question)].astype(float)

        # Pearson correlation coefficient and its p-value
        correlation_coefficient, p_value = pearsonr(feature_values, item_scores)
        if not p_value < 0.05:
            continue

        # Slope of the linear regression of the item score on the feature
        slope = linregress(feature_values, item_scores).slope
        correlated_wai_questions.append([
            f'WAI Question {wai_question}',
            f"{correlation_coefficient:.2f} ({slope:.2f})",
            f"{p_value:.4f}",
        ])

    if correlated_wai_questions:
        correlation_results.append([feature, correlated_wai_questions])

# Report: a text table on stdout plus the same table rendered into a PDF
if not correlation_results:
    print("No correlations with p-value < 0.05 found.")
else:
    headers = ["Feature", "WAI (Factors)","Correlation Coefficient (Slope)", "p-value"]

    # Flatten to one table row per significant (feature, question) pair
    table_data = [
        [feature_name, *corr_data]
        for feature_name, question_rows in correlation_results
        for corr_data in question_rows
    ]

    table = tabulate(table_data, headers=headers, tablefmt="pretty")

    # Draw the table on an axis-free figure and save it as a vector PDF
    plt.figure(figsize=(10, 6))
    plt.axis('off')
    plt.table(
        cellText=table_data,
        colLabels=headers,
        cellLoc='center',
        loc='center',
        colColours=['#f5f5f5']*len(headers),
    )
    plt.savefig('correlation_table_o_textfeat.pdf', format='pdf', bbox_inches='tight')
    plt.close()

    print("Correlation Results for the observer scores Text Features:")
    print(table)

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA for every (observer WAI item, feature) pair: the rows are
# grouped by the score given to the item and the feature values of those
# groups are compared. Pairs with p < 0.05 are printed.
for i in range(12):
    nr = 'o' + str(i + 1)
    wai_scores = df[nr].astype('category')
    categories = wai_scores.cat.categories

    # f_oneway raises when given fewer than two groups, which happens when
    # every row has the same score for this item.
    if len(categories) < 2:
        print("Question: ", nr, "skipped: fewer than two distinct scores")
        continue

    # The row masks depend only on the question, so build them once here
    # rather than once per feature.
    masks = [wai_scores == category for category in categories]

    for feature in features:
        groups = [df[feature][mask] for mask in masks]

        # Perform ANOVA test
        f_statistic, p_value = f_oneway(*groups)

        if p_value < 0.05:
            print("Question: ",nr)
            print("Feature:", feature)
            print("F-Statistic:", f_statistic)
            print("p-value:", p_value)


## heatmap for correlation between features

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Columns of `df` whose pairwise correlations are plotted.
# Note: this rebinds `features` to a plain list of column names.
# NOTE(review): these three columns are not among the emotion features used
# by the other cells of this section -- confirm they exist in this CSV.
features = [
    'valence',
    'arousal',
    'dominance',
]

# Pairwise correlation matrix of the selected columns
correlation_matrix = df.loc[:, features].corr()

# Annotated heatmap of the matrix (two decimals per cell)
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f')
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Matrix Heatmap")
plt.show()
