In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

def set_css_in_cell_output():
    display(HTML('''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''))

get_ipython().events.register('pre_run_cell', set_css_in_cell_output)
from IPython.core import ultratb
ultratb.VerboseTB._tb_highlight = "bg:#0D0D0D"

In [2]:
import pandas as pd
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
import time
from scipy import stats
from tqdm.notebook import tqdm
import json

pd.options.display.max_columns = None

In [None]:
DATA_ROOT = './data/AAPECS/'  # please contact the authors for access to the data

In [3]:
pheno_df = pd.read_csv(f'{DATA_ROOT}/eod_new_time.csv')
data_root = f'{DATA_ROOT}/raw_video_logs'
data_files = [f for f in listdir(data_root) if isfile(join(data_root, f))]

sub_transcripts = {}
sub_lengths = {}

for file in tqdm(data_files):
    df = pd.read_csv(f'{data_root}/{file}')
    df = df[(df.values[:, -1] != 'NO_ANSWER') & (df.values[:, -1] != 'SKIPPED')]

    dates = [x.replace('/', '_') for x in df['Survey Submitted Date'].values]  # dd_mm_yyyy
    times = [x.replace(':', '-') for x in df['Survey Submitted Time'].values]  # dd_mm_yyyy
    addresses = df.values[:, -1]
    userid = df['User Id'].values
    usernum = file.lower().replace('eod', '').replace('vids', '').replace('videos', '').replace('.csv', '').replace('video', '')
    usernum = int(usernum)
    triggers = [x.replace(' ', '') for x in df['Trigger Type'].values]

    assert np.all(['http' in x for x in addresses])
    assert np.all((df['Trigger Type'].values == "DAILY") | (df['Trigger Type'].values == "DELETED TRIGGER") | (df['Trigger Type'].values == "ONCE"))
    assert np.all([x.split('/')[-1].split('.')][-1] == 'mp4' for x in addresses)
    
    sub_pheno_df = pheno_df[pheno_df.participantID == usernum]
    if sub_pheno_df.shape[0] == 0: continue

    
    for i in range(len(addresses)):
        txt = json.load(open(f'{DATA_ROOT}/transcripts_json/{usernum}/{usernum}_{dates[i]}_{times[i]}_{triggers[i]}_{userid[i]}.json'))['text'].strip()
        
        day, month, year = dates[i].split('_')
        sub = usernum
        if sub not in sub_transcripts:
            sub_transcripts[sub] = []
        sub_transcripts[sub].append(txt)
        
        recording_length = json.load(open(f'{DATA_ROOT}/transcripts_json/{usernum}/{usernum}_{dates[i]}_{times[i]}_{triggers[i]}_{userid[i]}.json'))['segments'][-1]['end']
        if sub not in sub_lengths:
            sub_lengths[sub] = []
        sub_lengths[sub].append(recording_length)


A Jupyter Widget

In [4]:
name_mapping_aapecs = {
    'O': 'neoOpenness',
    'C': 'neoConscientiousness',
    'E': 'neoExtraversion',
    'A': 'neoAgreeableness',
    'N': 'neoNeuroticism'
}
cols_of_interest = sorted(name_mapping_aapecs.values())

In [5]:
scales_df = pd.read_csv(f'{DATA_ROOT}/selfReport.csv')
scales_df = scales_df[scales_df.participantID.isin([int(x) for x in sub_lengths])]
scales_df = scales_df[['participantID']+list(cols_of_interest)]
print(scales_df.shape)
scales_df.head()

(108, 6)


Unnamed: 0,participantID,neoAgreeableness,neoConscientiousness,neoExtraversion,neoNeuroticism,neoOpenness
0,1,2.75,2.875,1.291667,2.875,1.875
1,2,2.666667,2.708333,1.75,2.416667,2.166667
2,3,2.430556,1.083333,2.0,3.625,3.166667
3,4,2.25,2.666667,2.958333,2.833333,3.208333
4,5,3.0,3.083333,1.25,2.75,1.625


In [6]:
liwc_df = pd.read_csv(f'{DATA_ROOT}/AAPECS_LIWC_2015.csv')
liwc_df['participantID'] = [int(x.split('_')[0]) for x in liwc_df.Filename.values.flatten()]
liwc_df = liwc_df[liwc_df.participantID.isin(scales_df.participantID.values.flatten())]
liwc_df = liwc_df.drop(columns=['Filename', 'Segment'])
liwc_df = liwc_df.groupby('participantID', as_index=False).mean()
print(liwc_df.shape)
liwc_df.head()

(108, 94)


Unnamed: 0,participantID,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,compare,interrog,number,quant,affect,posemo,negemo,anx,anger,sad,social,family,friend,female,male,cogproc,insight,cause,discrep,tentat,certain,differ,percept,see,hear,feel,bio,body,health,sexual,ingest,drives,affiliation,achieve,power,reward,risk,focuspast,focuspresent,focusfuture,relativ,motion,space,time,work,leisure,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,AllPunc,Period,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,1,115.294118,41.665294,31.838824,90.496471,44.923529,13.054706,10.347059,96.483529,56.887647,17.508824,12.078235,8.935294,1.25,0.24,1.555882,0.096471,5.431176,6.471176,13.371765,10.751176,7.502353,6.569412,2.368824,19.808235,1.897059,0.950588,0.561176,1.455882,3.543529,5.827647,3.428235,2.367059,0.96,1.077647,0.032941,7.495294,0.307059,1.068235,1.55,0.595882,8.41,0.544118,1.154706,0.426471,2.210588,1.131765,3.271176,1.123529,0.444118,0.300588,0.378235,2.788824,0.487059,0.511765,0.0,1.845882,10.574118,3.183529,1.829412,1.626471,3.975294,0.188824,9.394706,13.085882,2.592941,21.197647,2.634706,6.294118,12.684118,2.077059,1.091765,1.055882,0.697059,0.148824,0.0,0.675294,0.048235,0.0,0.530588,0.048235,0.0,25.293529,9.021765,10.814118,0.0,0.0,0.221176,0.0,0.0,0.0,5.235882,0.0,0.0
1,2,144.428571,17.546667,23.048095,88.160952,32.511429,16.728095,12.862381,96.456667,63.14,22.07381,15.87381,12.687619,0.848571,0.513333,1.414762,0.41,6.199524,4.570476,13.096667,11.674286,8.927143,7.96619,2.59619,20.6,4.989048,3.267143,1.276667,0.54381,4.094762,6.098095,2.837143,3.009524,0.80619,1.159524,0.456667,8.583333,1.561429,0.22381,2.29381,0.335238,14.391429,2.222857,1.304286,1.163333,4.281905,1.40381,4.709048,2.251905,0.487143,0.480952,1.28381,2.055238,0.692857,1.053333,0.032381,0.267143,7.240952,2.692857,1.54,1.424762,1.77619,0.186667,7.179524,11.14,1.571905,13.760476,2.495714,5.749048,5.657619,0.935238,0.607619,0.626667,0.443333,0.159048,0.0,0.827619,0.092381,0.028571,0.257143,0.449524,0.0,19.088095,7.040952,7.640476,0.0,0.0,0.1,0.0,0.0,0.0,4.305714,0.0,0.0
2,3,61.333333,18.16,10.781667,97.015,37.816667,12.678333,12.818333,94.311667,59.888333,20.03,15.513333,14.793333,0.173333,0.0,0.546667,0.0,4.515,5.268333,9.7,9.071667,8.658333,10.75,1.715,16.856667,6.563333,3.701667,1.076667,0.498333,3.915,9.091667,4.406667,4.396667,2.536667,0.575,0.535,6.37,0.273333,1.498333,0.82,0.0,15.95,3.541667,0.793333,0.421667,5.488333,0.535,5.743333,2.821667,0.0,0.0,2.821667,1.306667,0.248333,0.771667,0.0,0.286667,8.565,2.941667,1.093333,1.825,2.698333,0.5,9.226667,7.095,1.59,12.855,1.998333,3.923333,7.218333,1.475,1.796667,0.286667,0.0,0.0,0.0,0.671667,0.248333,0.173333,0.421667,0.0,0.0,17.258333,8.501667,6.23,0.0,0.0,0.0,0.0,0.0,0.0,2.523333,0.0,0.0
3,4,166.944444,10.802222,22.613333,76.110556,45.163889,21.277778,13.885556,94.066667,60.402778,20.606111,12.670556,9.758889,0.617222,0.291667,1.768889,0.235556,7.934444,4.333333,10.147222,11.88,9.051111,11.586667,1.803889,20.716667,5.265556,2.775,1.432778,0.751667,2.394444,8.500556,4.387222,3.96,1.343889,0.763889,0.976667,6.962778,0.0,0.807222,0.061111,2.455,15.148333,3.295556,1.787778,1.181111,4.11,1.712222,4.149444,2.676667,0.64,0.468333,1.453889,2.243333,0.666667,0.768889,0.412778,0.408889,5.27,1.348333,1.171111,0.848889,1.639444,0.542222,6.353333,12.808333,1.840556,12.783889,2.033889,4.668889,6.073889,1.285,1.127778,0.515,0.159444,0.041667,0.087222,2.236111,0.533889,0.0,0.422222,0.846667,0.116111,28.24,7.569444,14.231111,0.0,0.0,0.0,0.105,0.305,0.0,5.999444,0.0,0.027778
4,5,87.789474,44.479474,44.299474,69.36,49.663684,14.744737,13.730526,93.993684,59.138947,17.836316,12.561579,8.292105,1.506316,0.063158,1.475263,1.225789,5.275789,6.921579,13.182632,10.672105,4.559474,9.557368,1.239474,19.391053,7.313684,2.938421,0.949474,0.817895,2.510526,7.835263,4.513158,3.321053,1.662105,0.225263,0.746316,8.788421,1.517368,0.254737,1.340526,1.505263,10.5,2.270526,1.506316,0.857368,2.793684,0.592105,3.04,2.25,0.384211,0.400526,1.238947,1.427895,0.262632,0.643158,0.0,0.489474,10.49,3.679474,2.414211,2.105263,2.42,0.452632,11.708947,5.932105,0.837368,15.108421,3.056842,5.751579,6.332105,2.103158,1.656842,0.763158,0.083684,0.124211,0.077368,0.482105,0.0,0.0,0.0,0.482105,0.0,15.013158,7.532632,5.214737,0.0,0.0,0.0,0.0,0.221053,0.0,2.043684,0.0,0.0


In [7]:
from sklearn.model_selection import KFold
import statsmodels.api as sm
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, sem

def fit_ols_models_cv(scales_df, liwc_df, outer_splits=5, random_state=42):
    """
    Perform k-fold cross-validation to compute multiple correlation (R)
    between LIWC predictors and each trait.

    Parameters:
        scales_df: DataFrame with subject and trait columns.
        liwc_df: DataFrame with subject and LIWC features.
        outer_splits: Number of outer CV folds.
    
    Returns:
        final_r: dict of mean and std_err cross-validated R per trait.
    """
    merged = pd.merge(scales_df, liwc_df, on='subject').dropna()
    assert merged.shape[0] == 60

    predictor_cols = [col for col in liwc_df.columns if col != 'subject']
    X_all = merged[predictor_cols].values
    y_all = {trait: merged[trait].values for trait in scales_df.columns if trait != 'subject'}

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    avg_cv_r = {trait: [] for trait in y_all}

    for train_idx, test_idx in outer_cv.split(X_all):
        X_train, X_test = X_all[train_idx], X_all[test_idx]

        X_train_w_intercept = sm.add_constant(X_train)
        X_test_w_intercept = sm.add_constant(X_test)

        for trait, y in y_all.items():
            y_train, y_test = y[train_idx], y[test_idx]
            model = sm.OLS(y_train, X_train_w_intercept).fit()
            y_pred = model.predict(X_test_w_intercept)

            r, _ = pearsonr(y_test, y_pred)
            avg_cv_r[trait].append(r)

    final_r = {trait: (np.mean(r_vals), sem(r_vals)) for trait, r_vals in avg_cv_r.items()}
    return final_r

In [11]:
def pretty_print_ols_cv_results(results):
    """
    Pretty prints 5-fold CV results for OLS without PCA.

    Parameters:
        results (dict): Trait name → cross-validated R
    """
    max_len = max(len(trait) for trait in results)
    lines = []
    lines.append("5-Fold Cross-Validation Results (simple OLS)")
    lines.append("Multiple Correlation Coefficient: (trait ~ all LIWC predictors)")
    for trait in results:
        r_ci_upper = results[trait][0]+(1.96*results[trait][1])
        r_ci_lower = results[trait][0]-(1.96*results[trait][1])
        r_ci = f'({r_ci_lower:>5.3f} - {r_ci_upper:.3f})'
        lines.append(f"• {trait:<{max_len}}  R = {results[trait][0]:>5.3f}  R-SE = {results[trait][1]:>5.3f}   R-95%CI = {r_ci}")
    return "\n".join(lines)

cv_r = fit_ols_models_cv(scales_df, liwc_df)
print(pretty_print_ols_cv_results(cv_r))


5-Fold Cross-Validation Results (simple OLS)
Multiple Correlation Coefficient: (trait ~ all LIWC predictors)
• neoAgreeableness      R = -0.063  R-SE = 0.111   R-95%CI = (-0.281 - 0.154)
• neoConscientiousness  R = 0.194  R-SE = 0.028   R-95%CI = (0.140 - 0.249)
• neoExtraversion       R = 0.096  R-SE = 0.119   R-95%CI = (-0.137 - 0.329)
• neoNeuroticism        R = 0.160  R-SE = 0.099   R-95%CI = (-0.034 - 0.354)
• neoOpenness           R = -0.150  R-SE = 0.106   R-95%CI = (-0.359 - 0.058)
