# Prior Parameters Estimation
for Bayesian Model Comparison

In [95]:
# Basic imports and setup.

import sys
import logging
from pathlib import Path

from IPython.display import display
%load_ext autoreload
%autoreload 2

import pandas as pd
# Show up to 120 rows and 120 columns when a frame is displayed.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 120)

from neuropsymodelcomparison.dataprocessing import analysis
from neuropsymodelcomparison import plot

# Route log records of level INFO and above to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# Load the raw CSV exports (users, blocks and trials).
data_path = Path("../data/raw")

# Nullable Int8 columns keep missing entries as <NA> instead of being cast to float.
users = pd.read_csv(
    data_path / 'users.csv',
    dtype={'gaming_exp': pd.Int8Dtype()},
)
blocks = pd.read_csv(
    data_path / 'blocks.csv',
    index_col='id',
    parse_dates=['time_iso'],
    dtype={'rating': pd.Int8Dtype()},
)
trials = pd.read_csv(
    data_path / 'trials.csv',
    index_col='id',
)

Pilot data was collected up until July 31st 2020. We use these data to estimate our prior beliefs about the model parameters.

In [38]:
# Exclusive cutoff: a day starts at 00:00:00, so this marks the end of July 31st.
end_date = '2020-08-01'

# Restrict the blocks to the piloting time period.
in_pilot_period = blocks['time_iso'] < end_date
blocks = blocks.loc[in_pilot_period, :]

# Drop users who have no block within that period.
pilot_user_ids = blocks['user_id'].unique()
users = users.loc[users['id'].isin(pilot_user_ids), :]

In [39]:
# If a subsequent block is completed within 2 seconds after the previous one, there was a malfunction in the app.
blocks, n_errors, invalid_sessions = analysis.remove_erroneous_blocks(blocks, delta_time=2.0, n_blocks=3)
print(f"There were {n_errors} malfunctions during testing. {len(invalid_sessions)} sessions had to be removed.")

# Merge data to 1 table and mark the identifier/design columns as categorical.
df = analysis.join_data(users, blocks, trials)
categorical_cols = ['user', 'session', 'block', 'block_id', 'condition', 'task']
df[categorical_cols] = df[categorical_cols].astype('category')

# Remove trials for which sliders were grabbed with too much time apart.
# The arbitrary choice for a threshold is set to a third of the available time
# (median trial duration across the remaining blocks).
grab_diff_threshold = blocks['trial_duration'].median() / 3
n_trials = len(df)
# Take an explicit copy: columns are added to `df` later on, and assigning to a
# filtered slice would otherwise emit a SettingWithCopyWarning.
df = df.loc[df['grab_diff'] < grab_diff_threshold, :].copy()
n_trials_removed = n_trials - len(df)
print(f"{n_trials_removed} trials were removed because slider activation was more than a third of the available time apart.")

There were 0 malfunctions during testing. 0 sessions had to be removed.
45 trials were removed because slider activation was more than a third of the available time apart.


## Outlier Detection
Outlier detection by covariance estimation in a Gaussian distributed dataset.

In [40]:
# Detect outliers in the dataset.
contamination = 0.024  # Proportion of outliers in the data set. Manually adjusted upon looking at the data.

# `z` is passed on to the trials figure as contour data.
outliers, z = analysis.get_outlyingness(df[['df1', 'df2']].to_numpy(), contamination=contamination)
# Build a new frame instead of assigning into a filtered slice (avoids SettingWithCopyWarning).
df = df.assign(outlier=outliers.astype(bool))
# Summing the boolean flags also works when no trial is flagged;
# `value_counts()[True]` would raise a KeyError in that case.
n_trials_outliers = int(df['outlier'].sum())
print(f"There are {n_trials_outliers} outliers in the data set at a contamination of {contamination*100:.2f}%.")

There are 14 outliers in the data set at a contamination of 2.40%.


# Plot Pilot Data Set

In [34]:
# Figure of the pilot trials; the outlyingness values `z` are handed over as contour data.
fig_trials_scatter = plot.generate_trials_figure(
    df,
    contour_data=z,
)
fig_trials_scatter.show()

Check if we have enough samples.

In [41]:
# Count the trials that remain per user, session and block.
count_keys = ['user', 'session', 'block_id', 'block', 'condition']
trials_per_block = df.groupby(count_keys, observed=True).size()
df_counts = trials_per_block.rename('valid trials count').reset_index()
df_counts

Unnamed: 0,user,session,block_id,block,condition,valid trials count
0,0,1,1,1,df1,25
1,0,1,2,2,df1,21
2,0,1,3,3,df1,25
3,1,1,4,1,df2,25
4,1,1,5,2,df2,24
5,1,1,6,3,df2,25
6,2,1,7,1,df2,24
7,2,1,8,2,df2,25
8,2,1,9,3,df2,22
9,3,1,10,1,df2,25


## Calculate Squared Projection Lengths
The model comparison uses squared projection lengths internally. The transformation is a step towards calculating variance. The mean of the projections is already 0. Since the squared values are all positive, we can then use gamma distributions to describe the parameters.

In [45]:
# Calculate projections onto vectors parallel and orthogonal to UCM.
# NOTE: this cell replaces `df` with a reduced frame (user, block_id, block and the
# projection columns). Re-running it on its own fails because 'session', 'task',
# 'df1' and 'df2' are gone afterwards; re-run the cells above first.
ucm_vec = analysis.get_ucm_vec()
# group_keys=False keeps the result indexed like `df` (no group keys are prepended to
# the index), which the column-wise concat below relies on. Since pandas 2.0 the default
# group_keys=True would otherwise add the group keys as extra index levels.
# Assumes get_projections returns one row per trial, indexed like its input — TODO confirm.
projections = (
    df.groupby(['user', 'session', 'task'], observed=True, group_keys=False)[['df1', 'df2']]
    .apply(analysis.get_projections, ucm_vec)
)
# Square the projection lengths element-wise.
projections = projections ** 2
df = pd.concat([df[['user', 'block_id', 'block']], projections], axis='columns')

## Estimate Parameters for Models
We parameterize the gamma distributions for the data by the mean (mu) and standard deviation (sigma). We sample these parameters from gamma distributions as well, since they have to be positive.

### Null Model
Assume all projections are sampled from the same distribution. Each participant may have a different baseline.

In [52]:
# Pool the squared parallel and orthogonal projections into a single column per user.
samples = df.melt(id_vars='user', value_vars=['parallel', 'orthogonal'], value_name='proj_sq')[['user', 'proj_sq']]
# First compute means and standard deviation for each user across all projections and blocks. Then take the mean and standard deviation from these results.
# observed=True matches the other groupby calls in this notebook: 'user' is categorical,
# and users without any remaining trials must not show up as all-NaN rows.
null_params = samples.groupby('user', observed=True)['proj_sq'].agg(['mean', 'std']).agg(['mean', 'std'])
null_params

Unnamed: 0,mean,std
mean,72.104866,243.362728
std,61.787912,189.30291


### Main Effect Projection
Assume a constant difference between parallel and orthogonal projections to the UCM. The difference can vary between participants.

In [96]:
# Per-user mean and standard deviation of the squared projections.
# Select the projection columns explicitly: `df` also holds the categorical 'block_id'
# and 'block' columns, and aggregating those with 'mean'/'std' raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them).
projection_cols = ['parallel', 'orthogonal']
projection_effect_params = df.groupby('user', observed=True)[projection_cols].agg(['mean', 'std'])
display(projection_effect_params)
# Difference of parallel squared projections to orthogonal squared projections.
# Subtract by column name rather than via diff(-1)/dropna, which depends on the column
# order and drops every column as soon as one user mean is NaN.
user_means = projection_effect_params.xs('mean', level=1, axis='columns')
(user_means['parallel'] - user_means['orthogonal']).agg(['mean', 'std']).to_frame('parallel - orthogonal')

Unnamed: 0_level_0,parallel,parallel,orthogonal,orthogonal
Unnamed: 0_level_1,mean,std,mean,std
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,24.176605,39.018029,3.695711,9.223856
1,21.640181,126.440421,26.433303,147.79047
2,162.505849,419.330665,95.493054,386.215401
3,64.336062,182.063745,25.118699,71.855827
4,135.322462,418.756395,53.131031,163.615041
5,227.275858,492.3772,133.371138,288.237727
6,164.878179,709.831206,8.237312,54.060492
7,3.601251,4.207189,4.461157,7.698016


Unnamed: 0,parallel
mean,56.72438
std,54.563912
