In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

# Raise the pandas display limits (output width, max rows, max columns).
# set_option accepts alternating option-name / value pairs in one call.
pd.set_option(
    'display.width', 500,
    'display.max_rows', 150,
    'display.max_columns', 150,
)

In [2]:
# Load the stress-condition RSA file, report its dimensions, and preview
# the first five rows.
data = pd.read_csv('ydi_rsa_stress.csv')
print(data.shape)
data.head(n=5)

(45956, 5)


Unnamed: 0,id,epoch,study,rsa_y_ms,rsa_p_ms
0,2,1,pdm,6.980297,5.363268
1,2,2,pdm,7.009551,5.337959
2,2,3,pdm,7.003428,5.294412
3,2,4,pdm,6.967709,5.275212
4,2,5,pdm,6.900535,5.253063


In [14]:
# Number of distinct ids in the stress file.
data['id'].nunique()

146

In [15]:
# Rows recorded per id, most frequent first.
data['id'].value_counts()

id
1061    453
1021    386
1083    384
1073    362
24      345
64      345
4       341
14      341
71      339
90      337
102     335
1085    334
61      332
23      330
45      330
1059    329
1070    328
1066    328
1016    328
1055    328
72      328
1056    327
40      327
1076    327
1054    325
1060    325
7       325
1057    323
1042    323
1022    322
8       322
1002    322
44      322
54      321
63      321
1015    321
67      320
1092    320
1110    320
1069    320
37      320
94      320
1041    319
1068    319
1108    318
1024    318
36      318
28      318
30      318
76      318
57      318
22      317
25      317
1086    317
1078    317
1058    316
95      316
1012    316
101     316
65      316
41      316
80      315
51      315
1018    315
1030    315
1100    315
1031    315
17      315
32      315
62      315
49      314
1013    314
1117    314
82      314
92      314
1040    314
1049    314
42      314
1096    313
18      313
43      313
93      313
26      313
1

### Most pairs of participants (IDs) have different numbers of timepoints

In [16]:
# Rows contributed by each study label.
data['study'].value_counts()

study
pdm      23456
dorry    22500
Name: count, dtype: int64

In [25]:
# Rows (timepoints) recorded for each id in the stress file.
n_subs = data.id.value_counts()
# Keep ids with at least 300 rows, ordered by id. The mask is built from
# n_subs itself so value_counts() is not computed a second time.
n_thresh = n_subs[n_subs >= 300].sort_index()
print(len(n_thresh))
n_thresh

136


id
3       300
4       341
5       311
6       305
7       325
8       322
11      310
13      308
14      341
17      315
18      313
20      310
22      317
23      330
24      345
25      317
26      313
28      318
29      304
30      318
31      311
32      315
33      312
36      318
37      320
38      309
39      311
40      327
41      316
42      314
43      313
44      322
45      330
46      307
49      314
51      315
52      303
54      321
56      310
57      318
58      312
59      313
60      313
61      332
62      315
63      321
64      345
65      316
66      303
67      320
68      309
69      311
71      339
72      328
73      304
76      318
80      315
81      311
82      314
87      313
90      337
91      309
92      314
93      313
94      320
95      316
96      310
99      308
101     316
102     335
1002    322
1005    307
1007    308
1008    307
1012    316
1013    314
1014    309
1015    321
1016    328
1017    307
1018    315
1020    305
1021    386
1

### Data Description

* Baseline: n=142 participants have >=150 timepoints
* Stress: n=136 participants have >=300 timepoints

### Load Baseline data

In [28]:
# Load the baseline-condition file and count rows (timepoints) per id.
bas = pd.read_csv('ydi_rsa_baseline.csv')
bas_subs = bas.id.value_counts()
# Keep ids with at least 150 rows, ordered by id. The mask reuses bas_subs
# instead of running value_counts() a second time.
bas_thresh = bas_subs[bas_subs >= 150].sort_index()
len(bas_thresh)

142

### Get participants who have enough epochs in both the baseline and stress conditions
* N=134 participants with at least 150 epochs in the baseline condition and at least 300 epochs in the stress condition

In [32]:
# Ids that pass both the stress and the baseline thresholds. Sorting gives
# the list (and the CSV later written from it) a stable ascending id order
# rather than the arbitrary iteration order of a set.
subs_use = sorted(set(n_thresh.index).intersection(bas_thresh.index))

In [38]:
# Number of ids retained for analysis (present in both thresholded sets).
print(len(subs_use))

134


In [34]:
# Write the retained ids to CSV, omitting the index column.
pd.Series(data=subs_use).to_csv(path_or_buf='rsa_subs_to_use.csv', index=False)

### Select data for Good Participants
* check for any null values

In [39]:
# Restrict the stress rows to the retained ids, then report the resulting
# shape and whether any column contains missing values.
stress = data.loc[data['id'].isin(subs_use)]
print(stress.shape)
stress.isna().any()

(42698, 5)


id          False
epoch       False
study       False
rsa_y_ms    False
rsa_p_ms    False
dtype: bool

In [40]:
# Restrict the baseline rows to the retained ids, then report the resulting
# shape and whether any column contains missing values.
base = bas.loc[bas['id'].isin(subs_use)]
print(base.shape)
base.isna().any()

(33319, 5)


id         False
epoch      False
study      False
rsa_y_b    False
rsa_p_b    False
dtype: bool

### Select epochs of interest 
* Stress: <=300
* Base: <=150

In [41]:
# Keep only the epochs of interest for each condition.
stress = stress[stress.epoch <= 300]
base = base[base.epoch <= 150]

# Participants were selected by row count, whereas the trim above filters on
# the epoch value. Verify the two agree: every id must be left with exactly
# one row per epoch in the window. Gaps or repeated epoch numbers would
# otherwise produce a silently ragged panel.
assert not stress.duplicated(['id', 'epoch']).any(), \
    'stress: repeated (id, epoch) rows'
assert stress.groupby('id').size().eq(300).all(), \
    'stress: some ids do not have exactly 300 epochs'
assert not base.duplicated(['id', 'epoch']).any(), \
    'base: repeated (id, epoch) rows'
assert base.groupby('id').size().eq(150).all(), \
    'base: some ids do not have exactly 150 epochs'

In [42]:
# Shapes after trimming: stress first, then baseline, one per line.
print(stress.shape, base.shape, sep='\n')

(40200, 5)
(20100, 5)


In [44]:
# Largest remaining epoch value in each condition (stress, baseline).
stress['epoch'].max(), base['epoch'].max()

(300, 150)

In [46]:
# Number of distinct ids remaining in each condition (stress, baseline).
print('Sample Size')
stress['id'].nunique(), base['id'].nunique()

Sample Size


(134, 134)

### Save Cleaned data

In [45]:
# Write the trimmed stress and baseline tables to CSV without the index.
stress.to_csv(path_or_buf='ydi_rsa_stress_clean.csv', index=False)
base.to_csv(path_or_buf='ydi_rsa_baseline_clean.csv', index=False)