In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.decomposition import FactorAnalysis

In [2]:
# Read in processed data.
df = pd.read_csv('summary_spring2020.csv')
print(df.columns)  # columns as stored in the CSV, before derived variables are added

# Transformed values of relevant variables (both derived from eta0).
# NOTE(review): np.log gives -inf/NaN for non-positive input -- confirm eta0 > 0 in this data.
df = df.assign(sqrt_eta0 = np.sqrt(df['eta0']),
               log_eta0 = np.log(df['eta0']))

df_good = df.loc[df.good_perf].copy() # only participants who pass the performance criterion

# Z-score the analysis variables. The mean and standard deviation come from the
# full sample (df), not only from the rows that pass the performance criterion.
norm_vars = ['eta0', 'sqrt_eta0', 'log_eta0', 'threat_benign_os', 'promis_anx', 'promis_ang', 'promis_dep',  'promis_pos']
df_norm = (df[norm_vars] - df[norm_vars].mean())/df[norm_vars].std()
df_norm['good_perf'] = df['good_perf']
# Take an explicit copy (as for df_good) so that later edits to the subset
# neither touch df_norm nor trigger SettingWithCopy warnings.
df_good_norm = df_norm.loc[df_norm.good_perf].copy()
print(df.columns)  # now also lists sqrt_eta0 and log_eta0
print(df.shape[0])       # number of rows in the full data set
print(df_good.shape[0])  # number of rows passing the performance criterion

Index(['ident', 'schedule', 'tutorial_0a_last8_pct_correct',
       'tutorial_0b_last8_pct_correct', 'tutorial_0c_last8_pct_correct',
       'training_last8_pct_correct', 'transfer_last8_pct_correct', 'rel_irl',
       'threat_benign_os', 'threat_benign_ri', 'Section Sum', 'anx1', 'anx2',
       'anx3', 'anx4', 'anx5', 'anx6', 'anx7', 'anx8', 'ang1', 'ang2', 'ang3',
       'ang4', 'ang5', 'dep1', 'dep2', 'dep3', 'dep4', 'dep5', 'dep6', 'dep7',
       'dep8', 'pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6', 'pos7', 'pos8',
       'pos9', 'pos10', 'pos11', 'pos12', 'pos13', 'pos14', 'pos15',
       'promis_anx_sum', 'promis_anx_std', 'promis_anx', 'promis_ang_sum',
       'promis_ang_std', 'promis_ang', 'promis_dep_sum', 'promis_dep_std',
       'promis_dep', 'promis_pos_sum', 'promis_pos_std', 'promis_pos', 'eta0',
       'lrate', 'lrate_atn', 'metric', 'resp_scale', 'prop_log_post',
       'good_perf'],
      dtype='object')
Index(['ident', 'schedule', 'tutorial_0a_last8_pct_correct',
 

In [3]:
# Show the first rows of the lrate and lrate_atn columns.
print(df.loc[:, ['lrate', 'lrate_atn']].head())

      lrate  lrate_atn
0  0.140468   1.520684
1  0.821032   1.999000
2  0.532302   1.999000
3  0.280143   0.293351
4  0.994677   0.183114


In [4]:
print('CORRELATIONS \n')
data = df_good
# Variable pairs to correlate: x[i] is tested against y[i].
x = ['eta0'] + 3*['promis_anx'] + 2*['promis_ang'] + 2*['promis_anx']
y = ['threat_benign_os', 'threat_benign_ri'] + 2*['threat_benign_os', 'eta0'] + ['promis_ang', 'promis_pos']
assert len(x) == len(y), 'x and y must list the same number of variables'
n_rows = len(x)
# Spot check that the two lists line up as intended.
print(x[4])
print(y[4])
# Collect one record per pair and build the table once at the end. Writing
# cells with chained indexing (rtests.iloc[i]['r'] = ...) assigns to a temporary
# row under pandas Copy-on-Write and would leave the table empty.
records = []
for i in range(n_rows):
    # Pairwise deletion: drop rows missing either variable of this pair.
    data_ok = data[[x[i], y[i]]].dropna()
    x_var = data_ok[x[i]]
    y_var = data_ok[y[i]]
    (r, p) = stats.pearsonr(x_var, y_var)
    records.append({'x': x[i],
                    'y': y[i],
                    'r': np.round(r, 2),
                    'p': np.round(p, 4),
                    'n': data_ok.shape[0]})  # n = rows actually used for this pair
# r and p are now numeric columns (not object), so they print with uniform precision.
rtests = pd.DataFrame(records, columns = ['x', 'y', 'r', 'p', 'n'])
print(rtests)

CORRELATIONS 

promis_ang
threat_benign_os
            x                 y     r       p   n
0        eta0  threat_benign_os  0.21  0.0497  89
1  promis_anx  threat_benign_ri -0.09  0.3987  89
2  promis_anx  threat_benign_os -0.05  0.6672  89
3  promis_anx              eta0  0.05  0.6657  89
4  promis_ang  threat_benign_os -0.03  0.7921  87
5  promis_ang              eta0  0.06  0.5532  87
6  promis_anx        promis_ang   0.6       0  87
7  promis_anx        promis_pos -0.46       0  88


In [5]:
# Show the first rows of eta0 next to the ident column.
# NOTE(review): ident values end up in the saved output -- confirm they are safe to share.
print(df.loc[:, ['eta0', 'ident']].head())

       eta0      ident
0  0.815658  0222rehom
1  4.582733  0111gariv
2  1.245213  0128meher
3  1.515109  0410legra
4  1.034777  0510shfai


In [6]:
# Mean of the prop_log_post column over all rows of df.
print(df['prop_log_post'].mean())

-82.88648518951628
