## Pre-Processing

In [512]:
import pandas as pd
import numpy as np
import warnings
import scipy.stats as stats
import datetime

warnings.filterwarnings("ignore")

pre_intervention = pd.read_csv('datasets/pre_intervention.csv')
post_intervention = pd.read_csv('datasets/post_intervention.csv')

In [513]:
# Gets rid of unvalid participants
def pre_process_df(df):
    df = df.drop(df.index[0])
    df = df.drop(df.index[0])
    filtered_df = df[df['PROLIFIC_PID'].str.match(r'^[0-9a-fA-F]+$',na=False)]
    filtered_df = filtered_df[filtered_df["PROLIFIC_PID"].str.len() == 24]
    return filtered_df

In [514]:
pre_intervention_filtered = pre_process_df(pre_intervention)
post_intervention_filtered = pre_process_df(post_intervention)

In [515]:
merged_df = pd.merge(pre_intervention_filtered, post_intervention_filtered, on="PROLIFIC_PID",how="inner")

In [516]:
def compute_practical_score(answer, resp):
    return 2 if answer == resp else 0

def classify_use_frequency_category(frequency_mail):
    return 0 if frequency_mail in ["less than once a week", "approximately once a week"] else 1

def classify_mail_quantity_category(quantity_mail):
    return 0 if quantity_mail == "Less than 5" else 1

def get_points_from_self_report(self_report):
    if self_report == "very high":
            return 1
    elif self_report == "rather high":
            return 0.5
    elif self_report == "medium":
            return 0.25
    else:
        return 0


def determine_category(df,i,theory_score, practical_score, training_exp, frequency_mail, quantity_mail):
    if theory_score < 20.5:
        category = 0
        score = theory_score
    else:
        score = theory_score + practical_score
        if training_exp == "Yes, once":
            score += 1
        elif training_exp == "Yes, more than once":
            score += 2
        score += get_points_from_self_report(df["Q95_1_x"].iloc[i]) 
        score += get_points_from_self_report(df["Q95_2_x"].iloc[i]) 
        score += get_points_from_self_report(df["Q95_3_x"].iloc[i]) 
        if score < 27.5:
            category = 0 
        else:
            if score >= 39.5:
                category = 2
            else:
                if frequency_mail == 0:
                    if quantity_mail == 0:
                        category = 2
                    else:
                        category = 1
                else:
                    category = 1                       
    return category, score

In [517]:
# Create new dataset that will only contain useful measures
df = pd.DataFrame(columns=['prolific id', 'theory score', 'practical score', 'total score', 'category', 'new theory score', 'new practical score', 'new total score', 'new category', 'age group', 'education level', 'IT background','occupation', 'personality_1','personality_2','personality_3','personality_4','personality_5','personality_6','personality_7','personality_8','personality_9','personality_10'])
for i in range(len(merged_df)):
    practical_score = compute_practical_score(merged_df["Q111_x"].iloc[i],"Yes")+compute_practical_score(merged_df["Q113_x"].iloc[i],"Yes")+compute_practical_score(merged_df["Q115_x"].iloc[i],"Yes")+compute_practical_score(merged_df["Q117_x"].iloc[i],"No")+compute_practical_score(merged_df["Q119_x"].iloc[i],"Yes")
    new_practical_score = compute_practical_score(merged_df["Q111_y"].iloc[i],"Yes")+compute_practical_score(merged_df["Q113_y"].iloc[i],"Yes")+compute_practical_score(merged_df["Q115_y"].iloc[i],"Yes")+compute_practical_score(merged_df["Q117_y"].iloc[i],"No")+compute_practical_score(merged_df["Q119_y"].iloc[i],"Yes")
    category, total_score = determine_category(merged_df,i,float(merged_df["SC0_x"].iloc[i]), practical_score, merged_df["Q94"].iloc[i], merged_df["Q67"].iloc[i], merged_df["Q69"].iloc[i])
    new_category, new_total_score = determine_category(merged_df,i,float(merged_df["SC0_y"].iloc[i]), new_practical_score, merged_df["Q94"].iloc[i], merged_df["Q67"].iloc[i], merged_df["Q69"].iloc[i])
    df = df.append({'prolific id': merged_df["PROLIFIC_PID"].iloc[i], 'theory score': float(merged_df["SC0_x"].iloc[i]), 'practical score': practical_score, 'total score': total_score, 'category': category, 'new theory score': float(merged_df["SC0_y"].iloc[i]), 'new practical score': new_practical_score, 'new total score': new_total_score, 'new category': new_category, 'age group': merged_df["Q79"].iloc[i], "education level": merged_df["Q3"].iloc[i], "IT background": merged_df["Q6.8"].iloc[i], "occupation": merged_df["Q6.7"].iloc[i], 'personality_1': merged_df["personality_prov_1"].iloc[i],'personality_2': merged_df["personality_prov_2"].iloc[i],'personality_3': merged_df["personality_prov_3"].iloc[i],'personality_4': merged_df["personality_prov_4"].iloc[i],'personality_5': merged_df["personality_prov_5"].iloc[i],'personality_6': merged_df["personality_prov_6"].iloc[i],'personality_7': merged_df["personality_prov_7"].iloc[i],'personality_8': merged_df["personality_prov_8"].iloc[i],'personality_9': merged_df["personality_prov_9"].iloc[i],'personality_10': merged_df["personality_prov_10"].iloc[i]}, ignore_index=True)

## Descriptive Statistics

In [518]:
#proportion of each category
value_counts = df['category'].value_counts()
value_counts / 100

1    0.56
2    0.35
0    0.09
Name: category, dtype: float64

In [519]:
# proportion of each category after interventions 
value_counts = df['new category'].value_counts()
value_counts / 100

2    0.88
1    0.10
0    0.02
Name: new category, dtype: float64

In [520]:
def compute_score_diff(df):
    df["total score diff"] = df['new total score'] - df['total score']
    df["theory score diff"] = df['new theory score'] - df['theory score']
    df["practical score diff"] = df['new practical score'] - df['practical score']
    df["category diff"] = df["new category"] - df["category"]
    return df

In [521]:
df = compute_score_diff(df)

In [522]:
def compute_mean_std_for_score_diff(df, variable):
    df_stats = df.groupby('category')[variable].agg(['mean', 'std'])
    confidence = 0.95
    data_grouped = df.groupby('category')[variable]
    df_stats['Confidence_Interval'] = data_grouped.apply(lambda x: stats.t.interval(confidence, len(x)-1, loc=x.mean(), scale=stats.sem(x)))
    return df_stats

In [523]:
compute_mean_std_for_score_diff(df,'total score diff')

Unnamed: 0_level_0,mean,std,Confidence_Interval
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22.105556,14.476175,"(10.97818214966566, 33.23292896144545)"
1,10.853571,5.659874,"(9.337847389506427, 12.369295467636428)"
2,6.694286,4.402602,"(5.181938889006475, 8.206632539564954)"


In [524]:
compute_mean_std_for_score_diff(df,'practical score diff')

Unnamed: 0_level_0,mean,std,Confidence_Interval
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.888889,2.666667,"(-1.160892564474107, 2.9386703422518847)"
1,0.821429,2.115803,"(0.2548127623663914, 1.3880443804907514)"
2,-0.285714,1.296407,"(-0.7310458249782839, 0.1596172535497124)"


In [525]:
compute_mean_std_for_score_diff(df,'theory score diff')

Unnamed: 0_level_0,mean,std,Confidence_Interval
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15.022222,13.361023,"(4.752030539238158, 25.29241390520629)"
1,10.032143,4.737898,"(8.763325548274926, 11.300960166010787)"
2,6.98,3.872512,"(5.649745418918194, 8.310254581081805)"


In [526]:
def compute_desc_stats(df):   
    stats = df.groupby('category')['total score'].agg(['mean', 'std', 'median', 'min', 'max'])
    stats['25th Percentile'] = df.groupby('category')['total score'].quantile(0.25)
    stats['75th Percentile'] = df.groupby('category')['total score'].quantile(0.75)
    return stats

In [527]:
compute_desc_stats(df)

Unnamed: 0_level_0,mean,std,median,min,max,25th Percentile,75th Percentile
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,15.866667,6.23077,17.9,0.0,19.9,17.1,18.6
1,34.76875,3.096722,35.325,28.5,39.15,32.375,37.675
2,43.338571,2.825698,42.85,39.5,50.1,40.8,44.925


## Hypothesis Testing

In [528]:
# one-sample t-test for score differences for each group
grouped_data = df.groupby('category')['total score diff']
t_statistic_0, p_value_0 = stats.ttest_1samp(grouped_data.get_group(0), popmean=0)
t_statistic_1, p_value_1 = stats.ttest_1samp(grouped_data.get_group(1), popmean=0)
t_statistic_2, p_value_2 = stats.ttest_1samp(grouped_data.get_group(2), popmean=0)

n_0 = len(grouped_data.get_group(0))
n_1 = len(grouped_data.get_group(1))
n_2 = len(grouped_data.get_group(2))

# Perform chi-square test for category changes
observed = pd.crosstab(df['category'], df['new category'])
chi2_stat, p_value, _, _ = stats.chi2_contingency(observed)

# Print the test results
print("T-Test: t({}) = {:.3f}, p = {:.3f} Group 0".format(n_0 - 1,t_statistic_0, p_value_0))
print("T-Test: t({}) = {:.3f}, p = {:.3f} Group 1".format(n_1 - 1,t_statistic_1, p_value_1))
print("T-Test: t({}) = {:.3f}, p = {:.3f} Group 2".format(n_2 - 1,t_statistic_2, p_value_2))

print("Chi-Square Test: chi2 = {:.3f}, p = {:.3f}".format(chi2_stat, p_value))

T-Test: t(8) = 4.581, p = 0.002 Group 0
T-Test: t(55) = 14.350, p = 0.000 Group 1
T-Test: t(34) = 8.996, p = 0.000 Group 2
Chi-Square Test: chi2 = 21.169, p = 0.000


In [529]:
def  ind_2_samp_t_test(df, cat1, cat2, n1, n2):
    category_0_scores = df.loc[df['category'] == cat1, 'total score diff']
    category_1_scores = df.loc[df['category'] == cat2, 'total score diff']
    
    # Calculate degrees of freedom
    degf = n1 + n2 - 2
    
    # Perform independent two-sample t-test
    t_statistic, p_value = stats.ttest_ind(category_0_scores, category_1_scores)

    # Perform independ two-sample t-test
    print("Independent 2-Sample t-Test: t({}) = {:.3f}, p = {:.3f}".format(degf,t_statistic,p_value))

In [530]:
ind_2_samp_t_test(df,0,1,n_0,n_1)
ind_2_samp_t_test(df,1,2,n_1,n_2)
ind_2_samp_t_test(df,0,2,n_0,n_2)

Independent 2-Sample t-Test: t(63) = 4.241, p = 0.000
Independent 2-Sample t-Test: t(89) = 3.701, p = 0.000
Independent 2-Sample t-Test: t(42) = 5.530, p = 0.000


## Correlation Analysis (UNUSED)

In [531]:
from scipy.stats import f_oneway

def anova(df,mapping,variable1,variable2):
    # Perform one-way ANOVA test
    grouped_data = [df[df[variable1] == v][variable2] for v in mapping.keys()]
    f_statistic, p_value = f_oneway(*grouped_data)
    print("ANOVA: F = {:.3f}, p-value = {:.3f}".format(f_statistic, p_value))

In [532]:
age_mapping = {
    "18-25": 1,
    "26-35": 2,
    "36-45": 3,
    "46-55": 4,
    "56-65": 5,
    "66-75": 6,
    ">75": 7
}
# Imbalanced groups -> invalid ANOVA
anova(df_,age_mapping,'age group','total score diff')

ANOVA: F = nan, p-value = nan


In [533]:
merged_group = pd.concat([df_[df_['age group'] == '56-65'],
                          df_[df_['age group'] == '66-75'],
                          df_[df_['age group'] == '>75']])

merged_group['age group'] = '>56'

df_ = df_[df_['age group'] != '56-65']
df_ = df_[df_['age group'] != '66-75']
df_ = df_[df_['age group'] != '>75']

df_ = pd.concat([df_, merged_group])

In [534]:
age_mapping = {
    "18-25": 1,
    "26-35": 2,
    "36-45": 3,
    "46-55": 4,
    ">56": 5,
}

In [535]:
def compute_anovas(df,mapping,variable):
    print("Total score diff:")
    anova(df,mapping,variable,'total score diff')
    print("Category diff:")
    anova(df,mapping,variable,'category diff')
    print("Total score:")
    anova(df,mapping,variable,'total score')
    print("Category:")
    anova(df,mapping,variable,'category')

In [536]:
compute_anovas(df_,age_mapping,"age group")

Total score diff:
ANOVA: F = 0.431, p-value = 0.786
Category diff:
ANOVA: F = 0.743, p-value = 0.565
Total score:
ANOVA: F = 0.270, p-value = 0.897
Category:
ANOVA: F = 0.647, p-value = 0.631


## Regression Analysis

In [540]:
import statsmodels.api as sm

def ols_regression(df):
    X_age = pd.get_dummies(df['age group'], prefix='age')
    X_education = pd.get_dummies(df['education level'], prefix='education')
    X_background = pd.get_dummies(df['IT background'], prefix='IT background')
    X_occupation = pd.get_dummies(df['occupation'], prefix='occupation')

    X = pd.concat([X_age, X_education, X_background,X_occupation], axis=1)

    X = sm.add_constant(X)
    y = df['total score diff']
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())

In [541]:
def ols_regression_personality(df,var):
    X_personality_1 = pd.get_dummies(df['personality_1'],prefix="personality1")
    X_personality_2 = pd.get_dummies(df['personality_2'],prefix="personality2")
    X_personality_3 = pd.get_dummies(df['personality_3'],prefix="personality3")
    X_personality_4 = pd.get_dummies(df['personality_4'],prefix="personality4")
    X_personality_5 = pd.get_dummies(df['personality_5'],prefix="personality5")
    X_personality_6 = pd.get_dummies(df['personality_6'],prefix="personality6")
    X_personality_7 = pd.get_dummies(df['personality_7'],prefix="personality7")
    X_personality_8 = pd.get_dummies(df['personality_8'],prefix="personality8")
    X_personality_9 = pd.get_dummies(df['personality_9'],prefix="personality9")
    X_personality_10 = pd.get_dummies(df['personality_10'],prefix="personality10")
    X = pd.concat([X_personality_1, X_personality_2, X_personality_3, X_personality_4, X_personality_5, X_personality_6, X_personality_7, X_personality_8, X_personality_9, X_personality_10], axis=1)
    X = sm.add_constant(X)
    y = df[var]
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())

In [542]:
ols_regression(df)

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                 -0.013
Method:                 Least Squares   F-statistic:                    0.9354
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.546
Time:                        14:28:23   Log-Likelihood:                -334.48
No. Observations:                 100   AIC:                             711.0
Df Residuals:                      79   BIC:                             765.7
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                                                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [543]:
ols_regression(df[df["category"] == 0])

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                     748.3
Date:                Sun, 25 Jun 2023   Prob (F-statistic):             0.0281
Time:                        14:28:24   Log-Likelihood:                 2.2445
No. Observations:                   9   AIC:                             11.51
Df Residuals:                       1   BIC:                             13.09
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [544]:
ols_regression(df[df["category"]==1])

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.402
Model:                            OLS   Adj. R-squared:                  0.086
Method:                 Least Squares   F-statistic:                     1.272
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.260
Time:                        14:28:24   Log-Likelihood:                -161.64
No. Observations:                  56   AIC:                             363.3
Df Residuals:                      36   BIC:                             403.8
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                                                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [545]:
ols_regression(df[df["category"]==2])

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.566
Model:                            OLS   Adj. R-squared:                  0.223
Method:                 Least Squares   F-statistic:                     1.651
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.150
Time:                        14:28:24   Log-Likelihood:                -86.430
No. Observations:                  35   AIC:                             204.9
Df Residuals:                      19   BIC:                             229.7
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                                                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [546]:
ols_regression_personality(df,'total score diff')

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.300
Model:                            OLS   Adj. R-squared:                 -0.175
Method:                 Least Squares   F-statistic:                    0.6318
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.937
Time:                        14:28:25   Log-Likelihood:                -327.29
No. Observations:                 100   AIC:                             736.6
Df Residuals:                      59   BIC:                             843.4
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [547]:
df["category"] = df['category'].astype(float)
df["category diff"] = df["category diff"].astype(float)

In [548]:
ols_regression_personality(df,'total score')

                            OLS Regression Results                            
Dep. Variable:            total score   R-squared:                       0.363
Model:                            OLS   Adj. R-squared:                 -0.069
Method:                 Least Squares   F-statistic:                    0.8396
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.718
Time:                        14:28:29   Log-Likelihood:                -329.80
No. Observations:                 100   AIC:                             741.6
Df Residuals:                      59   BIC:                             848.4
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [549]:
ols_regression_personality(df,'new total score')

                            OLS Regression Results                            
Dep. Variable:        new total score   R-squared:                       0.462
Model:                            OLS   Adj. R-squared:                  0.098
Method:                 Least Squares   F-statistic:                     1.269
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.200
Time:                        14:28:29   Log-Likelihood:                -310.12
No. Observations:                 100   AIC:                             702.2
Df Residuals:                      59   BIC:                             809.1
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [550]:
ols_regression_personality(df[df["category"]==2.0],'total score diff')

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.536e+27
Date:                Sun, 25 Jun 2023   Prob (F-statistic):           2.02e-14
Time:                        14:28:29   Log-Likelihood:                 1055.6
No. Observations:                  35   AIC:                            -2043.
Df Residuals:                       1   BIC:                            -1990.
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [551]:
ols_regression_personality(df[df["category"]==1.0],'total score diff')

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.132
Method:                 Least Squares   F-statistic:                     1.219
Date:                Sun, 25 Jun 2023   Prob (F-statistic):              0.338
Time:                        14:28:30   Log-Likelihood:                -139.20
No. Observations:                  56   AIC:                             356.4
Df Residuals:                      17   BIC:                             435.4
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## Threshold computation for draft study

In [554]:
def compute_mean_std(scores):
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    return mean_score, std_score

def compute_thresholds(mean, std):
    threshold1 = mean - std
    threshold2 = mean + std
    return threshold1, threshold2

In [555]:
mean_theory_score, std_theory_score = compute_mean_std(df["theory score"])
mean_practical_score, std_practical_score = compute_mean_std(df["practical score"])
mean_total_score, std_total_score = compute_mean_std(df["total score"])

print("total score:", mean_total_score, std_total_score)
print("theory score:", mean_theory_score, std_theory_score)
print("practical score:", mean_practical_score, std_practical_score)

total score: 36.067 8.201832782982105
theory score: 27.757 5.494110574060191
practical score: 7.16 1.653602128687551


In [556]:
threshold1_theory, threshold2_theory = compute_thresholds(mean_theory_score, std_theory_score)
threshold1_practical, threshold2_practical = compute_thresholds(mean_practical_score, std_practical_score)
threshold1_total, threshold2_total = compute_thresholds(mean_total_score, std_total_score)

print("total score:", threshold1_total, threshold2_total)
print("theory score:", threshold1_theory, threshold2_theory)
print("practical score:", threshold1_practical, threshold2_practical)

total score: 27.865167217017895 44.2688327829821
theory score: 22.26288942593981 33.25111057406019
practical score: 5.5063978713124495 8.81360212868755
