## Pre-Processing

In [69]:
import pandas as pd
import numpy as np
import warnings
import scipy.stats as stats


# Disable all warnings
warnings.filterwarnings("ignore")

# Open the CSV file
filename = 'datasets/qualtrics/pre_intervention.csv'
pre_intervention = pd.read_csv('datasets/qualtrics/pre_intervention.csv')
post_intervention = pd.read_csv('datasets/qualtrics/post_intervention.csv')
pd.set_option('display.max_columns', None)

In [2]:
pre_intervention = pre_intervention.drop(pre_intervention.index[0])
pre_intervention = pre_intervention.drop(pre_intervention.index[0])
post_intervention = post_intervention.drop(post_intervention.index[0])
post_intervention = post_intervention.drop(post_intervention.index[0])

In [3]:
pre_intervention['StartDate'] = pd.to_datetime(pre_intervention['StartDate'],format='%Y-%m-%d %H:%M:%S')
pre_intervention['date'] = pre_intervention['StartDate'].dt.date
pre_intervention['time'] = pre_intervention['StartDate'].dt.time
pre_intervention_filtered = pre_intervention[
    (pre_intervention['date'] >= pd.to_datetime('2023-05-26').date())
]

In [4]:
post_intervention['StartDate'] = pd.to_datetime(post_intervention['StartDate'],format='%Y-%m-%d %H:%M:%S')
post_intervention['date'] = post_intervention['StartDate'].dt.date
post_intervention['time'] = post_intervention['StartDate'].dt.time
post_intervention_filtered = post_intervention[
    (post_intervention['date'] >= pd.to_datetime('2023-05-26').date()) 
]

In [5]:
merged_df = pd.merge(pre_intervention_filtered, post_intervention_filtered, on="PROLIFIC_PID").iloc[1:]

In [6]:
def compute_practical_score(answer, resp):
    return 2 if answer == resp else 0

def classify_use_frequency_category(frequency_mail):
    return 0 if frequency_mail in ["less than once a week", "approximately once a week"] else 1

def classify_mail_quantity_category(quantity_mail):
    return 0 if quantity_mail == "Less than 5" else 1

def get_points_from_self_report(self_report):
    if self_report == "very high":
            return 1
    elif self_report == "rather high":
            return 0.5
    elif self_report == "medium":
            return 0.25
    else:
        return 0


def determine_category(df,i,theory_score, practical_score, training_exp, frequency_mail, quantity_mail):
    if theory_score < 20.5:
        category = 0
        score = theory_score
    else:
        score = theory_score + practical_score
        if training_exp == "Yes, once":
            score += 1
        elif training_exp == "Yes, more than once":
            score += 2
        score += get_points_from_self_report(df["Q95_1_x"].iloc[i]) 
        score += get_points_from_self_report(df["Q95_2_x"].iloc[i]) 
        score += get_points_from_self_report(df["Q95_3_x"].iloc[i]) 
        if score < 27.5:
            category = 0 
        else:
            if score >= 39.5:
                category = 2
            else:
                if frequency_mail == 0:
                    if quantity_mail == 0:
                        category = 2
                    else:
                        category = 1
                else:
                    category = 1                       
    return category, score

In [7]:
df = pd.DataFrame(columns=['prolific id', 'theory score', 'practical score', 'total score', 'category', 'new theory score', 'new practical score', 'new total score', 'new category', 'age group', 'education level', 'IT occupation'])
for i in range(len(merged_df)):
    practical_score = compute_practical_score(merged_df["Q111_x"].iloc[i],"Yes")+compute_practical_score(merged_df["Q113_x"].iloc[i],"Yes")+compute_practical_score(merged_df["Q115_x"].iloc[i],"Yes")+compute_practical_score(merged_df["Q117_x"].iloc[i],"No")+compute_practical_score(merged_df["Q119_x"].iloc[i],"Yes")
    new_practical_score = compute_practical_score(merged_df["Q111_y"].iloc[i],"Yes")+compute_practical_score(merged_df["Q113_y"].iloc[i],"Yes")+compute_practical_score(merged_df["Q115_y"].iloc[i],"Yes")+compute_practical_score(merged_df["Q117_y"].iloc[i],"No")+compute_practical_score(merged_df["Q119_y"].iloc[i],"Yes")
    category, total_score = determine_category(merged_df,i,float(merged_df["SC0_x"].iloc[i]), practical_score, merged_df["Q94"].iloc[i], merged_df["Q67"].iloc[i], merged_df["Q69"].iloc[i])
    new_category, new_total_score = determine_category(merged_df,i,float(merged_df["SC0_y"].iloc[i]), new_practical_score, merged_df["Q94"].iloc[i], merged_df["Q67"].iloc[i], merged_df["Q69"].iloc[i])
    df = df.append({'prolific id': merged_df["PROLIFIC_PID"].iloc[i], 'theory score': float(merged_df["SC0_x"].iloc[i]), 'practical score': practical_score, 'total score': total_score, 'category': category, 'new theory score': float(merged_df["SC0_y"].iloc[i]), 'new practical score': new_practical_score, 'new total score': new_total_score, 'new category': new_category, 'age group': merged_df["Q79"].iloc[i], "education level": merged_df["Q3"].iloc[i], "IT occupation": merged_df["Q6.8"].iloc[i]}, ignore_index=True)

## Descriptive Statistics

In [29]:
#proportion of each category
value_counts = df['category'].value_counts()
value_counts / 70

1    0.571429
2    0.342857
0    0.085714
Name: category, dtype: float64

In [50]:
# proportion of each category after interventions 
value_counts = df['new category'].value_counts()
value_counts / 70

2    0.900000
1    0.085714
0    0.014286
Name: new category, dtype: float64

In [None]:
def compute_score_diff(df):
    df["total score diff"] = df['new total score'] - df['total score']
    df["theory score diff"] = df['new theory score'] - df['theory score']
    df["practical score diff"] = df['new practical score'] - df['practical score']
    df["category diff"] = df["new category"] - df["category"]
    return df

In [59]:
df = compute_score_diff(df)

In [72]:
def compute_mean_std_for_score_diff(df, variable):
    df_stats = df.groupby('category')[variable].agg(['mean', 'std'])
    confidence = 0.95
    data_grouped = df.groupby('category')[variable]
    df_stats['Confidence_Interval'] = data_grouped.apply(lambda x: stats.t.interval(confidence, len(x)-1, loc=x.mean(), scale=stats.sem(x)))
    return df_stats

In [73]:
compute_mean_std_for_score_diff(df,'total score diff')

Unnamed: 0_level_0,mean,std,Confidence_Interval
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18.533333,11.842199,"(6.105708108018311, 30.960958558648358)"
1,10.8975,4.870502,"(9.339837940266001, 12.455162059733997)"
2,6.691667,4.303479,"(4.874467052471511, 8.508866280861824)"


In [74]:
compute_mean_std_for_score_diff(df,'practical score diff')

Unnamed: 0_level_0,mean,std,Confidence_Interval
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.333333,1.505545,"(-1.913306234918613, 1.2466395682519464)"
1,0.85,1.968339,"(0.22049460551027544, 1.4795053944897245)"
2,-0.5,1.063219,"(-0.9489580369132349, -0.05104196308676512)"


In [75]:
compute_mean_std_for_score_diff(df,'theory score diff')

Unnamed: 0_level_0,mean,std,Confidence_Interval
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11.95,8.599477,"(2.9254026334658647, 20.974597366534137)"
1,10.0475,4.154947,"(8.718683536506049, 11.37631646349395)"
2,7.191667,3.886198,"(5.550669098143643, 8.832664235189691)"


In [81]:
def compute_desc_stats(df):   
    stats = df.groupby('category')['total score'].agg(['mean', 'std', 'median', 'min', 'max'])
    stats['25th Percentile'] = df.groupby('category')['total score'].quantile(0.25)
    stats['75th Percentile'] = df.groupby('category')['total score'].quantile(0.75)
    return stats

In [82]:
compute_desc_stats(df)

Unnamed: 0_level_0,mean,std,median,min,max,25th Percentile,75th Percentile
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,17.6,2.120377,18.15,13.7,19.9,17.3,18.55
1,35.1575,3.043373,36.025,28.5,39.1,32.775,37.7625
2,43.325,3.053864,42.225,39.95,50.1,40.575,45.2


## Hypothesis Testing

In [99]:
# one-sample t-test for score differences for each group
grouped_data = df.groupby('category')['total score diff']
t_statistic_0, p_value_0 = stats.ttest_1samp(grouped_data.get_group(0), popmean=0)
t_statistic_1, p_value_1 = stats.ttest_1samp(grouped_data.get_group(1), popmean=0)
t_statistic_2, p_value_2 = stats.ttest_1samp(grouped_data.get_group(2), popmean=0)

n_0 = len(grouped_data.get_group(0))
n_1 = len(grouped_data.get_group(1))
n_2 = len(grouped_data.get_group(2))

# Perform chi-square test for category changes
observed = pd.crosstab(df['category'], df['new category'])
chi2_stat, p_value, _, _ = stats.chi2_contingency(observed)

# Print the test results
print("T-Test: t({}) = {:.3f}, p = {:.3f} Group 0".format(n_0 - 1,t_statistic_0, p_value_0))
print("T-Test: t({}) = {:.3f}, p = {:.3f} Group 1".format(n_1 - 1,t_statistic_1, p_value_1))
print("T-Test: t({}) = {:.3f}, p = {:.3f} Group 2".format(n_2 - 1,t_statistic_2, p_value_2))

print("Chi-Square Test: chi2 = {:.3f}, p = {:.3f}".format(chi2_stat, p_value))

T-Test: t(5) = 3.834, p = 0.012 Group 0
T-Test: t(39) = 14.151, p = 0.000 Group 1
T-Test: t(23) = 7.618, p = 0.000 Group 2
Chi-Square Test: chi2 = 18.444, p = 0.001


In [107]:
def  ind_2_samp_t_test(df, cat1, cat2, n1, n2):
    category_0_scores = df.loc[df['category'] == cat1, 'total score diff']
    category_1_scores = df.loc[df['category'] == cat2, 'total score diff']
    
    # Calculate degrees of freedom
    degf = n1 + n2 - 2

    # Perform independ two-sample t-test
    print("Independent 2-Sample t-Test: t({}) = {:.3f}, p = {:.3f}".format(degf,t_statistic,p_value))

In [108]:
ind_2_samp_t_test(df,0,1,n_0,n_1)
ind_2_samp_t_test(df,1,2,n_1,n_2)
ind_2_samp_t_test(df,0,2,n_0,n_2)

Independent 2-Sample t-Test: t(44) = 1.560, p = 0.001
Independent 2-Sample t-Test: t(62) = 1.560, p = 0.001
Independent 2-Sample t-Test: t(28) = 1.560, p = 0.001


In [123]:
df

'56-65'

In [130]:
df

Unnamed: 0,prolific id,theory score,practical score,total score,category,new theory score,new practical score,new total score,new category,age group,education level,IT occupation,total score diff,category diff,theory score diff,practical score diff,age_group_code,age_group_numeric
0,5b295f60bbd4880001117976,26.6,8,38.10,1,40.3,8,51.80,2,46-55,University degree (Bachelor/Master),"No, other education or occupation",13.70,1,13.7,0,3,4
1,63d543225e315b94bff0022a,31.3,8,40.30,2,39.7,8,48.70,2,26-35,University degree (Bachelor/Master),"No, other education or occupation",8.40,0,8.4,0,1,2
2,576fc3462984e70001a97fc3,28.1,8,37.85,1,44.1,8,53.85,2,26-35,University degree (Bachelor/Master),"No, other education or occupation",16.00,1,16.0,0,1,2
3,60173c3dd4e20d98d894aa6d,32.7,8,44.20,2,44.5,8,56.00,2,26-35,University degree (Bachelor/Master),"Yes, studies in Computer Science, IT Security,...",11.80,0,11.8,0,1,2
4,63d4dabe9f1dec54b04cc331,30.8,6,37.80,1,37.3,6,44.30,2,46-55,University degree (Bachelor/Master),"No, other education or occupation",6.50,1,6.5,0,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,62dd66998ed529ec120dcb58,28.8,6,36.30,1,43.1,10,54.60,2,36-45,University degree (Bachelor/Master),"No, other education or occupation",18.30,1,14.3,4,2,3
66,644316752492a25dfe0aa2e1,24.4,8,32.90,1,38.8,4,43.30,2,46-55,University degree (Bachelor/Master),"No, other education or occupation",10.40,1,14.4,-4,3,4
67,616310e41e98f007d82e2c2b,24.8,6,31.30,1,43.5,8,52.00,2,26-35,Secondary school diploma,"No, other education or occupation",20.70,1,18.7,2,1,2
68,5fd79eee9feae6104c4e77c6,19.9,6,19.90,0,38.7,8,46.95,2,26-35,University degree (Bachelor/Master),"No, other education or occupation",27.05,2,18.8,2,1,2


In [133]:
df

Unnamed: 0,prolific id,theory score,practical score,total score,category,new theory score,new practical score,new total score,new category,age group,education level,IT occupation,total score diff,category diff,theory score diff,practical score diff,age_group_code,age_group_numeric
0,5b295f60bbd4880001117976,26.6,8,38.10,1,40.3,8,51.80,2,46-55,University degree (Bachelor/Master),"No, other education or occupation",13.70,1,13.7,0,3,4
1,63d543225e315b94bff0022a,31.3,8,40.30,2,39.7,8,48.70,2,26-35,University degree (Bachelor/Master),"No, other education or occupation",8.40,0,8.4,0,1,2
2,576fc3462984e70001a97fc3,28.1,8,37.85,1,44.1,8,53.85,2,26-35,University degree (Bachelor/Master),"No, other education or occupation",16.00,1,16.0,0,1,2
3,60173c3dd4e20d98d894aa6d,32.7,8,44.20,2,44.5,8,56.00,2,26-35,University degree (Bachelor/Master),"Yes, studies in Computer Science, IT Security,...",11.80,0,11.8,0,1,2
4,63d4dabe9f1dec54b04cc331,30.8,6,37.80,1,37.3,6,44.30,2,46-55,University degree (Bachelor/Master),"No, other education or occupation",6.50,1,6.5,0,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,62dd66998ed529ec120dcb58,28.8,6,36.30,1,43.1,10,54.60,2,36-45,University degree (Bachelor/Master),"No, other education or occupation",18.30,1,14.3,4,2,3
66,644316752492a25dfe0aa2e1,24.4,8,32.90,1,38.8,4,43.30,2,46-55,University degree (Bachelor/Master),"No, other education or occupation",10.40,1,14.4,-4,3,4
67,616310e41e98f007d82e2c2b,24.8,6,31.30,1,43.5,8,52.00,2,26-35,Secondary school diploma,"No, other education or occupation",20.70,1,18.7,2,1,2
68,5fd79eee9feae6104c4e77c6,19.9,6,19.90,0,38.7,8,46.95,2,26-35,University degree (Bachelor/Master),"No, other education or occupation",27.05,2,18.8,2,1,2


## Correlation Analysis

In [131]:
from scipy.stats import pearsonr

def pearson(df,mapping,variable):
    corr_coeff, p_value = pearsonr(df[variable].map(mapping), df['total score diff'])
    # Print the correlation coefficient and p-value
    print("Pearson Correlation Coefficient: {:.3f}".format(corr_coeff))
    print("p-value: {:.3f}".format(p_value))

# Convert age group to numeric codes
age_mapping = {
    "18-25": 1,
    "26-35": 2,
    "36-45": 3,
    "46-55": 4,
    "56-65": 5,
    "66-75": 6,
    ">75": 7
}
pearson(df,age_mapping,'age group')

Pearson Correlation Coefficient: 0.037
p-value: 0.759


In [140]:
from scipy.stats import f_oneway


def anova(df,mapping,variable):
    # Perform one-way ANOVA test
    grouped_data = [df[df[variable] == d]['category'] for d in mapping.keys()]
    f_statistic, p_value = f_oneway(*grouped_data)
    print("ANOVA: F = {:.3f}, p-value = {:.3f}".format(f_statistic, p_value))
anova(df,age_mapping,'age group')

ANOVA: F = 0.700, p-value = 0.625


In [129]:
from scipy.stats import f_oneway


ANOVA: F = 0.700, p-value = 0.625


## Regression Analysis

In [169]:
X

Unnamed: 0,const,age_18-25,age_26-35,age_36-45,age_46-55,age_56-65,age_66-75,group_0,group_1,group_2,education_Associate degree,education_Other:,education_PhD or similar,education_Secondary school diploma,education_University degree (Bachelor/Master),"occupation_No, other education or occupation","occupation_Yes, IT specialist, IT security specialist or similar","occupation_Yes, other IT security related education or occupation:","occupation_Yes, studies in Computer Science, IT Security, Cybersecurity or similar"
0,1.0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0
1,1.0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
2,1.0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
3,1.0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1
4,1.0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,1.0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
66,1.0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0
67,1.0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0
68,1.0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0


In [176]:
    def regression_per_category(df,category):
        # Create all dummies for regression
        X = pd.get_dummies(df[df["category"] == category]['age group'], prefix='age')
        #category_dummies = pd.get_dummies(df['category'], prefix='group')
        #education_dummies = pd.get_dummies(df['education level'], prefix='education')
        #occupation_dummies = pd.get_dummies(df['IT occupation'], prefix='occupation')

        # Add a constant term for the intercept
        X = sm.add_constant(X)

        # Define the dependent variable
        y = df[df["category"] == category]['total score diff']

        # Fit the OLS regression model
        model = sm.OLS(y, X)
        results = model.fit()

        # Print the regression results
        print(results.summary())

In [179]:
regression_per_category(df,0)

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.217
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     1.888
Date:                Fri, 09 Jun 2023   Prob (F-statistic):              0.122
Time:                        23:05:59   Log-Likelihood:                -114.68
No. Observations:                  40   AIC:                             241.4
Df Residuals:                      34   BIC:                             251.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.4478      0.695     13.600      0.0

In [178]:
regression_per_category(df,1)

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.217
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     1.888
Date:                Fri, 09 Jun 2023   Prob (F-statistic):              0.122
Time:                        23:04:29   Log-Likelihood:                -114.68
No. Observations:                  40   AIC:                             241.4
Df Residuals:                      34   BIC:                             251.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.4478      0.695     13.600      0.0

In [180]:
regression_per_category(df,2)

                            OLS Regression Results                            
Dep. Variable:       total score diff   R-squared:                       0.138
Model:                            OLS   Adj. R-squared:                 -0.043
Method:                 Least Squares   F-statistic:                    0.7624
Date:                Fri, 09 Jun 2023   Prob (F-statistic):              0.563
Time:                        23:07:05   Log-Likelihood:                -66.784
No. Observations:                  24   AIC:                             143.6
Df Residuals:                      19   BIC:                             149.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.7708      0.833      6.932      0.0

## Threshold computation for draft study

In [53]:
mean_theory_score, std_theory_score = compute_mean_std(df["theory score"])
mean_practical_score, std_practical_score = compute_mean_std(df["practical score"])
mean_total_score, std_total_score = compute_mean_std(df["total score"])

print("total score:", mean_total_score, std_total_score)
print("theory score:", mean_theory_score, std_theory_score)
print("practical score:", mean_practical_score, std_practical_score)

total score: 36.452857142857155 7.494125726738461
theory score: 28.03857142857143 4.722568697440072
practical score: 7.314285714285714 1.5449720953823742


In [54]:
threshold1_theory, threshold2_theory = compute_thresholds(mean_theory_score, std_theory_score)
threshold1_practical, threshold2_practical = compute_thresholds(mean_practical_score, std_practical_score)
threshold1_total, threshold2_total = compute_thresholds(mean_total_score, std_total_score)

print("total score:", threshold1_total, threshold2_total)
print("theory score:", threshold1_theory, threshold2_theory)
print("practical score:", threshold1_practical, threshold2_practical)

total score: 28.958731416118695 43.946982869595615
theory score: 23.31600273113136 32.761140126011504
practical score: 5.76931361890334 8.859257809668089
