In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta

In [3]:
extraction_dir = './survey-dataset/'

# List the files in the directory
extracted_files = os.listdir(extraction_dir)
extracted_files

['experiment-feedback.csv',
 'lightsout-treatment.csv',
 'Pharo-experinece-control.png',
 'lightsout-control.csv',
 'data.csv',
 'ammolite-control.csv',
 'demographic-information.csv',
 'lightsout-control-modified.csv',
 'Gen-dev-experience.png',
 'ammolite-treatment.csv']

In [4]:
file_path = os.path.join(extraction_dir, 'data.csv')
print(file_path)

./survey-dataset/data.csv


In [5]:
import pandas as pd

# Read the CSV file
df = pd.read_csv(file_path)

# Print all columns
print(df.columns)

Index(['id', 'control task name', 'control task correctness',
       'control task validity', 'control task time in seconds',
       'control task answer', 'control task actions', 'ct-task-easiness',
       'ct-task-length', 'ct-task-interruption', 'ct-bug-found',
       'ct-bug-description', 'ct-debugger-help', 'ct-debugger-enjoy',
       'ct-debugger-efficient', 'ct-debugger-intuitive',
       'ct-debugger-easiness', 'ct-debugger-learn', 'treatment task name',
       'treatment task correctness', 'treatment task validity',
       'treatment task time in seconds', 'treatment task answer',
       'treatment task actions', 'tt-task-easiness', 'tt-task-length',
       'tt-task-interruption', 'tt-bug-found', 'tt-bug-description',
       'tt-debugger-help', 'tt-debugger-enjoy', 'tt-debugger-efficient',
       'tt-debugger-intuitive', 'tt-debugger-easiness', 'tt-debugger-learn',
       'experiment-easiness', 'experiment-length', 'ocd-help',
       'ocd-performance', 'ocd-productivity', 'ocd

In [20]:
data = pd.read_csv(file_path)

# Define columns to be extracted for control and treatment tasks
user_specific_columns = [
    'id', 'ocd-help', 'ocd-performance', 'ocd-productivity', 'ocd-effectiveness', 'ocd-easiness', 'ocd-usefulness',
    'learn-ocd', 'ocd-tasks-help', 'ocd-interaction', 'ocd-skills', 'ocd-debugging', 'ocd-use', 'ocd-future', 
    'ocd-over-classical', 'education', 'job-position', 'program-exp', 'pharo-exp', 'code-frequency', 
    'pharo-frequency', 'debugger-familiarity', 'debugger-ocd-familarity'
]

control_columns = user_specific_columns + ['control task name', 'control task correctness', 'control task validity', 
                                           'control task time in seconds', 'control task answer', 'control task actions'] + \
                  [col for col in data.columns if col.startswith('ct-')]

treatment_columns = user_specific_columns + ['treatment task name', 'treatment task correctness', 'treatment task validity', 
                                             'treatment task time in seconds', 'treatment task answer', 'treatment task actions'] + \
                    [col for col in data.columns if col.startswith('tt-')]

# Create DataFrame for control tasks
control_data = data[control_columns].copy()
control_data['task'] = control_data['control task name']
control_data['condition'] = 'control'

# Create DataFrame for treatment tasks
treatment_data = data[treatment_columns].copy()
treatment_data['task'] = treatment_data['treatment task name']
treatment_data['condition'] = 'treatment'

# Standardize column names for merging
control_data.columns = user_specific_columns + ['task_name', 'correctness', 'validity', 'time_in_seconds', 'answer', 'actions'] + \
                       [col.replace('ct-', '') for col in control_data.columns[len(user_specific_columns)+6:-2]] + ['task', 'condition']

treatment_data.columns = user_specific_columns + ['task_name', 'correctness', 'validity', 'time_in_seconds', 'answer', 'actions'] + \
                         [col.replace('tt-', '') for col in treatment_data.columns[len(user_specific_columns)+6:-2]] + ['task', 'condition']

# Remove the 'task_name' column as it is now redundant
control_data = control_data.drop(columns=['task_name'])
treatment_data = treatment_data.drop(columns=['task_name'])

# Combine the control and treatment DataFrames
combined_data = pd.concat([control_data, treatment_data], ignore_index=True)

# Save the transformed data to a new CSV file
combined_data.to_csv(f"{extraction_dir}/transformed_data.csv", index=False, quoting=1)

# Display the first few rows of the transformed data
combined_data.head()

Unnamed: 0,id,ocd-help,ocd-performance,ocd-productivity,ocd-effectiveness,ocd-easiness,ocd-usefulness,learn-ocd,ocd-tasks-help,ocd-interaction,...,bug-found,bug-description,debugger-help,debugger-enjoy,debugger-efficient,debugger-intuitive,debugger-easiness,debugger-learn,task,condition
0,User-70721c71-20d5-0d00-a35f-0b230f90ea5645210...,,,,,,,,,,...,,,,,,,,,Lights Out,control
1,User-88a6276f-6bd5-0d00-a386-0d050eac3a2a50500...,Nor disagree or agree,Nor disagree or agree,Disagree,Nor disagree or agree,Nor disagree or agree,Nor disagree or agree,Nor disagree or agree,No choice,No choice,...,No,,No choice,No choice,No choice,No choice,No choice,No choice,Lights Out,control
2,User-a2904209-ccc2-0d00-9778-6c840e6ed7ae51481...,Agree,Agree,Agree,Agree,Agree,Agree,Nor disagree or agree,Agree,Agree,...,Yes,One of the corner switches could not be switch...,Normally,Agree,Strongly Agree,Strongly Agree,Agree,Agree,Lights Out,control
3,User-20d31f99-86c4-0d00-a363-69a60df93b8251470...,Strongly Agree,Agree,Agree,Strongly Agree,Nor disagree or agree,Agree,Strongly Agree,Agree,Strongly Agree,...,Yes,wrong data set with a not necessary space char...,Moderately,Agree,Nor disagree or agree,Disagree,Disagree,No choice,Ammolite,control
4,User-4a048f49-eec8-0d00-89f7-047b0e96ebda52859...,No choice,No choice,No choice,No choice,No choice,No choice,No choice,No choice,No choice,...,No,,No choice,No choice,No choice,No choice,No choice,No choice,Ammolite,control


In [49]:
# Load the data
transformed_data = pd.read_csv(os.path.join(extraction_dir, 'Task-wise.csv'))  
transformed_data.head()

# Trim leading and trailing whitespace from all string columns using apply and map
transformed_data = transformed_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)


In [55]:
# List of columns to inspect based on the presence
columns_to_inspect = [
    'ocd-help', 'ocd-performance', 'ocd-productivity', 'ocd-effectiveness', 'ocd-easiness',
    'ocd-usefulness', 'learn-ocd', 'ocd-tasks-help', 'ocd-interaction', 'ocd-skills', 'ocd-debugging',
    'ocd-use', 'ocd-future', 'ocd-over-classical', 'education', 'job-position', 'program-exp', 
    'pharo-exp', 'code-frequency', 'pharo-frequency', 'debugger-familiarity', 'debugger-ocd-familarity', 
    'task-easiness', 'task-length', 'task-interruption', 'bug-found', 'debugger-help', 'debugger-enjoy', 
    'debugger-efficient', 'debugger-intuitive', 'debugger-easiness', 'debugger-learn'
]

# Inspect unique values for each column in the list
unique_values = {col: transformed_data[col].unique() if col in transformed_data.columns else f"{col} not found" for col in columns_to_inspect}
unique_values

{'ocd-help': array(['', 'Nor disagree or agree', 'Agree', 'Strongly Agree',
        'No choice', 'Disagree'], dtype=object),
 'ocd-performance': array(['', 'Nor disagree or agree', 'Agree', 'No choice',
        'Strongly Agree'], dtype=object),
 'ocd-productivity': array(['', 'Disagree', 'Agree', 'No choice', 'Nor disagree or agree',
        'Strongly Agree'], dtype=object),
 'ocd-effectiveness': array(['', 'Nor disagree or agree', 'Agree', 'Strongly Agree',
        'No choice'], dtype=object),
 'ocd-easiness': array(['', 'Nor disagree or agree', 'Agree', 'No choice',
        'Strongly Agree', 'Disagree'], dtype=object),
 'ocd-usefulness': array(['', 'Nor disagree or agree', 'Agree', 'No choice',
        'Strongly Agree', 'Disagree'], dtype=object),
 'learn-ocd': array(['', 'Nor disagree or agree', 'Strongly Agree', 'No choice',
        'Agree', 'Disagree'], dtype=object),
 'ocd-tasks-help': array(['', 'No choice', 'Agree', 'Nor disagree or agree', 'Disagree',
        'Strongly Agree']

In [69]:
qualitative_mapping = {
    "Strongly Agree": 5,
    "Agree": 4,
    "Nor disagree or agree": 3,
    "Disagree": 2,
    "Strongly Disagree": 1,
    "No choice": 0,
    "": -1  # Mapping empty values to -1
}

education_mapping = {
    "": -1,  # Mapping empty values to -1
    "No choice": 0,
    "High School but did not graduate": 0,
    "High School": 1,
    "High School graduate or GED": 1,
    "Some college or 2-year degree": 2,
    "Bachelor in STEM (Science, Technology, Engineering and Mathematics)": 3,
    "Bachelor in CS (Computer Science)": 3,
    "Master in CS": 4,
    "PhD or higher academic title in STEM": 5,
    "PhD or higher academic title in CS": 5,
    "PhD or higher academic title in a different area than CS or STEM": 5
}

experience_mapping = {
    "": -1,  # Mapping empty values to -1
    "No choice": 0,
    "< 1": 1,
    "1-2 years": 2,
    "3-5 years": 3,
    "6-10 years": 4,
    "More than 10":5,
    "More than 10 years": 5
}

frequency_mapping = {
    "": -1,  # Mapping empty values to -1
    "No choice": 0,
    "At least once per year":1,
    "Less than once per month": 1,
    "At least once per month": 2,
    "At least once per week": 3,
    "At least once per day": 4,
}

interruption_mapping = {
    "": -1,  # Mapping empty values to -1
    "No choice": 0,
    '1-2 minutes': 1,
    'up to 5 minutes': 2,
    'up to 10 minutes': 3,
    'more than 10 minutes': 4,
    "Not interrupted": 5,
}

job_position_mapping = {
    "": -1,  # Mapping empty values to -1
    "No choice": 0,
    "Student": 1,
    "Full-Time Dev": 2,
    "Part-Time Dev": 3,
    "Unemployed": 4,
    "No choice": 5,
    "Self-Employed": 6
}

bug_found_mapping = {
    "": -1,  # Mapping empty values to -1
    "No choice": 0,
    "Yes": 1,
    "No": 2
}

debugger_help_mapping = {
    "Not at all": 1,
    "Slightly": 2,
    "Moderately": 3,
    "Normally": 4,
    "Extremely": 5,
    "No choice": 0,
    "": -1  # Mapping empty values to -1
}

In [70]:
# Replace empty strings with NaNs and then fill NaNs with default values for specific columns
transformed_data = transformed_data.replace("", pd.NA).fillna({
    'ocd-help': "No choice",
    'ocd-performance': "No choice",
    'ocd-productivity': "No choice",
    'ocd-effectiveness': "No choice",
    'ocd-easiness': "No choice",
    'ocd-usefulness': "No choice",
    'learn-ocd': "No choice",
    'ocd-tasks-help': "No choice",
    'ocd-interaction': "No choice",
    'ocd-skills': "No choice",
    'ocd-debugging': "No choice",
    'ocd-use': "No choice",
    'ocd-future': "No choice",
    'ocd-over-classical': "No choice",
    'education': "No choice",
    'job-position': "No choice",
    'program-exp': "No choice",
    'pharo-exp': "No choice",
    'code-frequency': "No choice",
    'pharo-frequency': "No choice",
    'debugger-familiarity': "No choice",
    'debugger-ocd-familarity': "No choice",
    'task-easiness': "No choice",
    'task-length': "No choice",
    'task-interruption': "No choice",
    'validity': "No choice",
    'correctness': "No choice",
    'bug-found': "No choice",
    'debugger-help': "No choice",
    'debugger-enjoy': "No choice",
    'debugger-efficient': "No choice",
    'debugger-intuitive': "No choice",
    'debugger-easiness': "No choice",
    'debugger-learn': "No choice"
})

In [71]:
# Apply the mappings to the dataset
transformed_data.replace({
    'ocd-help': qualitative_mapping,
    'ocd-performance': qualitative_mapping,
    'ocd-productivity': qualitative_mapping,
    'ocd-effectiveness': qualitative_mapping,
    'ocd-easiness': qualitative_mapping,
    'ocd-usefulness': qualitative_mapping,
    'learn-ocd': qualitative_mapping,
    'ocd-tasks-help': qualitative_mapping,
    'ocd-interaction': qualitative_mapping,
    'ocd-skills': qualitative_mapping,
    'ocd-debugging': qualitative_mapping,
    'ocd-use': qualitative_mapping,
    'ocd-future': qualitative_mapping,
    'ocd-over-classical': qualitative_mapping,
    'education': education_mapping,
    'job-position': job_position_mapping,
    'program-exp': experience_mapping,
    'pharo-exp': experience_mapping,
    'code-frequency': frequency_mapping,
    'pharo-frequency': frequency_mapping,
    'debugger-familiarity': qualitative_mapping,
    'debugger-ocd-familarity': qualitative_mapping,
    'task-easiness': qualitative_mapping,
    'task-length': qualitative_mapping,
    'task-interruption': interruption_mapping,
    'validity': bug_validity_mapping,
    'correctness': bug_validity_mapping,
    'bug-found': bug_found_mapping,
    'debugger-help': debugger_help_mapping,
    'debugger-enjoy': qualitative_mapping,
    'debugger-efficient': qualitative_mapping,
    'debugger-intuitive': qualitative_mapping,
    'debugger-easiness': qualitative_mapping,
    'debugger-learn': qualitative_mapping
}, inplace=True)

# Save the transformed data to a new CSV file for further analysis
transformed_data.to_csv(f"{extraction_dir}/transformed_task_wise.csv", index=False, quoting=1)

# Display the first few rows to confirm the changes
print(transformed_data.head())

                                                  id        task condition  \
0  User-70721c71-20d5-0d00-a35f-0b230f90ea5645210...  Lights Out   control   
1  User-88a6276f-6bd5-0d00-a386-0d050eac3a2a50500...  Lights Out   control   
2  User-a2904209-ccc2-0d00-9778-6c840e6ed7ae51481...  Lights Out   control   
3  User-20d31f99-86c4-0d00-a363-69a60df93b8251470...    Ammolite   control   
4  User-4a048f49-eec8-0d00-89f7-047b0e96ebda52859...    Ammolite   control   

   ocd-help  ocd-performance  ocd-productivity  ocd-effectiveness  \
0         0                0                 0                  0   
1         3                3                 2                  3   
2         4                4                 4                  4   
3         5                4                 4                  5   
4         0                0                 0                  0   

   ocd-easiness  ocd-usefulness  learn-ocd  ...  task-length  \
0             0               0          0  ...     

  transformed_data.replace({


In [75]:
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
import statsmodels.api as sm

# Calculate the correlation matrix for numerical columns
numerical_columns = transformed_data.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = transformed_data[numerical_columns].corr()

# Display the correlation matrix
print(correlation_matrix)

# Optionally, save the correlation matrix to a CSV file for easier inspection
correlation_matrix.to_csv(f"{extraction_dir}/correlation_matrix.csv")  # Replace with the desired path to save the CSV file

                         ocd-help  ocd-performance  ocd-productivity  \
ocd-help                 1.000000         0.963101          0.792467   
ocd-performance          0.963101         1.000000          0.805819   
ocd-productivity         0.792467         0.805819          1.000000   
ocd-effectiveness        0.862459         0.843409          0.890249   
ocd-easiness             0.819608         0.797322          0.838385   
ocd-usefulness           0.862711         0.847093          0.866599   
learn-ocd                0.825849         0.770132          0.811808   
ocd-tasks-help           0.799168         0.766508          0.807118   
ocd-interaction          0.780117         0.745041          0.804381   
ocd-skills               0.776315         0.752496          0.789297   
ocd-debugging            0.740999         0.729257          0.797118   
ocd-use                  0.742677         0.708832          0.767487   
ocd-future               0.791515         0.794666          0.82

In [76]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the transformed data
data = pd.read_csv(os.path.join(extraction_dir, 'transformed_task_wise.csv'))

# Define the independent variables
independent_vars = ['condition', 'task', 'education', 'job-position', 'program-exp', 'pharo-exp', 
                    'code-frequency', 'pharo-frequency', 'debugger-familiarity', 'debugger-ocd-familarity', 
                    'task-easiness', 'task-length', 'task-interruption']

# Convert categorical variables to 'category' dtype
for var in independent_vars:
    data[var] = data[var].astype('category')

# Perform OLS regression for 'correctness'
formula_correctness = f"correctness ~ {' + '.join(independent_vars)}"
model_correctness = smf.ols(formula_correctness, data=data).fit()
summary_correctness = model_correctness.summary()
summary_correctness


PatsyError: Error evaluating factor: NameError: name 'program' is not defined
    correctness ~ condition + task + education + job-position + program-exp + pharo-exp + code-frequency + pharo-frequency + debugger-familiarity + debugger-ocd-familarity + task-easiness + task-length + task-interruption
                                                                ^^^^^^^