In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.anova import anova_lm

In [7]:
extraction_dir = './survey-dataset/'

# List the files in the directory
extracted_files = os.listdir(extraction_dir)
extracted_files

['.DS_Store',
 'transformed_data.csv',
 'data.csv',
 'transformed_task_wise.csv',
 'Task-wise.csv',
 'correlation_matrix.csv']

In [24]:
# Load the task wise data
data = pd.read_csv(os.path.join(extraction_dir, 'transformed_task_wise.csv'))
print(data.columns)

Index(['id', 'task', 'condition', 'ocd-help', 'ocd-performance',
       'ocd-productivity', 'ocd-effectiveness', 'ocd-easiness',
       'ocd-usefulness', 'learn-ocd', 'ocd-tasks-help', 'ocd-interaction',
       'ocd-skills', 'ocd-debugging', 'ocd-use', 'ocd-future',
       'ocd-over-classical', 'education', 'job-position', 'program-exp',
       'pharo-exp', 'code-frequency', 'pharo-frequency',
       'debugger-familiarity', 'debugger-ocd-familarity', 'correctness',
       'validity', 'time_in_seconds', 'answer', 'actions', 'task-easiness',
       'task-length', 'task-interruption', 'bug-found', 'bug-description',
       'debugger-help', 'debugger-enjoy', 'debugger-efficient',
       'debugger-intuitive', 'debugger-easiness', 'debugger-learn'],
      dtype='object')


In [25]:

# Calculate the correlation matrix for numerical columns
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = data[numerical_columns].corr()

# Display the correlation matrix
print(correlation_matrix)

# Optionally, save the correlation matrix to a CSV file for easier inspection
correlation_matrix.to_csv(f"{extraction_dir}/correlation_matrix.csv")  # Replace with the desired path to save the CSV file

                         ocd-help  ocd-performance  ocd-productivity  \
ocd-help                 1.000000         0.963101          0.792467   
ocd-performance          0.963101         1.000000          0.805819   
ocd-productivity         0.792467         0.805819          1.000000   
ocd-effectiveness        0.862459         0.843409          0.890249   
ocd-easiness             0.819608         0.797322          0.838385   
ocd-usefulness           0.862711         0.847093          0.866599   
learn-ocd                0.825849         0.770132          0.811808   
ocd-tasks-help           0.799168         0.766508          0.807118   
ocd-interaction          0.780117         0.745041          0.804381   
ocd-skills               0.776315         0.752496          0.789297   
ocd-debugging            0.740999         0.729257          0.797118   
ocd-use                  0.742677         0.708832          0.767487   
ocd-future               0.791515         0.794666          0.82

In [26]:
# Sanitize column names to avoid special characters issues
data.columns = data.columns.str.replace('-', '_').str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

# Print the columns to inspect
print("Columns in the DataFrame:")
print(data.columns)

Columns in the DataFrame:
Index(['id', 'task', 'condition', 'ocd_help', 'ocd_performance',
       'ocd_productivity', 'ocd_effectiveness', 'ocd_easiness',
       'ocd_usefulness', 'learn_ocd', 'ocd_tasks_help', 'ocd_interaction',
       'ocd_skills', 'ocd_debugging', 'ocd_use', 'ocd_future',
       'ocd_over_classical', 'education', 'job_position', 'program_exp',
       'pharo_exp', 'code_frequency', 'pharo_frequency',
       'debugger_familiarity', 'debugger_ocd_familarity', 'correctness',
       'validity', 'time_in_seconds', 'answer', 'actions', 'task_easiness',
       'task_length', 'task_interruption', 'bug_found', 'bug_description',
       'debugger_help', 'debugger_enjoy', 'debugger_efficient',
       'debugger_intuitive', 'debugger_easiness', 'debugger_learn'],
      dtype='object')


In [28]:
# Define the independent variables
independent_vars = ['condition', 'task', 'education', 'job_position', 'program_exp', 'pharo_exp', 
                    'code_frequency', 'pharo_frequency', 'debugger_familiarity', 'debugger_ocd_familarity', 
                    'task_easiness', 'task_length', 'task_interruption']

# Ensure the dependent variables are numeric
data['correctness'] = pd.to_numeric(data['correctness'], errors='coerce')
data['time_in_seconds'] = pd.to_numeric(data['time_in_seconds'], errors='coerce')
data['actions'] = pd.to_numeric(data['actions'], errors='coerce')

# Drop the user_id column if it exists
if 'id' in data.columns:
    data = data.drop(columns=['id'])

# Check if all independent variables exist in the DataFrame
missing_columns = [col for col in independent_vars if col not in data.columns]
if missing_columns:
    print(f"The following columns are missing from the DataFrame: {missing_columns}")