In [1]:
import pandas as pd

# Input CSV and the path the filtered copy is written to
file_path = '../data/selfReported/backgroundDemographic.csv'
filtered_file_path = '../data/selfReported/backgroundDemographic_AB_LBW.csv'

# Values of `groupNum` whose rows are excluded from the filtered copy
groups_to_remove = [2, 3, 6, 8, 11, 12, 13, 14, 15, 18, 22, 24]

# Read the full table
df = pd.read_csv(file_path)

# Boolean mask: True for every row whose groupNum is NOT in the removal list
keep_row = ~df['groupNum'].isin(groups_to_remove)
filtered_df = df.loc[keep_row]

# Persist the remaining rows; index=False keeps the row index out of the file
filtered_df.to_csv(filtered_file_path, index=False)

print(f"Filtered data saved to: {filtered_file_path}")


Filtered data saved to: ../data/selfReported/backgroundDemographic_AB_LBW.csv


In [2]:
import pandas as pd
import numpy as np
from scipy import stats

def load_and_validate_data(file_path):
    """
    Read a CSV file into a DataFrame and report how many rows it holds.

    Despite the name, no checks beyond what ``pd.read_csv`` itself does are
    performed here; the function only loads the file and prints the row
    count as the number of participants.

    Parameters
    ----------
    file_path : path to the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the file's contents.
    """
    participants = pd.read_csv(file_path)
    n_rows = participants.shape[0]
    print(f"Total number of participants: {n_rows}")
    return participants

def analyze_gender_distribution(df):
    """
    Print how many participants fall into each value of the 'gender' column.

    One line is printed per distinct value, with its count and its share in
    percent (one decimal). ``value_counts`` leaves missing values out by
    default, so the percentages are relative to the non-missing entries.

    Parameters
    ----------
    df : DataFrame containing a 'gender' column.

    Returns
    -------
    None; the result is written to stdout.
    """
    genders = df['gender']
    counts = genders.value_counts()
    shares = genders.value_counts(normalize=True).mul(100)

    print("\nGender Distribution:")
    for label, n_people in counts.items():
        # Look the share up by label rather than relying on matching order
        print(f"{label}: {n_people} participants ({shares[label]:.1f}%)")

def analyze_age(df):
    """
    Print the mean, standard deviation and min-max range of the 'age' column.

    Mean and SD are shown with two decimals; the SD comes from
    ``Series.std`` with its default arguments. Minimum and maximum are
    printed as returned by pandas.

    Parameters
    ----------
    df : DataFrame containing an 'age' column.

    Returns
    -------
    None; the result is written to stdout.
    """
    ages = df['age']
    average, spread = ages.mean(), ages.std()

    print("\nAge Statistics:")
    print(f"Mean: {average:.2f} years (SD = {spread:.2f})")

    youngest, oldest = ages.min(), ages.max()
    print(f"Range: {youngest} - {oldest} years")

def analyze_education(df):
    """
    Print how many participants report each value of 'educationLevel'.

    One line is printed per distinct value, with its count and its share in
    percent (one decimal). ``value_counts`` leaves missing values out by
    default, so the percentages are relative to the non-missing entries.

    Parameters
    ----------
    df : DataFrame containing an 'educationLevel' column.

    Returns
    -------
    None; the result is written to stdout.
    """
    levels = df['educationLevel']
    counts = levels.value_counts()
    shares = levels.value_counts(normalize=True).mul(100)

    print("\nEducation Level Distribution:")
    for level, n_people in counts.items():
        # Look the share up by label rather than relying on matching order
        print(f"{level}: {n_people} participants ({shares[level]:.1f}%)")

def analyze_experience_measures(df):
    """
    Print mean and SD for each experience column, plus the median for all
    columns except 'workYear'.

    The columns read are 'workYear', 'groupDecisionExperience',
    'onlineCoworkExperience' and 'Aiexperience'; each is printed under the
    human-readable heading mapped to it below.

    Parameters
    ----------
    df : DataFrame containing the four columns listed above.

    Returns
    -------
    None; the result is written to stdout.
    """
    headings = {
        'workYear': 'Professional Work Experience (years)',
        'groupDecisionExperience': 'Group Decision-Making Experience (1-7)',
        'onlineCoworkExperience': 'Online Collaboration Experience (1-7)',
        'Aiexperience': 'AI Experience (1-7)'
    }

    print("\nExperience Measures:")
    for column, heading in headings.items():
        values = df[column]
        average = values.mean()
        spread = values.std()

        print(f"\n{heading}:")
        print(f"Mean = {average:.2f} (SD = {spread:.2f})")

        # Only the columns labelled as 1-7 scales also get a median line
        if column == 'workYear':
            continue
        print(f"Median = {values.median()}")

def analyze_ai_group_usage(df):
    """
    Print how many participants gave each response in 'usedAI_inGroup'.

    One line is printed per distinct response, with its count and its share
    in percent (one decimal). ``value_counts`` leaves missing values out by
    default, so the percentages are relative to the non-missing entries.

    Parameters
    ----------
    df : DataFrame containing a 'usedAI_inGroup' column.

    Returns
    -------
    None; the result is written to stdout.
    """
    responses = df['usedAI_inGroup']
    counts = responses.value_counts()
    shares = responses.value_counts(normalize=True).mul(100)

    print("\nAI Use in Group Context:")
    for answer, n_people in counts.items():
        # Look the share up by label rather than relying on matching order
        print(f"{answer}: {n_people} participants ({shares[answer]:.1f}%)")

def main(file_path='../data/selfReported/backgroundDemographic_AB_LBW.csv'):
    """
    Load the demographic CSV and print every summary in sequence.

    Parameters
    ----------
    file_path : path of the CSV to summarise. Defaults to the filtered
        file this function previously had hard-coded, so calling ``main()``
        with no arguments behaves exactly as before; pass another path to
        run the same report on a different file.

    Returns
    -------
    None; all results are written to stdout.
    """
    # Load data
    df = load_and_validate_data(file_path)

    # Run all analyses, in the order the report is printed
    analyze_gender_distribution(df)
    analyze_age(df)
    analyze_education(df)
    analyze_experience_measures(df)
    analyze_ai_group_usage(df)

# Run the full report only when this code executes as the top-level program
# (`__name__` is "__main__"); importing these definitions does not trigger it.
if __name__ == "__main__":
    main()

Total number of participants: 48

Gender Distribution:
F: 31 participants (64.6%)
M: 17 participants (35.4%)

Age Statistics:
Mean: 26.17 years (SD = 4.54)
Range: 19 - 39 years

Education Level Distribution:
Bachelor's degree: 24 participants (50.0%)
Master's Degree: 10 participants (20.8%)
High school or equivalent: 7 participants (14.6%)
Some college: 6 participants (12.5%)
doctorate degree: 1 participants (2.1%)

Experience Measures:

Professional Work Experience (years):
Mean = 2.18 (SD = 2.66)

Group Decision-Making Experience (1-7):
Mean = 4.85 (SD = 1.44)
Median = 5.0

Online Collaboration Experience (1-7):
Mean = 4.02 (SD = 1.82)
Median = 4.0

AI Experience (1-7):
Mean = 4.58 (SD = 1.40)
Median = 5.0

AI Use in Group Context:
Y: 26 participants (54.2%)
N: 22 participants (45.8%)
