# Data Summarization

In [6]:
import pandas as pd
import numpy as np
from scipy import stats

def load_and_validate_data(file_path):
    """
    Load the CSV file and perform basic data validation
    """
    df = pd.read_csv(file_path)
    print(f"Total number of participants: {len(df)}")
    return df

def analyze_gender_distribution(df):
    """
    Analyze gender distribution in the dataset
    """
    gender_dist = df['gender'].value_counts()
    gender_percentages = df['gender'].value_counts(normalize=True) * 100
    
    print("\nGender Distribution:")
    for gender in gender_dist.index:
        print(f"{gender}: {gender_dist[gender]} participants ({gender_percentages[gender]:.1f}%)")

def analyze_age(df):
    """
    Calculate age statistics
    """
    age_mean = df['age'].mean()
    age_sd = df['age'].std()
    print(f"\nAge Statistics:")
    print(f"Mean: {age_mean:.2f} years (SD = {age_sd:.2f})")
    print(f"Range: {df['age'].min()} - {df['age'].max()} years")

def analyze_education(df):
    """
    Analyze education level distribution
    """
    edu_dist = df['educationLevel'].value_counts()
    edu_percentages = df['educationLevel'].value_counts(normalize=True) * 100
    
    print("\nEducation Level Distribution:")
    for edu in edu_dist.index:
        print(f"{edu}: {edu_dist[edu]} participants ({edu_percentages[edu]:.1f}%)")

def analyze_experience_measures(df):
    """
    Analyze various experience measures
    """
    experience_cols = {
        'workYear': 'Professional Work Experience (years)',
        'groupDecisionExperience': 'Group Decision-Making Experience (1-7)',
        'onlineCoworkExperience': 'Online Collaboration Experience (1-7)',
        'Aiexperience': 'AI Experience (1-7)'
    }
    
    print("\nExperience Measures:")
    for col, description in experience_cols.items():
        mean = df[col].mean()
        sd = df[col].std()
        print(f"\n{description}:")
        print(f"Mean = {mean:.2f} (SD = {sd:.2f})")
        if col != 'workYear':  # For Likert scales
            print(f"Median = {df[col].median()}")

def analyze_ai_group_usage(df):
    """
    Analyze AI usage in group context
    """
    ai_usage = df['usedAI_inGroup'].value_counts()
    ai_percentages = df['usedAI_inGroup'].value_counts(normalize=True) * 100
    
    print("\nAI Use in Group Context:")
    for response in ai_usage.index:
        print(f"{response}: {ai_usage[response]} participants ({ai_percentages[response]:.1f}%)")

def main():
    # Load data
    df = load_and_validate_data('../data/selfReported/backgroundDemographic.csv')
    
    # Run all analyses
    analyze_gender_distribution(df)
    analyze_age(df)
    analyze_education(df)
    analyze_experience_measures(df)
    analyze_ai_group_usage(df)

if __name__ == "__main__":
    main()

Total number of participants: 96

Gender Distribution:
F: 61 participants (63.5%)
M: 35 participants (36.5%)

Age Statistics:
Mean: 26.60 years (SD = 5.21)
Range: 19 - 42 years

Education Level Distribution:
Bachelor's degree: 45 participants (46.9%)
Master's Degree: 19 participants (19.8%)
Some college: 15 participants (15.6%)
High school or equivalent: 13 participants (13.5%)
doctorate degree: 4 participants (4.2%)

Experience Measures:

Professional Work Experience (years):
Mean = 2.50 (SD = 3.15)

Group Decision-Making Experience (1-7):
Mean = 5.01 (SD = 1.41)
Median = 5.0

Online Collaboration Experience (1-7):
Mean = 4.39 (SD = 1.83)
Median = 5.0

AI Experience (1-7):
Mean = 4.83 (SD = 1.48)
Median = 5.0

AI Use in Group Context:
Y: 51 participants (53.1%)
N: 45 participants (46.9%)


# Data Visualization

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.colors import LinearSegmentedColormap


# Read the data
df = pd.read_csv('../data/selfReported/backgroundDemographic.csv')

# Set the general style
plt.style.use('seaborn-v0_8')
sns.set_theme(style="whitegrid")

# Create a muted color palette
muted_colors = sns.color_palette("Pastel1")

# 1. Education Level Distribution
plt.figure(figsize=(12, 1.5))  # Further reduced height

# Calculate percentages
education_counts = df['educationLevel'].value_counts()
education_percentages = (education_counts / len(df) * 100).round(1)

# Create a horizontal stacked bar
left = 0
ax1 = plt.gca()

# Create bars and add percentage labels with smaller font
for i, (edu, percentage) in enumerate(education_percentages.items()):
    ax1.barh(0, percentage, left=left, color=muted_colors[i], 
             label=f"{edu} ({percentage}%)")
    ax1.text(left + percentage/2, 0, f'{percentage}%', 
            ha='center', va='center', 
            color='darkslategray', fontweight='bold',
            fontsize=8)
    left += percentage

plt.title('Distribution of Education Level (%)', pad=15, fontsize=11, color='darkslategray')
plt.xlabel('Percentage', fontsize=10, color='darkslategray')
plt.yticks([])
plt.xlim(0, 100)

# Adjust legend
plt.legend(bbox_to_anchor=(0.5, -0.5), loc='upper center', ncol=3, 
          frameon=True, facecolor='white', edgecolor='none',
          fontsize=8)

plt.savefig('education_distribution.png', dpi=300, bbox_inches='tight', 
            facecolor='white', edgecolor='none')
plt.close()

# 2. Experience Distribution (Violin Plots)
plt.figure(figsize=(12, 6))

# Prepare data for plots - excluding workYear
experience_data = pd.melt(df[['groupDecisionExperience', 
                             'onlineCoworkExperience', 
                             'Aiexperience']], 
                         var_name='Prior Experience', 
                         value_name='7-point Likert Scale')

# Create violin plots
sns.violinplot(x='Prior Experience', y='7-point Likert Scale', 
               data=experience_data,
               palette=muted_colors[1:4],
               inner='box',
               cut=0)

# Rename x-axis labels
label_map = {
    'groupDecisionExperience': 'Group Decision-Making',
    'onlineCoworkExperience': 'Online Collaboration',
    'Aiexperience': 'AI (e.g. ChatGPT)'
}
current_xticks = plt.gca().get_xticklabels()
plt.gca().set_xticklabels([label_map[tick.get_text()] for tick in current_xticks])

plt.ylim(0.5, 7.5)
plt.yticks(range(1, 8))

plt.title('Distribution of Prior Experience', pad=20, fontsize=12, color='darkslategray')
plt.xlabel('Prior Experience', fontsize=10, color='darkslategray')
plt.ylabel('7-point Likert Scale', fontsize=10, color='darkslategray')

plt.grid(True, axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.savefig('experience_distribution.png', dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.close()

# 3. Likert Scale Distribution (Diverging Bars)
plt.figure(figsize=(14, 3))  # Increased width

# Define experience types in desired order
experience_types = ['groupDecisionExperience', 'onlineCoworkExperience', 'Aiexperience']
nice_names = ['Group Decision-Making', 'Online Collaboration', 'AI (e.g. ChatGPT)']

# Create custom color gradients
reds = sns.color_palette("Reds", n_colors=3)
blues = sns.color_palette("Blues", n_colors=3)
middle_color = 'lightgray'

discrete_colors = reds[::-1] + [middle_color] + blues

for idx, (exp_type, nice_name) in enumerate(zip(experience_types, nice_names)):
    # Calculate percentages for each Likert scale value
    value_counts = df[exp_type].value_counts().sort_index()
    percentages = (value_counts / len(df) * 100).round(1)
    
    # Find the center point (score 4 percentage)
    center_pct = percentages.get(4, 0)
    center_width = center_pct
    
    # Calculate starting position for center
    center_start = 50 - center_width/2
    
    # Add left side (scores 1-3)
    left_pos = center_start
    for score in [3, 2, 1]:
        if score in percentages:
            width = percentages[score]
            left_pos -= width
            plt.barh(idx, width, left=left_pos, 
                    color=reds[3-score])
            # Add smaller percentage label
            if width > 2:  # Only show if bar is wide enough
                plt.text(left_pos + width/2, idx, f'{width:.1f}%',
                        ha='center', va='center', fontsize=7,
                        color='black', fontweight='bold')
    
    # Add center (score 4)
    plt.barh(idx, center_width, left=center_start,
            color=middle_color)
    if center_width > 2:
        plt.text(center_start + center_width/2, idx, f'{center_width:.1f}%',
                ha='center', va='center', fontsize=7,
                color='black', fontweight='bold')
    
    # Add right side (scores 5-7)
    right_pos = center_start + center_width
    for score in [5, 6, 7]:
        if score in percentages:
            width = percentages[score]
            plt.barh(idx, width, left=right_pos,
                    color=blues[score-5])
            if width > 2:
                plt.text(right_pos + width/2, idx, f'{width:.1f}%',
                        ha='center', va='center', fontsize=7,
                        color='black', fontweight='bold')
            right_pos += width

plt.yticks(range(len(nice_names)), nice_names)
plt.title('Distribution of Responses', pad=15, fontsize=11, color='darkslategray')

# Remove x-axis labels but keep the grid
plt.xlabel('')
plt.tick_params(axis='x', which='both', bottom=False, labelbottom=False)
plt.grid(True, axis='x', linestyle='--', alpha=0.3)

# Add a vertical line at center
plt.axvline(x=50, color='gray', linestyle='--', alpha=0.5)

# Manually create a legend
legend_labels = ['1-Strongly Disagree', '2', '3', '4-Neutral', '5', '6', '7-Strongly Agree']
legend_colors = reds[::-1] + [middle_color] + blues
handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in legend_colors]
plt.legend(handles, legend_labels, title="Likert Scale Points",
           bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8, title_fontsize=9)

# Ensure the plot shows all bars with some padding
plt.tight_layout()
plt.savefig('likert_distribution.png', dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Prior Experience', y='7-point Likert Scale',
  plt.gca().set_xticklabels([label_map[tick.get_text()] for tick in current_xticks])


# Better Visualization

In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
file_path = '../data/selfReported/backgroundDemographic.csv'
df = pd.read_csv(file_path)

# Set the general style and theme
sns.set_theme(style="darkgrid", font="Arial", font_scale=1.2)
plt.rcParams['figure.dpi'] = 300

In [64]:
# 1. Enhanced Education Level Distribution
plt.figure(figsize=(14, 2))

education_counts = df['educationLevel'].value_counts()
education_percentages = (education_counts / len(df) * 100).round(1)

left = 0
ax1 = plt.gca()

# Improved color palette and bar design
muted_colors = sns.color_palette("coolwarm", len(education_percentages))

for i, (edu, percentage) in enumerate(education_percentages.items()):
    ax1.barh(0, percentage, left=left, color=muted_colors[i], label=f"{edu} ({percentage}%)")
    ax1.text(left + percentage / 2, 0, f'{percentage}%', ha='center', va='center',
             color='white', fontweight='bold', fontsize=10)
    left += percentage

plt.title('Distribution of Education Level (%)', pad=12, fontsize=14, color='darkslategray')
plt.xlabel('Percentage', fontsize=10, color='darkslategray')
plt.yticks([])
plt.xlim(0, 100)
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.5), ncol=3, fontsize=10, frameon=False)

plt.tight_layout()
plt.savefig('../dataVisualization/background/enhanced_education_distribution.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.close()

  plt.tight_layout()


In [101]:
# 2. Enhanced Experience Distribution (Violin Plots)
# Further enhancements for the violin plot visualization
plt.figure(figsize=(14, 7))

# Enhance the melted data for visualization
experience_data = pd.melt(
    df[['groupDecisionExperience', 'onlineCoworkExperience', 'Aiexperience']],
    var_name='Prior Experience',
    value_name='7-point Likert Scale'
)

# Updated color palette for a clean and impactful look
violin_palette = sns.color_palette("coolwarm", n_colors=3)

# Enhanced violin plot with modifications
sns.violinplot(
    x='Prior Experience', y='7-point Likert Scale', data=experience_data,
    palette=violin_palette, inner='box', cut=0, linewidth=1.2
)

# Set custom x-axis labels for better understanding
label_map = {
    'groupDecisionExperience': 'Group Decision-Making',
    'onlineCoworkExperience': 'Online Collaboration',
    'Aiexperience': 'AI (e.g., ChatGPT)'
}
plt.gca().set_xticklabels(
    [label_map[label.get_text()] for label in plt.gca().get_xticklabels()],
    rotation=0, ha='center', fontsize=11
)

# Add mean markers for additional context
for i, category in enumerate(['groupDecisionExperience', 'onlineCoworkExperience', 'Aiexperience']):
    means = df[category].mean()
    plt.scatter(
        i, means, color='black', label='Mean' if i == 0 else "",
        zorder=10, edgecolor='white', s=80
    )

# Enhanced labels, title, and grid adjustments
plt.ylim(0.5, 7.5)
plt.yticks(range(1, 8), fontsize=10)
plt.title('Distribution of Prior Experience', pad=15, fontsize=16, color='darkslategray', weight='bold')
plt.xlabel('Prior Experience', fontsize=13, color='darkslategray')
plt.ylabel('7-point Likert Scale', fontsize=13, color='darkslategray')

# Add vertical grid lines for reference
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()

# Save the enhanced plot
plt.savefig('../dataVisualization/background/further_enhanced_experience_distribution.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(
  plt.gca().set_xticklabels(


In [100]:
# # 3. Enhanced Likert Scale Distribution (Diverging Bars)

# Define experience types in desired order
experience_types = ['groupDecisionExperience', 'onlineCoworkExperience', 'Aiexperience']
nice_names = ['Group Decision-Making\n Experience', 'Online Collaboration\n Experience', 'AI (e.g. ChatGPT)\n Experience']

# Create custom color gradients
reds = sns.color_palette("Reds", n_colors=3)
blues = sns.color_palette("Blues", n_colors=3)
middle_color = 'lightgray'

# Begin plot
plt.figure(figsize=(16, 3))  # Adjusted for better readability

for idx, (exp_type, nice_name) in enumerate(zip(experience_types, nice_names)):
    # Calculate percentages for each Likert scale value
    value_counts = df[exp_type].value_counts().sort_index()
    percentages = (value_counts / len(df) * 100).round(1)

    # Calculate the width of the neutral (gray) bar
    center_pct = percentages.get(4, 0)
    center_width = center_pct / 2  # Half to the left and half to the right

    # Center the gray bar at 0
    left_start = -center_width
    plt.barh(idx, center_pct, left=left_start, color=middle_color)
    plt.text(0, idx, f'{center_pct}%', ha='center', va='center', fontsize=8, color='black')

    # Add left-side bars (1-3: Strongly Disagree) with improved spacing
    left_pos = left_start
    for score in [3, 2, 1]:
        if score in percentages:
            width = -percentages[score]  # Negative for left side
            plt.barh(idx, width, left=left_pos, color=reds[3 - score], edgecolor='white', linewidth=0.5)
            plt.text(left_pos + width / 2, idx, f'{percentages[score]}%', ha='center', va='center', fontsize=8, color='black')
            left_pos += width

    # Add right-side bars (5-7: Strongly Agree)
    right_pos = center_width
    for score in [5, 6, 7]:
        if score in percentages:
            width = percentages[score]  # Positive for right side
            plt.barh(idx, width, left=right_pos, color=blues[score - 5], edgecolor='white', linewidth=0.5)
            plt.text(right_pos + width / 2, idx, f'{percentages[score]}%', ha='center', va='center', fontsize=8, color='black')
            right_pos += width

# Styling
plt.yticks(range(len(nice_names)), nice_names, fontsize=12)
plt.axvline(x=0, color='gray', linestyle='--', alpha=0.5)
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.gca().set_xticks([])  # Remove numeric values from x-axis

plt.title('Likert Scale Responses with Balanced Alignment', fontsize=16, pad=20, weight='bold')
plt.xlabel('', fontsize=14)
plt.xticks(fontsize=12)
plt.tight_layout()

# Custom Legend
handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in reds[::-1] + [middle_color] + blues]
legend_labels = [
    '1-Strongly Disagree', '2', '3', 
    '4-Neutral', '5', '6', '7-Strongly Agree'
]
plt.legend(
    handles, legend_labels, title="Likert Scale Points",
    bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10, title_fontsize=12, frameon=False
)

# Save the updated plot
output_path = '../dataVisualization/background/likert_distribution.png'
plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
plt.close()
