# Assignment 5, Question 7: Group Operations & Final Analysis

**Points: 15**

Perform grouped analysis and create summary reports.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, summarize_by_group

# Read the raw trial export through the shared loader from q3_data_utils.
# NOTE(review): the analysis cells further down read `df_typed`, but no cell
# visible in this notebook assigns that name (only `df` is created here).
# Confirm where the typed frame is meant to come from before relying on
# Restart & Run All.
df = load_data('data/clinical_trial_raw.csv')
patient_total = len(df)
print(f"Loaded {patient_total} patients")

# Prewritten visualization function for grouped analysis
def plot_group_comparison(data, x_col, y_col, title):
    """
    Create a bar chart comparing groups.

    Args:
        data: DataFrame with grouped data
        x_col: Column name for x-axis (groups)
        y_col: Column name for y-axis (values)
        title: Chart title

    The chart is drawn and shown; nothing is returned.
    """
    # Create the figure and axes explicitly and hand the axes to pandas.
    # Without `ax=`, DataFrame.plot opens a figure of its own, so a figure
    # made beforehand with plt.figure(figsize=...) stays empty and its size
    # is never applied to the chart.
    fig, ax = plt.subplots(figsize=(10, 6))
    data.plot(x=x_col, y=y_col, kind='bar', ax=ax)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

Run test on q3_data cleanup...
Test DataFrame created: (5, 3)
Test detect_missing: 2
Test passed!
      site  count
0   site b    742
1   Site B    736
2   SITE B    703
3   SITE A    684
4  Site  A    681
(34, 2)
Loaded 34 patients


## Part 1: Basic Groupby (5 points)

1. Group by 'site' and calculate mean age, BMI, and blood pressure
2. Group by 'intervention_group' and count patients
3. Use the `summarize_by_group()` utility to get overall statistics by site

In [None]:
# Part 1.1 - per-site mean of age, BMI and the two blood-pressure columns
mean_columns = ['age', 'bmi', 'systolic_bp', 'diastolic_bp']
site_summary = df_typed.groupby('site')[mean_columns].mean()
print("1. Mean values by site:", site_summary, sep="\n")

In [None]:
# Part 1.2 - how many rows fall into each intervention group
group_labels = df_typed['intervention_group']
intervention_counts = group_labels.value_counts()
print("2. Counts by intervention group:", intervention_counts, sep="\n")

**Note:** The `summarize_by_group()` function has an optional `agg_dict` parameter for custom aggregations. If you don't specify it, it will use `.describe()` on numeric columns. You can use `agg_dict={'age': ['mean', 'std'], 'bmi': 'mean'}` for custom aggregations.


In [None]:
# Part 1.3 - call the shared helper without an agg_dict, i.e. its default summary
site_stats = summarize_by_group(df_typed, 'site')
print("3. Overall statistics by site:", site_stats, sep="\n")

## Part 2: Multiple Aggregations (5 points)

Group by 'site' and apply multiple aggregations:
- age: mean, std, min, max
- bmi: mean, std
- systolic_bp: mean, median

Display the results in a well-formatted table.

In [None]:
# Part 2 - several aggregations per column, grouped by site
site_aggregations = {
    'age': ['mean', 'std', 'min', 'max'],
    'bmi': ['mean', 'std'],
    'systolic_bp': ['mean', 'median'],
}
site_stats = summarize_by_group(df_typed, 'site', agg_dict=site_aggregations)

# Frame the rounded table between 80-character rules
rule = "=" * 80
print(rule)
print("Statistics by site")
print(rule)
print(site_stats.round(2))
print(rule)

## Part 3: Comparative Analysis (5 points)

Compare intervention groups:
1. Calculate mean outcome_cvd rate by intervention_group
2. Calculate mean adherence_pct by intervention_group
3. Create a cross-tabulation of intervention_group vs dropout status
4. Visualize the comparison with a bar plot

In [None]:
# Part 3 - compare intervention groups on CVD outcome, adherence and dropout

print("1. Mean CVD outcome rate by intervention group:")
cvd_by_group = df_typed.groupby('intervention_group')['outcome_cvd'].mean()
print(cvd_by_group.round(4))
print()

print("2. Mean adherence percentage by intervention group:")
adherence_by_group = df_typed.groupby('intervention_group')['adherence_pct'].mean()
print(adherence_by_group.round(2))
print()

print("3. Cross-tabulation of intervention group vs. droupout status:")
# margins=True adds the "All" totals row and column to the count table
crosstab = pd.crosstab(df_typed['intervention_group'], df_typed['dropout'], margins=True)
print(crosstab)
print()

print("Dropout rates by group:")
# Two assignments had been pasted onto a single line here, which is a
# SyntaxError. Only the row-normalised table is kept: normalize='index' makes
# each group's row sum to 1, and * 100 turns the shares into percentages.
dropout_rates = pd.crosstab(
    df_typed['intervention_group'],
    df_typed['dropout'],
    normalize='index',
) * 100
print(dropout_rates.round(2))
print()

In [None]:
# Part 3.4 - one figure with three side-by-side panels, one metric per panel.
# plt.subplots(1, 3) returns a 1-D array of axes, so panels are axes[0], axes[1]
# and axes[2]; two-index access such as axes[1, 0] raises IndexError.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: mean CVD outcome rate
cvd_by_group.plot(kind='bar', ax=axes[0], color='blue')
axes[0].set_title('Mean CVD outcome rate by intervention group')
axes[0].set_xlabel('Intervention group')
axes[0].set_ylabel('CVD rate')
axes[0].tick_params(axis='x', rotation=45)

# Panel 2: mean adherence. It gets its own axes so it no longer overdraws the
# CVD panel or replaces that panel's title and labels.
adherence_by_group.plot(kind='bar', ax=axes[1], color='green')
axes[1].set_title('Mean percent adherence by intervention group')
axes[1].set_xlabel('Intervention group')
axes[1].set_ylabel('Adherence percent')
axes[1].tick_params(axis='x', rotation=45)

# Panel 3: dropout rate, computed as the share of rows with dropout == 1, in percent
dropout_rates_plot = df_typed.groupby('intervention_group')['dropout'].apply(lambda x: (x == 1).mean() * 100)
dropout_rates_plot.plot(kind='bar', ax=axes[2], color='coral', alpha=0.7)
axes[2].set_title('Dropout Rate by Intervention Group', fontsize=12, fontweight='bold')
axes[2].set_xlabel('Intervention Group')
axes[2].set_ylabel('Dropout Rate (%)')
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('output/intervention_comparison.png', dpi=300, bbox_inches='tight')
print("Visualization saved to output/intervention_comparison.png")
plt.show()

In [None]:
# Condensed variant of the Part 3 comparison.
# NOTE(review): this cell rebinds cvd_by_group, adherence_by_group and crosstab,
# and saves to the same PNG path as the three-panel figure cell, replacing
# that file with a single-panel chart.
by_intervention = df_typed.groupby('intervention_group')

# 1. Mean CVD outcome rate
cvd_by_group = by_intervention['outcome_cvd'].mean()
print("Mean CVD Rate:", cvd_by_group.round(4))

# 2. Mean adherence
adherence_by_group = by_intervention['adherence_pct'].mean()
print("Mean Adherence %:", adherence_by_group.round(2))

# 3. Crosstab of raw counts (no totals row/column)
crosstab = pd.crosstab(
    index=df_typed['intervention_group'],
    columns=df_typed['dropout'],
)
print("Crosstab:\n", crosstab)

# 4. Bar plot of the CVD rate; the axes returned by .plot() carries the label
cvd_axes = cvd_by_group.plot(kind='bar', title='CVD Rate by Intervention Group')
cvd_axes.set_ylabel('CVD Rate')
plt.savefig('output/intervention_comparison.png')
plt.show()

## Part 4: Final Report

Create and save:
1. Summary statistics by site → `output/q7_site_summary.csv`
2. Intervention group comparison → `output/q7_intervention_comparison.csv`
3. Text report with key findings → `output/q7_analysis_report.txt`

In [None]:
# Part 4 - write the two CSV summaries and the plain-text analysis report
print("1. Site summary:")
site_stats.to_csv('output/q7_site_summary.csv')
print("Site summary saved to output/q7_site_summary.csv")

print("2. Intervention group CVD comparison")
cvd_by_group.to_csv('output/q7_intervention_comparison.csv')
print("Intervention group CVD comparison saved to output/q7_intervention_comparison.csv")

print("3. Creating analysis report...")

# Everything the report needs is computed before the file is opened, so a
# failure in any calculation cannot leave a half-written report on disk.

# Per-group table behind the "Key Findings" section. It is built in this cell
# because the only other assignment of `intervention_comparison` sits in a
# later cell and uses different column names than the lookups below.
by_group = df_typed.groupby('intervention_group')
intervention_comparison = pd.DataFrame({
    'sample_size': by_group.size(),
    'mean_cvd_rate': by_group['outcome_cvd'].mean(),
    'mean_adherence_pct': by_group['adherence_pct'].mean(),
    'dropout_rate': by_group['dropout'].mean() * 100,
    'mean_age': by_group['age'].mean(),
    'mean_bmi': by_group['bmi'].mean(),
})

# Missing sites are dropped and values cast to str so ', '.join cannot fail
# on NaN or non-string labels (nunique() below also ignores NaN).
site_names = df_typed['site'].dropna().astype(str).unique()
site_counts = df_typed['site'].value_counts()

# Per-column count of missing values. pandas is used directly because
# `detect_missing` is not among the names imported in this notebook.
# NOTE(review): if detect_missing also treats sentinel values as missing,
# import it from q3_data_utils and use it here instead.
missing = df_typed.isna().sum()
missing_cols = missing[missing > 0]

with open('output/q7_analysis_report.txt', 'w') as f:
    f.write("="*80 + "\n")
    f.write("CLINICAL TRIAL DATA ANALYSIS REPORT\n")
    f.write("="*80 + "\n\n")

    # Dataset Overview
    f.write("DATASET OVERVIEW\n")
    f.write("-"*80 + "\n")
    f.write(f"Total patients: {len(df_typed):,}\n")
    f.write(f"Total variables: {df_typed.shape[1]}\n")
    f.write(f"Analysis date: {pd.Timestamp.now().strftime('%Y-%m-%d')}\n\n")

    # Site Summary
    f.write("SUMMARY BY SITE\n")
    f.write("-"*80 + "\n")
    f.write(f"Number of sites: {df_typed['site'].nunique()}\n")
    f.write(f"Sites: {', '.join(site_names)}\n\n")
    f.write("Patient distribution by site:\n")
    for site, count in site_counts.items():
        f.write(f"  - {site}: {count:,} patients ({count/len(df_typed)*100:.1f}%)\n")
    f.write("\n")

    # Intervention Group Summary
    f.write("INTERVENTION GROUP COMPARISON\n")
    f.write("-"*80 + "\n")
    f.write(f"Number of intervention groups: {df_typed['intervention_group'].nunique()}\n\n")

    f.write("Key Findings:\n")
    for group in intervention_comparison.index:
        f.write(f"\n{group}:\n")
        f.write(f"  - Sample size: {intervention_comparison.loc[group, 'sample_size']:.0f}\n")
        f.write(f"  - CVD outcome rate: {intervention_comparison.loc[group, 'mean_cvd_rate']:.4f}\n")
        f.write(f"  - Mean adherence: {intervention_comparison.loc[group, 'mean_adherence_pct']:.2f}%\n")
        f.write(f"  - Dropout rate: {intervention_comparison.loc[group, 'dropout_rate']:.2f}%\n")
        f.write(f"  - Mean age: {intervention_comparison.loc[group, 'mean_age']:.1f} years\n")
        f.write(f"  - Mean BMI: {intervention_comparison.loc[group, 'mean_bmi']:.1f}\n")

    f.write("\n")

    # Key Statistics
    f.write("KEY STATISTICS\n")
    f.write("-"*80 + "\n")
    f.write(f"Overall CVD outcome rate: {df_typed['outcome_cvd'].mean():.4f}\n")
    f.write(f"Overall adherence rate: {df_typed['adherence_pct'].mean():.2f}%\n")
    f.write(f"Overall dropout rate: {df_typed['dropout'].mean()*100:.2f}%\n")
    f.write(f"Mean age: {df_typed['age'].mean():.1f} years (SD: {df_typed['age'].std():.1f})\n")
    f.write(f"Mean BMI: {df_typed['bmi'].mean():.1f} (SD: {df_typed['bmi'].std():.1f})\n")
    f.write(f"Mean systolic BP: {df_typed['systolic_bp'].mean():.1f} mmHg\n\n")

    # Missing Data Summary
    f.write("MISSING DATA SUMMARY\n")
    f.write("-"*80 + "\n")
    if len(missing_cols) > 0:
        f.write(f"Columns with missing data: {len(missing_cols)}\n\n")
        for col, count in missing_cols.items():
            f.write(f"  - {col}: {count} ({count/len(df_typed)*100:.2f}%)\n")
    else:
        f.write("No missing data detected.\n")

    f.write("\n")
    f.write("="*80 + "\n")
    f.write("END OF REPORT\n")
    f.write("="*80 + "\n")

print("   ✓ Analysis report saved to output/q7_analysis_report.txt")
print()

In [None]:
# Compact variant of Part 4: recomputes each summary, then saves all three outputs.
# NOTE(review): this writes to the same three output paths as the longer Part 4
# cell, so whichever of the two cells runs last decides what the files contain.

# 1. Site summary
site_agg_spec = {'age': ['mean', 'std'], 'bmi': 'mean', 'outcome_cvd': 'mean'}
site_summary = summarize_by_group(df_typed, 'site', site_agg_spec)
site_summary.round(2).to_csv('output/q7_site_summary.csv')

# 2. Intervention comparison - one column per metric, indexed by group
per_group = df_typed.groupby('intervention_group')
intervention_comparison = pd.DataFrame({
    'cvd_rate': per_group['outcome_cvd'].mean(),
    'adherence': per_group['adherence_pct'].mean(),
    'dropout_rate': per_group['dropout'].mean() * 100,
})
intervention_comparison.round(2).to_csv('output/q7_intervention_comparison.csv')

# 3. Text report - header followed by four overall figures
with open('output/q7_analysis_report.txt', 'w') as f:
    f.writelines([
        "CLINICAL TRIAL ANALYSIS REPORT\n",
        "="*60 + "\n\n",
        f"Total patients: {len(df_typed)}\n",
        f"CVD rate: {df_typed['outcome_cvd'].mean():.4f}\n",
        f"Adherence: {df_typed['adherence_pct'].mean():.2f}%\n",
        f"Dropout rate: {df_typed['dropout'].mean()*100:.2f}%\n",
    ])

print("✓ All outputs saved!")

## Summary

What are the 3 most important findings from your analysis?

**Key Findings:**

1. TODO
2. TODO
3. TODO
