## Setup

# Assignment 5, Question 7: Group Operations & Final Analysis

**Points: 15**

Perform grouped analysis and create summary reports.

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, summarize_by_group

df = load_data('data/clinical_trial_raw.csv')
print(f"Loaded {len(df)} patients")

# Prewritten visualization function for grouped analysis
def plot_group_comparison(data, x_col, y_col, title):
    """
    Create a bar chart comparing groups.
    
    Args:
        data: DataFrame with grouped data
        x_col: Column name for x-axis (groups)
        y_col: Column name for y-axis (values)
        title: Chart title
    """
    plt.figure(figsize=(10, 6))
    data.plot(x=x_col, y=y_col, kind='bar')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

data loaded with success: 10000,rows and 18 columns
Loaded 10000 patients


## Part 1: Basic Groupby (5 points)

1. Group by 'site' and calculate mean age, BMI, and blood pressure
2. Group by 'intervention_group' and count patients
3. Use the `summarize_by_group()` utility to get overall statistics by site

In [42]:
# TODO: Group by site
site_summary = summarize_by_group(df, 'site')
print(site_summary)

summarized by group 'site
           site    age                                                    \
                 count       mean         std    min    25%   50%    75%   
0      SITE A     74.0  23.608108  246.535880 -999.0  71.00  81.0  94.50   
1      SITE B     94.0  59.670213  157.423433 -999.0  73.25  81.0  93.75   
2      SITE C     55.0  60.254545  146.070787 -999.0  69.50  78.0  90.50   
3      SITE D     41.0  77.853659   11.501219   55.0  69.00  78.0  87.00   
4      SITE E     31.0  12.258065  270.211641 -999.0  70.00  81.0  89.00   
5     Site  A     67.0  80.328358   14.431015   49.0  70.50  77.0  92.50   
6      Site A     64.0  49.843750  190.210011 -999.0  73.00  83.0  92.00   
7      Site B     88.0  56.318182  162.423495 -999.0  69.00  80.5  93.00   
8      Site C     83.0  27.831325  232.797453 -999.0  69.00  79.0  88.50   
9      Site D     32.0 -25.312500  318.343742 -999.0  66.00  73.5  81.50   
10     Site E     42.0  75.500000   13.800406   49.0  66.25  7

In [43]:
# TODO: Count by intervention group
intervention_summary = summarize_by_group(df, 'intervention_group')
print(intervention_summary)

summarized by group 'intervention_group
   intervention_group    age                                             \
                       count       mean         std    min    25%   50%   
0           CONTROL     84.0  80.773810   13.546307   52.0  70.00  81.0   
1            Contrl     73.0  51.095890  178.015384 -999.0  69.00  77.0   
2           Control     95.0  34.957895  218.281193 -999.0  72.00  79.0   
3       TREATMENT A     65.0  47.861538  188.526347 -999.0  68.00  81.0   
4       TREATMENT B     83.0  37.831325  202.523941 -999.0  64.00  75.0   
5        Treatmen A     60.0  64.383333  140.204825 -999.0  72.75  82.0   
6      Treatment  B     64.0  63.500000  135.633727 -999.0  69.75  79.0   
7       Treatment A     66.0  48.439394  187.015436 -999.0  70.25  80.0   
8       Treatment B    104.0  68.586538  106.710898 -999.0  67.00  78.0   
9        TreatmentA     66.0  65.227273  133.644856 -999.0  70.25  80.5   
10          control     83.0  67.939759  119.385760 -999.0  

**Note:** The `summarize_by_group()` function has an optional `agg_dict` parameter for custom aggregations. If you don't specify it, it will use `.describe()` on numeric columns. You can use `agg_dict={'age': ['mean', 'std'], 'bmi': 'mean'}` for custom aggregations.


In [44]:
# TODO: Use summarize_by_group utility

summary = summarize_by_group(
    df,
    'site',
    {'age': ['mean', 'std'], 'bmi': 'mean'}
)
print(summary)


summarized by group 'site
           site        age                    bmi
                      mean         std       mean
0      SITE A    23.608108  246.535880  25.872059
1      SITE B    59.670213  157.423433  26.822826
2      SITE C    60.254545  146.070787  25.688889
3      SITE D    77.853659   11.501219  25.892105
4      SITE E    12.258065  270.211641  25.496667
5     Site  A    80.328358   14.431015  25.452239
6      Site A    49.843750  190.210011  26.263934
7      Site B    56.318182  162.423495  26.521429
8      Site C    27.831325  232.797453  26.454054
9      Site D   -25.312500  318.343742  26.387097
10     Site E    75.500000   13.800406  27.102857
11     Site_D     9.870968  269.613024  25.533333
12     site a    67.270270  126.507924  25.609859
13     site b    57.022222  160.594050  25.615909
14     site c    80.929825   14.747711  26.356364
15     site d    81.000000   14.422205  27.180000
16     site e    83.194444   12.825910  26.333333
17       SITE A  60.9108

## Part 2: Multiple Aggregations (5 points)

Group by 'site' and apply multiple aggregations:
- age: mean, std, min, max
- bmi: mean, std
- systolic_bp: mean, median

Display the results in a well-formatted table.

In [24]:
# TODO: Multiple aggregations
agg_dict = {
    'age': ['mean','std', 'min','max'],
    'bmi': ['mean', 'std'],
    'systolic_bp': ['mean', 'median']
}
summary_site_agg = summarize_by_group(df, 'site', agg_dict)
summary_site_agg.head()

summarized by group 'site


Unnamed: 0_level_0,site,age,age,age,age,bmi,bmi,systolic_bp,systolic_bp
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,min,max,mean,std,mean,median
0,SITE A,23.608108,246.53588,-999,100,25.872059,4.805123,116.633803,115.0
1,SITE B,59.670213,157.423433,-999,100,26.822826,3.562445,118.055556,119.0
2,SITE C,60.254545,146.070787,-999,100,25.688889,6.682268,115.942308,115.0
3,SITE D,77.853659,11.501219,55,100,25.892105,8.003154,119.513514,120.0
4,SITE E,12.258065,270.211641,-999,100,25.496667,3.439275,113.066667,111.0


## Part 3: Comparative Analysis (5 points)

Compare intervention groups:
1. Calculate mean outcome_cvd rate by intervention_group
2. Calculate mean adherence_pct by intervention_group
3. Create a cross-tabulation of intervention_group vs dropout status
4. Visualize the comparison with a bar plot

In [45]:
# TODO: Intervention group comparisons
#1-outcome by intervention grp
mean_outcome_cvd= df.groupby(['intervention_group' , 'outcome_cvd']).size().groupby(level=0).apply(lambda x: x/x.sum())
#mean_outcome_cvd = df.groupby('intervention_group')['outcome_cvd'].mean().reset_index()
print(mean_outcome_cvd)
#2-meanadherence_pct
mean_adherence_pct = df.groupby(['intervention_group', 'adherence_pct']).size().groupby(level=0).apply(lambda x: x/x.sum())
#mean_adherence_pct = df.groupby('intervention_group')['adherence_pct'].mean().reset_index()
print(mean_adherence_pct)
# cross tabulation:
cross_tab =pd.crosstab(df['intervention_group'], df['dropout'])
print(cross_tab)


intervention_group  intervention_group  outcome_cvd
  CONTROL             CONTROL           No             0.380952
                                        Yes            0.226190
                                        no             0.261905
                                        yes            0.130952
  Contrl              Contrl            No             0.410959
                                                         ...   
treatment a         treatment a         yes            0.171769
treatment b         treatment b         No             0.469333
                                        Yes            0.148000
                                        no             0.198667
                                        yes            0.184000
Length: 104, dtype: float64
intervention_group  intervention_group  adherence_pct
  CONTROL             CONTROL           20.0             0.044118
                                        28.0             0.014706
                              

In [None]:
# TODO: Visualization



## Part 4: Final Report

Create and save:
1. Summary statistics by site → `output/q7_site_summary.csv`
2. Intervention group comparison → `output/q7_intervention_comparison.csv`
3. Text report with key findings → `output/q7_analysis_report.txt`

In [46]:
# TODO: Save summary outputs
summary_site_agg.to_csv('output/q7_site_summary.csv', index=False)
#intervention group comparision
mean_outcome_cvd.to_csv('output/q7_intervention_comparison.csv', index=False) 
#text report with key findings



## Summary
What are the 3 most important findings from your analysis?

**Key Findings:** 

# TODO:
"""
1- Outcome CVD-Rate:
- Interventiion grp:
- Control grp:
2- Mean Adherence per intervention group:
Intervention grp:
Control grp:
3- Data Distribution by site:

Age :
BMI:
Syst.BP:

"""