In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the workbook that holds the provider-level counts.
file_path = 'your_file.xlsx'  # <-- Change this to your actual file path
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Fail fast with a readable message if a column the later cells index into is
# missing, instead of hitting a bare KeyError part-way through the analysis.
required_columns = ['category', 'providertaxid', 'selection_count', 'inscope_count']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f'{file_path} is missing required columns: {missing_columns}')

# Preview data
df.head()

In [None]:
# Compute the conversion rate for each row: selections / in-scope count.
# Zero denominators are masked out with .where(), which yields NaN in a float
# column. Replacing 0 with pd.NA instead would turn the denominator (and hence
# the resulting rate) into an object column, which is awkward to plot and
# aggregate downstream.
inscope_nonzero = df['inscope_count'].where(df['inscope_count'] != 0)
df['conversion_rate'] = df['selection_count'] / inscope_nonzero
df.head()

In [None]:
# Category-level analysis with a per-provider breakdown.
# For every category this builds one summary record (provider count, totals,
# overall conversion rate, top-5 share, HHI) and draws three figures.
# NOTE(review): shares, ranks and HHI are computed per row, so this presumably
# expects one row per provider within a category -- confirm against the data.
categories = df['category'].unique()

summary_list = []

for category in categories:
    df_cat = df[df['category'] == category].copy()
    total_sel = df_cat['selection_count'].sum()
    total_inscope = df_cat['inscope_count'].sum()

    # No selections means no shares to compute; leave the category out.
    if total_sel == 0:
        continue

    # Rank rows by selections, largest first. A stable sort keeps tied rows in
    # their input order so ranks and charts are reproducible between runs.
    df_cat = df_cat.sort_values('selection_count', ascending=False, kind='mergesort')
    df_cat['sel_share'] = df_cat['selection_count'] / total_sel * 100
    df_cat['cum_share'] = df_cat['selection_count'].cumsum() / total_sel * 100
    df_cat['provider_rank'] = range(1, len(df_cat) + 1)

    # Summary metrics
    top5_share = df_cat['sel_share'].head(5).sum()   # rows are already sorted descending
    hhi = (df_cat['sel_share'] / 100).pow(2).sum()   # sum of squared shares, shares as fractions
    conv_rate = total_sel / total_inscope if total_inscope > 0 else None

    summary_list.append({
        'Category': category,
        'Providers': df_cat['providertaxid'].nunique(),
        'Total Inscope': total_inscope,
        'Total Selections': total_sel,
        'Conversion Rate': conv_rate,
        'Top 5 Share %': top5_share,
        'HHI': hhi
    })

    # Selections bar chart, one bar per row in rank order.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(df_cat['providertaxid'].astype(str), df_cat['selection_count'])
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_ylabel('Selections')
    ax.set_title(f'Selections per Provider - {category}')
    plt.show()
    plt.close(fig)  # release the figure so they do not pile up across categories

    # Cumulative concentration curve: share of selections covered by the top-N rows.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(df_cat['provider_rank'], df_cat['cum_share'], marker='o')
    ax.axhline(80, color='r', linestyle='--', label='80% cutoff')
    ax.set_xlabel('Provider Rank (sorted)')
    ax.set_ylabel('Cumulative % of Selections')
    ax.set_title(f'Provider Concentration - {category}')
    ax.legend()
    plt.show()
    plt.close(fig)

    # Distribution of provider conversion rates. Missing rates are dropped and
    # the remainder coerced to float so the histogram always gets numeric input,
    # whatever dtype the conversion_rate column arrived with.
    rates = df_cat['conversion_rate'].dropna().astype(float)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(rates, bins=10, kde=True, ax=ax)
    ax.set_xlabel('Provider Conversion Rate')
    ax.set_title(f'Conversion Rate Distribution - {category}')
    plt.show()
    plt.close(fig)

    # Format the rate separately so the label is printed even when the rate is
    # unavailable (the conditional previously replaced the whole line with 'N/A').
    conv_text = f'{conv_rate:.2%}' if conv_rate is not None else 'N/A'

    print(f'Category: {category}')
    print(f'Top 5 providers account for {top5_share:.1f}% of selections.')
    print(f'Herfindahl Index (HHI): {hhi:.3f}')
    print(f'Overall Conversion Rate: {conv_text}')
    print('-'*60)

summary_df = pd.DataFrame(summary_list)
summary_df

In [None]:
# Write the per-category summary table to an Excel workbook (row index omitted)
# so it can be shared outside the notebook.
output_file = 'category_summary.xlsx'
with pd.ExcelWriter(output_file) as writer:
    summary_df.to_excel(writer, index=False)
print(f'Summary exported to {output_file}')