In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ⛳ Keep only the rows whose audit bucket is '100+'.
# `df` is not created in this cell; it must already exist in the kernel.
df_100plus = df[df['audit_bucket'] == '100+']

# 🧩 Crosstab: one row per provider tax ID, one column per selection category;
# each cell holds the number of matching rows in df_100plus.
provider_category_ct = pd.crosstab(df_100plus['providertaxid'], df_100plus['sel_category'])

# 📌 Optional: preview the crosstab table
print("🔍 Crosstab of Providers vs Selection Categories (100+ bucket):")
display(provider_category_ct.head())

# 📊 EDA: countplot of selection categories, most frequent category first.
category_order = df_100plus['sel_category'].value_counts().index.tolist()

fig, ax = plt.subplots(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in recent seaborn releases, so
# the x variable is also assigned to `hue`. `hue_order` mirrors `order` so each
# bar keeps the colour it had before, and `dodge=False` keeps bars full width.
sns.countplot(
    data=df_100plus,
    x='sel_category',
    hue='sel_category',
    order=category_order,
    hue_order=category_order,
    dodge=False,
    palette='Set2',
    ax=ax,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
# Done manually so it does not rely on a `legend=` keyword being available.
countplot_legend = ax.get_legend()
if countplot_legend is not None:
    countplot_legend.remove()
ax.set_title('Distribution of Selection Categories (100+ Audit Bucket)')
ax.set_xlabel('Selection Category')
# NOTE(review): countplot counts rows, so this label is only accurate if each
# provider contributes a single row per category - confirm against the data.
ax.set_ylabel('Number of Providers')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# 🔥 Optional: heatmap of the crosstab (row counts per provider and category)
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(provider_category_ct, cmap='YlGnBu', linewidths=0.5, ax=ax)
ax.set_title('Heatmap of Provider Participation Across Selection Categories (100+ bucket)')
ax.set_xlabel('Selection Category')
ax.set_ylabel('Provider Tax ID')
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ⛳ Keep only the rows whose audit bucket is '100+'.
# `df` is not created in this cell; it must already exist in the kernel.
df_100plus = df[df['audit_bucket'] == '100+']

# 🧭 Preview table (optional)
print("📊 Preview of key columns:")
display(df_100plus[['providertaxid', 'sel_category', 'receiving_rate_overall']].head())

# 📈 Boxplot: distribution of receiving rate within each selection category
fig, ax = plt.subplots(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in recent seaborn releases, so
# the x variable is also assigned to `hue`; `dodge=False` keeps every box
# centred on its category tick at full width.
sns.boxplot(
    data=df_100plus,
    x='sel_category',
    y='receiving_rate_overall',
    hue='sel_category',
    dodge=False,
    palette='Set3',
    ax=ax,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
# Done manually so it does not rely on a `legend=` keyword being available.
boxplot_legend = ax.get_legend()
if boxplot_legend is not None:
    boxplot_legend.remove()
ax.set_title('Receiving Rate by Selection Category (100+ Audit Bucket)')
ax.set_xlabel('Selection Category')
ax.set_ylabel('Receiving Rate')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# 🔎 (Optional) Stripplot: individual rows, coloured and dodged by provider.
# The legend is suppressed because there is one hue level per provider tax ID.
fig, ax = plt.subplots(figsize=(12, 6))
sns.stripplot(
    data=df_100plus,
    x='sel_category',
    y='receiving_rate_overall',
    hue='providertaxid',
    dodge=True,
    jitter=0.3,
    palette='tab10',
    legend=False,
    ax=ax,
)
ax.set_title('Receiving Rate Distribution by Provider & Category (100+ Bucket)')
ax.set_xlabel('Selection Category')
ax.set_ylabel('Receiving Rate')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ⛳ Step 1: Keep only the rows whose audit bucket is '100+'.
# `df` is not created in this cell; it must already exist in the kernel.
df_100plus = df[df['audit_bucket'] == '100+']

# ⚙️ Step 2: Aggregate per (provider, category): mean receiving rate and row count.
# 'size' counts every row in the group, so no dedicated claim-ID column is
# required. It is applied to 'receiving_rate_overall' because this cell already
# depends on that column and it is not one of the grouping keys.
# NOTE(review): assumes each row of df is one claim - confirm; if claims can
# span several rows, aggregate the real claim identifier with 'nunique' instead.
agg_data = (
    df_100plus
    .groupby(['providertaxid', 'sel_category'])
    .agg(
        receiving_rate_mean=('receiving_rate_overall', 'mean'),
        total_claims=('receiving_rate_overall', 'size'),
    )
    .reset_index()
)

# 📊 Step 3: Pivot to provider x category tables (NaN where a provider has no
# rows in a category).
pivot_rate = agg_data.pivot(index='providertaxid', columns='sel_category', values='receiving_rate_mean')
pivot_claims = agg_data.pivot(index='providertaxid', columns='sel_category', values='total_claims')

# 📈 Step 4: Variation metric = std dev of each provider's per-category mean rates.
# The right-hand side is evaluated before the column is added, so the metric
# covers the category columns only.
pivot_rate['receiving_rate_stddev'] = pivot_rate.std(axis=1)

# 🔝 Step 5: Ten providers with the largest variation
high_var = pivot_rate.sort_values('receiving_rate_stddev', ascending=False).head(10)

print("📉 Top 10 Providers with Highest Receiving Rate Variation Across Categories:")
display(high_var)

# 🎨 Step 6: Heatmap of mean receiving rate (variation column left out so the
# colour scale reflects rates only)
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(
    pivot_rate.drop(columns='receiving_rate_stddev'),
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='grey',
    ax=ax,
)
ax.set_title('Receiving Rate by Provider and Category (100+ Bucket)')
ax.set_xlabel('Selection Category')
ax.set_ylabel('Provider Tax ID')
fig.tight_layout()
plt.show()

# 📦 Step 7: Heatmap of row counts per provider and category (missing pairs shown as 0)
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(
    pivot_claims.fillna(0),
    cmap='YlGnBu',
    linewidths=0.5,
    linecolor='grey',
    annot=True,
    fmt='.0f',
    ax=ax,
)
ax.set_title('Total Claims by Provider and Category')
ax.set_xlabel('Selection Category')
ax.set_ylabel('Provider Tax ID')
fig.tight_layout()
plt.show()

# 🧮 Step 8: Histogram of the per-provider variation metric
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(pivot_rate['receiving_rate_stddev'], bins=30, kde=True, color='coral', ax=ax)
ax.set_title("Distribution of Receiving Rate Variation Across Categories")
ax.set_xlabel("Std Dev of Receiving Rate (per Provider)")
ax.set_ylabel("Number of Providers")
fig.tight_layout()
plt.show()


In [None]:
# (column, aggregation) pair bound to a module-level name.
# NOTE(review): this looks like a fragment meant to be used as the
# `total_claims=` keyword inside a groupby(...).agg(...) call; on its own it
# only assigns a tuple - confirm whether this cell is still needed.
total_claims = (
    'sel_category',
    'size',
)
