
# 🧠 Provider Behavior EDA Plan

This notebook explores provider behavior using metrics such as hit rate, recovery rate, dispute patterns, and medical record receipt compliance.

---

## 🔍 High-Level Goals of EDA

1. **Segment** providers by behavior (e.g., high/low performing, compliant/risky).
2. **Understand** how provider behavior varies by volume (bucket).
3. **Identify** correlations between audit outcomes and disputes, recovery, and medical record receipt (MRR).
4. **Find** outliers and high-impact providers.
5. **Prepare** to inform strategy (e.g., prioritize audits, flag risky providers).

---

## ✅ Columns in the Dataset

- `providertaxid`
- `findings`, `no_findings`, `total_audits`
- `dispute_ratio`, `overturned_ratio`
- `mr_receiving_rate`
- `cancelled_count`
- `hit_rate`, `recovery_rate`
- `total_overpay`, `total_recovery`
- `volume_bucket` (`0-5`, `5-15`, `15-50`, `50-100`, `100+`)

---

## 📊 EDA Steps and Visualizations

### 1. Summary Stats per Bucket
- Mean, median, std by `volume_bucket` for key metrics
- Bar, box, violin plots

### 2. Distribution & Correlation
- Histograms, correlation heatmap
- Pairwise scatterplots

### 3. Volume vs Performance
- Focus on providers in the `15-50`, `50-100`, and `100+` buckets (roughly 15 or more audits)
- Scatterplots of audits vs metrics

### 4. Risk Profiling & Outliers
- Z-score or IQR method to flag outliers
- Scatter plots for risky behavior

### 5. (Optional) Regression & Clustering
- Linear regression
- KMeans clustering

### 6. Treating Low Volume Providers
- Low-confidence group
- Use summary stats, avoid drawing hard conclusions

---

## 📤 Final Deliverables

- Summary dashboard and plots
- High-risk provider list
- Insight bullets


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats

# Load dataset
# NOTE: the path below is a placeholder; the cells that follow read from `df`.
df = pd.read_csv("your_provider_data.csv")  # Replace with actual file path


In [None]:
# Summary statistics by volume_bucket
# Rate metrics get centre/spread statistics, dollar totals are summed, and
# providertaxid is counted to give the number of rows in each bucket.
grouped = df.groupby('volume_bucket')
bucket_summary = grouped.agg({
    'hit_rate': ['mean', 'median', 'std'],
    'recovery_rate': ['mean', 'median', 'std'],
    'dispute_ratio': ['mean', 'median'],
    'mr_receiving_rate': ['mean', 'median'],
    'total_overpay': 'sum',
    'total_recovery': 'sum',
    'providertaxid': 'count'
})
bucket_summary = bucket_summary.reset_index()

# Collapse the two-level (metric, statistic) labels into "metric_statistic";
# strip('_') removes the dangling underscore left on the grouping column.
bucket_summary.columns = [
    '_'.join(pair).strip('_') for pair in bucket_summary.columns.values
]
bucket_summary

In [None]:
# Boxplot of hit_rate by bucket, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='volume_bucket', y='hit_rate', ax=ax)
ax.set_title("Hit Rate by Volume Bucket")
plt.show()


In [None]:
# Correlation heatmap over the rate metrics and the two dollar totals
metrics = [
    'hit_rate',
    'recovery_rate',
    'dispute_ratio',
    'overturned_ratio',
    'mr_receiving_rate',
    'total_overpay',
    'total_recovery',
]
correlation_matrix = df[metrics].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# Flag outliers using IQR
def flag_outliers_iqr(series):
    """Return a boolean mask of values lying outside the 1.5 * IQR fences.

    Args:
        series: numeric pandas Series to screen.

    Returns:
        Boolean Series aligned with ``series``; True where the value is below
        Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing values compare False
        on both sides and are therefore never flagged.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    below = series < lower_fence
    above = series > upper_fence
    return below | above

# Attach one boolean IQR flag per metric (flag column -> source metric).
for flag_name, metric_name in (
    ('outlier_hit_rate', 'hit_rate'),
    ('outlier_recovery_rate', 'recovery_rate'),
    ('outlier_dispute', 'dispute_ratio'),
):
    df[flag_name] = flag_outliers_iqr(df[metric_name])

# View flagged outliers: rows where at least one of the three flags is set
any_flag = df[['outlier_hit_rate', 'outlier_recovery_rate', 'outlier_dispute']].any(axis=1)
df[any_flag].head()


In [None]:
# Scatter plot: hit_rate vs recovery_rate, one colour per volume bucket
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='hit_rate', y='recovery_rate', hue='volume_bucket', data=df, ax=ax)
ax.set_title("Recovery Rate vs Hit Rate by Volume Bucket")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
# Installs a process-wide "ignore" filter with no category restriction, so
# every warning (including library deprecation notices) is hidden from here on.
warnings.filterwarnings('ignore')

# Load dataset
# df = pd.read_csv('your_data.csv')

# Derived columns
# total_audits is rebuilt from its two components; hit_rate is the share of
# those audits that produced findings.
df['total_audits'] = df['findings'].add(df['no_findings'])
df['hit_rate'] = df['findings'].div(df['total_audits'])

# Ordered categorical so buckets sort by volume rather than alphabetically.
# Labels that are not in this list become NaN.
bucket_labels = ["0-5", "5-15", "15-50", "50-100", "100+"]
df['volume_bucket'] = pd.Categorical(
    df['volume_bucket'], categories=bucket_labels, ordered=True
)

# Summary by volume bucket
# Every rate metric gets the same three statistics; totals and counts are
# appended afterwards so the resulting column order matches the listing below.
spread_stats = ['mean', 'median', 'std']
rate_columns = [
    'hit_rate',
    'dispute_ratio',
    'overturned_ratio',
    'recovery_rate',
    'mr_receiving_rate',
]
bucket_agg = {column: spread_stats for column in rate_columns}
bucket_agg.update({
    'total_overpay': 'sum',
    'total_recovery': 'sum',
    'providertaxid': 'count',
    'cancelled_count': 'sum',
})
bucket_summary = df.groupby('volume_bucket').agg(bucket_agg).reset_index()

# Distribution plots: one histogram with a KDE overlay per rate metric
distribution_columns = ('hit_rate', 'dispute_ratio', 'recovery_rate', 'mr_receiving_rate')
for col in distribution_columns:
    hist_ax = sns.histplot(df[col], kde=True)
    hist_ax.set_title(f'Distribution of {col}')
    plt.show()

# Correlation heatmap
heatmap_columns = [
    'hit_rate', 'recovery_rate', 'dispute_ratio', 'overturned_ratio',
    'mr_receiving_rate', 'total_overpay', 'total_recovery',
]
corr = df[heatmap_columns].corr()
heat_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heat_ax.set_title("Correlation Heatmap")
plt.show()

# Volume vs performance: keep only the three highest-volume buckets
is_high_volume = df['volume_bucket'].isin(["15-50", "50-100", "100+"])
high_vol_df = df[is_high_volume]

audits_ax = sns.scatterplot(x='total_audits', y='hit_rate', data=high_vol_df)
audits_ax.set_title("Total Audits vs Hit Rate")
plt.show()

overpay_ax = sns.scatterplot(x='total_overpay', y='recovery_rate', data=high_vol_df)
overpay_ax.set_title("Total Overpay vs Recovery Rate")
plt.show()

# Outlier detection
def find_outliers_iqr(series):
    """Mark values that fall outside the 1.5 * IQR whiskers of ``series``.

    Args:
        series: numeric pandas Series.

    Returns:
        Boolean Series, True where a value is below Q1 - 1.5 * IQR or above
        Q3 + 1.5 * IQR. Missing values are never marked.
    """
    q1, q3 = (series.quantile(q) for q in (0.25, 0.75))
    whisker = 1.5 * (q3 - q1)
    return (series < q1 - whisker) | (series > q3 + whisker)

# IQR-based outlier flags, one per metric (flag column -> source metric)
for flag_col, source_col in (
    ('outlier_hit_rate', 'hit_rate'),
    ('outlier_recovery_rate', 'recovery_rate'),
    ('outlier_dispute_ratio', 'dispute_ratio'),
):
    df[flag_col] = find_outliers_iqr(df[source_col])

# Rule-based risk flags
high_hit = df['hit_rate'] > 0.8
low_recovery = df['recovery_rate'] < 0.3
df['risk_high_hit_low_recovery'] = high_hit & low_recovery

heavy_disputer = df['dispute_ratio'] > 0.5
often_overturned = df['overturned_ratio'] > 0.5
df['risk_high_dispute_high_overturn'] = heavy_disputer & often_overturned

df['risk_low_mrr'] = df['mr_receiving_rate'] < 0.5

# Labels are written in sequence, so when a provider trips several flags the
# LAST matching entry below is the one that remains in risk_label.
df['risk_label'] = 'Normal'
for flag_col, label in (
    ('risk_high_hit_low_recovery', 'HighHit_LowRecovery'),
    ('risk_high_dispute_high_overturn', 'HighDispute_HighOverturn'),
    ('risk_low_mrr', 'LowMRR'),
):
    df.loc[df[flag_col], 'risk_label'] = label

# Top outliers table: providers with any IQR flag, largest overpayment first
iqr_flag_cols = ['outlier_hit_rate', 'outlier_recovery_rate', 'outlier_dispute_ratio']
outlier_providers = df[df[iqr_flag_cols].any(axis=1)]
top_outliers = outlier_providers.sort_values('total_overpay', ascending=False).head(10)
report_cols = ['providertaxid', 'hit_rate', 'recovery_rate', 'dispute_ratio', 'risk_label']
print(top_outliers[report_cols])

# Interactive scatter: hit_rate vs recovery_rate, coloured by bucket, with the
# provider id shown on hover
fig = px.scatter(
    df,
    x='hit_rate',
    y='recovery_rate',
    color='volume_bucket',
    hover_data=['providertaxid'],
    title="Hit Rate vs Recovery Rate",
)
fig.show()

# Regression plot: scatter plus a fitted line drawn in red
red_line = {"color": "red"}
sns.lmplot(x='hit_rate', y='recovery_rate', data=df, line_kws=red_line)
plt.title("Regression: Hit Rate vs Recovery Rate")
plt.show()

# Clustering (optional)
# Rows with a missing value in any clustering feature are excluded from the fit.
cluster_features = ['hit_rate', 'recovery_rate', 'mr_receiving_rate', 'dispute_ratio']
features = df[cluster_features].dropna()
scaled = StandardScaler().fit_transform(features)
kmeans = KMeans(n_clusters=3, random_state=42)

# fit_predict returns one label per row of `features`, which can be shorter
# than `df`. Wrapping the labels in a Series keyed by features.index lets the
# assignment align by index: excluded rows get NaN instead of the assignment
# failing with a length mismatch.
df['cluster'] = pd.Series(kmeans.fit_predict(scaled), index=features.index)

# Plot only the rows that were clustered, with integer labels so the legend
# shows 0/1/2 even when NaN rows forced the column to float.
clustered = df.loc[features.index].astype({'cluster': int})
sns.scatterplot(data=clustered, x='hit_rate', y='recovery_rate', hue='cluster', palette='Set2')
plt.title("KMeans Clustering of Providers")
plt.show()


📊 EXPLANATION OF CHARTS USED IN YOUR PROVIDER EDA
Chart Type	Used For	Best For Interpreting	Your Use Case
🔲 Boxplot	Distribution, outliers, comparison across categories	Spread, median, and presence of extreme values across volume buckets	Compare metrics like hit_rate, recovery_rate, dispute_ratio across volume_bucket — check if low-volume providers behave differently
📈 Violin Plot (optional alt to boxplot)	Shape + distribution	Skew, multi-modal data	Shows more distributional nuance than boxplots
📊 Bar Plot	Count or average comparison	Total providers, avg rates by group	Count of providers per volume_bucket, or average dispute_ratio per bucket
📉 Histogram	Frequency distribution of continuous variables	Understand data distribution shape	Check if hit_rate, recovery_rate, dispute_ratio are normally distributed or skewed — helps in detecting unusual patterns
🔥 Correlation Heatmap	Relationships between numeric variables	Strength & direction of linear correlation	Understand if hit_rate relates to recovery_rate, dispute_ratio, or MRR — helps in prioritizing metrics
🧮 Scatter Plot (Matplotlib/Seaborn)	Bi-variate relationships	Patterns or clusters between two metrics	Key for analyzing hit_rate vs recovery_rate, or total_audits vs hit_rate
🔍 Plotly Scatter (Interactive)	Zoomable, filterable visualization	Dynamic deep-dive	Lets you see relationships + outliers interactively, colored by volume_bucket
📐 Regression Plot (sns.lmplot)	Fit a trendline	Strength + direction of impact	E.g., does hit_rate drive recovery_rate up? — informs recovery strategy
📊 Table of Top Outliers	Summary of flagged providers	Direct reporting or escalation	Helps identify providers with strange or risky behavior
🔵 Clustered Scatter Plot	Cluster analysis	Behavioral segmentation	Visualizes groups like compliant vs risky providers based on clustering (KMeans)

✅ WHICH CHARTS ARE MOST USEFUL FOR YOU NOW?
1. 📊 Bar Plot: Count of Providers per Volume Bucket
Shows how your providers are spread (e.g., 2100+ in 0–5 audits).

Use this to justify separating analysis by audit volume.

2. 🔲 Boxplot / Violin: Key Metrics by Volume Bucket
Compare hit_rate, recovery_rate, dispute_ratio, etc. across buckets.

Use this to detect whether high-volume providers behave differently.

3. 📉 Histogram:
For each key metric: hit_rate, recovery_rate, dispute_ratio, mr_receiving_rate.

Use this to detect skewness, outliers, or irregular patterns.

4. 🔥 Correlation Heatmap:
Understand which metrics move together.

Use this to build feature understanding for further modeling or clustering.

5. 📈 Scatter Plots (hit_rate vs recovery_rate, total_audits vs hit_rate):
Visualize whether there's a trend or clustering of poor performers.

Use this to isolate providers who show high hit rate but poor recovery (potential fraud).

6. 📊 Table of Outliers / Risk Flags
Quickly see which providers are behaving abnormally.

Use this to prepare for reporting, escalation, or re-audit.

7. 📉 Regression Plot (lmplot):
Fit a line to see relationship between hit_rate and recovery_rate.

Use this to gauge how strongly the two metrics move together; a fitted line shows association, not proof that one metric drives the other.

8. 🔵 Clustering Plot:
Behavioral segmentation using clustering.

Use this to group providers into compliant / risky / abnormal groups.

🧠 How to Interpret These in Your Use Case
Metric	If High	If Low	Insight
hit_rate	Likely accurate auditing or fraud-prone	Audits not yielding issues	High + low recovery = possible fraud
recovery_rate	Good collections	Poor collection or disputes	High hit_rate + low recovery_rate = leakage
dispute_ratio	Aggressive providers	Compliant	Combine with overturned_ratio to see if justified
overturned_ratio	Providers win disputes	Audits mostly correct	High value = review audit accuracy
mr_receiving_rate	Good cooperation	Compliance risk	Low MRR = audit resistance or delay tactics

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Load your dataset
# NOTE: placeholder path; this reload replaces any `df` built by earlier cells.
df = pd.read_csv('your_file.csv')  # Replace with actual filename

# --------------------------------------------
# ✅ Create Additional Fields for Analysis
# --------------------------------------------
# Every ratio masks a zero denominator with NaN first. A plain x / 0 yields
# inf, which would then poison the mean/std used by the z-score steps and the
# min/max columns of the summary tables.
# NOTE(review): this cell reads a `cancelled` column while other cells refer to
# `cancelled_count` — confirm which name the input file actually uses.
df['cancelled_rate'] = df['cancelled'] / df['total_audits'].replace(0, np.nan)
df['hit_rate'] = df['findings'] / df['total_audits'].replace(0, np.nan)
df['dispute_to_findings'] = df['total_disputes_overall'] / df['findings'].replace(0, np.nan)
df['overturn_to_findings'] = df['total_overturned_overall'] / df['findings'].replace(0, np.nan)
df['recovery_to_overpay'] = df['total_recovery'] / df['total_overpay'].replace(0, np.nan)
df['invoice_recovery_ratio'] = df['total_recovery'] / df['total_invoiced'].replace(0, np.nan)

# --------------------------------------------
# Summary statistics by audit bucket
# --------------------------------------------
summary_metrics = [
    'hit_rate',
    'Overall_hitrate',
    'recovery_invoice_percent',
    'Dispute_ratio_overall',
    'Overturned_ratio_overall',
    'receiving_rate_overall',
    'cancelled_rate',
    'recovery_to_overpay',
    'invoice_recovery_ratio',
]
# Five descriptive statistics per metric and bucket, rounded to 3 decimals
bucket_groups = df.groupby("audit_bucket")[summary_metrics]
summary_stats = bucket_groups.agg(['mean', 'median', 'std', 'min', 'max']).round(3)
print("📊 Summary Statistics by Audit Bucket")
print(summary_stats)

# --------------------------------------------
# Risk flags (fixed thresholds on the ratio columns)
# --------------------------------------------
df['high_hit_low_recovery'] = df['hit_rate'].gt(0.8) & df['recovery_invoice_percent'].lt(0.3)
df['high_dispute_high_overturn'] = (
    df['Dispute_ratio_overall'].gt(0.4) & df['Overturned_ratio_overall'].gt(0.5)
)
df['low_receiving'] = df['receiving_rate_overall'].lt(0.5)

# Summing booleans counts the flagged providers in each bucket
risk_flag_cols = ['high_hit_low_recovery', 'high_dispute_high_overturn', 'low_receiving']
risk_summary = df.groupby("audit_bucket")[risk_flag_cols].sum()
print("\n🚩 Risky Provider Counts by Audit Bucket")
print(risk_summary)

# --------------------------------------------
# Outlier detection (|z| > 3)
# --------------------------------------------
outlier_columns = ['hit_rate', 'recovery_invoice_percent', 'Dispute_ratio_overall']
for col in outlier_columns:
    z_name = f'z_{col}'
    flag_name = f'outlier_{col}'
    # Missing values are replaced by 0 before standardising, so they take part
    # in the mean/std as zeros.
    df[z_name] = zscore(df[col].fillna(0))
    df[flag_name] = df[z_name].abs() > 3

# Count flagged providers per bucket
zscore_flag_cols = [f'outlier_{name}' for name in outlier_columns]
outlier_summary = df.groupby('audit_bucket')[zscore_flag_cols].sum()
print("\n📉 Z-Score Outlier Counts by Audit Bucket")
print(outlier_summary)

# --------------------------------------------
# ✅ Provider Segmentation Logic
# --------------------------------------------
def segment_provider(row, overpay_p90=None, recovery_p90=None):
    """Assign one segment label to a provider row.

    Rules are evaluated in order and the first match wins:
    low-volume bucket, high performer, high risk, compliance concern,
    most impactful, otherwise standard.

    Args:
        row: mapping-like provider record (e.g. a DataFrame row) exposing
            'audit_bucket', 'hit_rate', 'recovery_invoice_percent',
            'total_audits', 'high_hit_low_recovery',
            'high_dispute_high_overturn', 'low_receiving', 'total_overpay'
            and 'total_recovery'.
        overpay_p90: precomputed 90th percentile of total_overpay. When None
            it is computed from the module-level ``df`` on demand.
        recovery_p90: precomputed 90th percentile of total_recovery, with the
            same fallback.

    Returns:
        The segment label string.
    """
    if row['audit_bucket'] == '0-5':
        return '🧪 Low Signal'
    if (row['hit_rate'] > 0.8 and row['recovery_invoice_percent'] > 0.7 and row['total_audits'] > 50):
        return '🚀 High Performer'
    if (row['high_hit_low_recovery'] or row['high_dispute_high_overturn']):
        return '⚠️ High Risk'
    if (row['low_receiving']):
        return '🧾 Compliance Concern'

    # Fallback keeps the one-argument call working, at the cost of a
    # full-column quantile per call.
    if overpay_p90 is None:
        overpay_p90 = df['total_overpay'].quantile(0.9)
    if recovery_p90 is None:
        recovery_p90 = df['total_recovery'].quantile(0.9)
    if row['total_overpay'] > overpay_p90 and row['total_recovery'] > recovery_p90:
        return '💰 Most Impactful'
    return '✅ Standard'


# The thresholds do not depend on the row, so compute them once here and pass
# them through apply() instead of re-scanning both columns for every provider.
overpay_p90 = df['total_overpay'].quantile(0.9)
recovery_p90 = df['total_recovery'].quantile(0.9)
df['provider_segment'] = df.apply(
    segment_provider, axis=1, overpay_p90=overpay_p90, recovery_p90=recovery_p90
)

print("\n🧩 Provider Segments Summary")
print(df['provider_segment'].value_counts())

# --------------------------------------------
# Top and bottom providers
# --------------------------------------------
print("\n🔝 Top 10 Providers by Hit Rate")
top_hit = df.nlargest(10, 'hit_rate')
print(top_hit[['providertaxid', 'hit_rate', 'audit_bucket', 'provider_segment']])

print("\n🔻 Bottom 10 Providers by Receiving Rate")
lowest_receiving = df.nsmallest(10, 'receiving_rate_overall')
print(lowest_receiving[['providertaxid', 'receiving_rate_overall', 'audit_bucket', 'provider_segment']])

# --------------------------------------------
# High volume & high recovery providers
# --------------------------------------------
is_high_volume = df['total_audits'] > 50
is_high_recovery = df['recovery_invoice_percent'] > 0.8
high_vol_high_recovery = df[is_high_volume & is_high_recovery]
print("\n💰 High Volume & High Recovery Providers")
high_vol_cols = [
    'providertaxid', 'total_audits', 'recovery_invoice_percent',
    'audit_bucket', 'provider_segment',
]
print(high_vol_high_recovery[high_vol_cols])

# --------------------------------------------
# Provider count by bucket
# --------------------------------------------
# value_counts() orders by frequency; the two columns are renamed explicitly
bucket_counts = df['audit_bucket'].value_counts().reset_index()
bucket_counts.columns = ['audit_bucket', 'provider_count']
print("\n📦 Provider Counts per Audit Bucket")
print(bucket_counts)


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

# ---------------------
# Key metrics examined in this cell
summary_metrics = [
    'hit_rate',
    'Overall_hitrate',
    'recovery_invoice_percent',
    'Dispute_ratio_overall',
    'Overturned_ratio_overall',
    'receiving_rate_overall',
    'cancelled_rate',
    'recovery_to_overpay',
    'invoice_recovery_ratio',
]

# Summary statistics by audit bucket
bucket_stat_names = ['mean', 'median', 'std', 'min', 'max']
summary_stats = df.groupby("audit_bucket")[summary_metrics].agg(bucket_stat_names).round(3)
print("📊 Summary Statistics by Audit Bucket:\n", summary_stats)

# The remaining tables all look at the same metric columns across all providers
metric_frame = df[summary_metrics]

# Skewness & kurtosis, one row per metric
skew_kurt = metric_frame.agg([skew, kurtosis]).T.round(2)
print("\n📈 Skewness & Kurtosis:\n", skew_kurt)

# Key percentiles (10th-99th), one row per metric
percentile_points = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
percentiles = metric_frame.quantile(percentile_points).T.round(3)
print("\n📊 Key Percentiles (10th–99th):\n", percentiles)

# Number of exact zeros per metric, most zero-heavy first
zero_counts = metric_frame.eq(0).sum().sort_values(ascending=False)
print("\n🧯 Metrics with Many Zero Values:\n", zero_counts)

# Pairwise correlation matrix
correlations = metric_frame.corr().round(2)
print("\n🔗 Correlation Matrix:\n", correlations)

# 📉 Top vs Bottom 10% Providers Comparison by hit_rate
# Ranking with method='first' breaks ties, so qcut always gets unique bin
# edges; decile 0 is the lowest tenth of hit_rate and decile 9 the highest.
df['decile'] = pd.qcut(df['hit_rate'].rank(method='first'), 10, labels=False)

# Average only the metric columns. Calling .mean() on the whole frame also
# covers the text columns (bucket, segment labels), which raises TypeError on
# pandas >= 2.0.
top_decile = df.loc[df['decile'] == 9, summary_metrics].mean()
bottom_decile = df.loc[df['decile'] == 0, summary_metrics].mean()

decile_comparison = pd.DataFrame({
    'Top 10% Providers': top_decile,
    'Bottom 10% Providers': bottom_decile
}).round(3)

print("\n📉 Top vs Bottom 10% Providers Comparison:\n", decile_comparison)


In [None]:
import pandas as pd
from scipy.stats import zscore

# Load your dataset here if running locally
# df = pd.read_csv("your_data.csv")

# Behavioural metrics to standardise
metrics_to_check = [
    'hit_rate',
    'recovery_invoice_percent',
    'Dispute_ratio_overall',
    'Overturned_ratio_overall',
    'receiving_rate_overall',
]

# Pass 1: z-score per metric (missing values are treated as 0 beforehand)
for metric in metrics_to_check:
    filled = df[metric].fillna(0)
    df[f'{metric}_z'] = zscore(filled)

# Pass 2: statistical outlier wherever the z-score lies beyond +/-2
for metric in metrics_to_check:
    df[f'{metric}_outlier'] = df[f'{metric}_z'].abs() > 2

# Risky behaviour patterns expressed on the standardised scale
hit_z = df['hit_rate_z']
recovery_z = df['recovery_invoice_percent_z']
dispute_z = df['Dispute_ratio_overall_z']
overturn_z = df['Overturned_ratio_overall_z']
receiving_z = df['receiving_rate_overall_z']

df['risky_hit_low_recovery'] = (hit_z > 1.5) & (recovery_z < -1.0)
df['aggressive_disputer'] = (dispute_z > 1.5) & (overturn_z > 1.5)
df['compliance_issue'] = receiving_z < -1.5

# Preview of the three behaviour flags next to the provider id
risky_flags = df[['providertaxid', 'risky_hit_low_recovery', 'aggressive_disputer', 'compliance_issue']]
print(risky_flags.head())


In [None]:
import pandas as pd
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns

# STEP 1: Choose metrics for outlier detection
metrics_to_flag = [
    'hit_rate',
    'recovery_invoice_percent',
    'Dispute_ratio_overall',
    'Overturned_ratio_overall',
    'receiving_rate_overall',
    'total_overpay',
    'total_recovery'
]

# STEP 2: Calculate Z-scores for each metric
# zscore() propagates NaN by default: a single missing value makes the mean
# and std NaN, which turns the whole column's z-scores into NaN and silently
# disables that metric's outlier flag. nan_policy='omit' computes mean/std
# from the values that are present; a missing cell still gets a NaN z-score
# and therefore is never flagged itself.
z_scores = df[metrics_to_flag].apply(zscore, nan_policy='omit')

# Set threshold (2 = moderate outlier, 3 = strong outlier)
threshold = 3
outlier_flags = (z_scores.abs() > threshold)

# Add individual outlier flags as new columns
for col in metrics_to_flag:
    df[f'outlier_{col}'] = outlier_flags[col]

# STEP 3: Total number of outlier flags per provider
df['outlier_score'] = outlier_flags.sum(axis=1)

# STEP 4: Label providers as Extreme if 2 or more outlier behaviors
df['extreme_flag'] = df['outlier_score'].apply(lambda x: '🚩 Extreme' if x >= 2 else '✅ Normal')

# STEP 5: View top extreme providers, highest flag count first
extreme_providers = df[df['extreme_flag'] == '🚩 Extreme'].sort_values(by='outlier_score', ascending=False)

print("\n🚨 Top Extreme Providers Based on Outlier Score:")
print(extreme_providers[['providertaxid', 'outlier_score'] + [f'outlier_{c}' for c in metrics_to_flag]].head(10))

# STEP 6: Optional boxplot comparing hit_rate between the two outlier segments
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='extreme_flag', y='hit_rate', data=df, ax=ax)
ax.set_title("🎯 Hit Rate Distribution: Extreme vs Normal Providers")
ax.set_xlabel("Outlier Segment")
ax.set_ylabel("Hit Rate")
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Plot defaults shared by the figures in this cell
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Ratio-style metrics examined throughout the cell
key_metrics = [
    'hit_rate',
    'recovery_invoice_percent',
    'Dispute_ratio_overall',
    'Overturned_ratio_overall',
    'receiving_rate_overall',
]

# Buckets listed from lowest to highest volume so the x-axis reads in that order
bucket_order = ['0-5', '5-15', '15-50', '50-100', '100+']

# Provider count per audit bucket
plt.figure()
count_ax = sns.countplot(data=df, x='audit_bucket', order=bucket_order, palette='viridis')
count_ax.set_title('Provider Count by Audit Bucket')
count_ax.set_xlabel('Audit Bucket')
count_ax.set_ylabel('Number of Providers')
plt.tight_layout()
plt.show()

# One boxplot per metric, split by audit bucket
for metric in key_metrics:
    plt.figure()
    box_ax = sns.boxplot(x='audit_bucket', y=metric, data=df, order=bucket_order, palette='pastel')
    box_ax.set_title(f'{metric} by Audit Bucket')
    box_ax.set_xlabel('Audit Volume Bucket')
    box_ax.set_ylabel(metric.replace('_', ' ').title())
    plt.tight_layout()
    plt.show()

# Histogram with KDE overlay for each metric
for metric in key_metrics:
    plt.figure()
    hist_ax = sns.histplot(df[metric], bins=30, kde=True, color='skyblue')
    hist_ax.set_title(f'Distribution of {metric}')
    hist_ax.set_xlabel(metric.replace('_', ' ').title())
    hist_ax.set_ylabel('Frequency')
    plt.tight_layout()
    plt.show()

# Violin plot per metric: distribution shape within each bucket, with quartile lines
violin_order = ['0-5', '5-15', '15-50', '50-100', '100+']
for metric in key_metrics:
    plt.figure()
    violin_ax = sns.violinplot(
        data=df,
        x='audit_bucket',
        y=metric,
        order=violin_order,
        inner='quartile',
        palette='muted',
    )
    violin_ax.set_title(f'{metric} Violin Plot by Bucket')
    violin_ax.set_xlabel('Audit Volume Bucket')
    violin_ax.set_ylabel(metric.replace('_', ' ').title())
    plt.tight_layout()
    plt.show()

# Correlation heatmap across the key metrics
corr = df[key_metrics].corr()
fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=heat_ax)
heat_ax.set_title('Correlation Between Key Provider Metrics')
plt.tight_layout()
plt.show()

# Scatter plots between key metrics, coloured by audit bucket

# Hit rate vs recovery
plt.figure()
recovery_ax = sns.scatterplot(
    x='hit_rate', y='recovery_invoice_percent', hue='audit_bucket', palette='Dark2', data=df
)
recovery_ax.set_title('Hit Rate vs Recovery %')
recovery_ax.set_xlabel('Hit Rate')
recovery_ax.set_ylabel('Recovery %')
recovery_ax.legend(title='Audit Bucket')
plt.tight_layout()
plt.show()

# Dispute vs overturn
plt.figure()
dispute_ax = sns.scatterplot(
    x='Dispute_ratio_overall', y='Overturned_ratio_overall', hue='audit_bucket',
    palette='Set1', data=df
)
dispute_ax.set_title('Dispute Ratio vs Overturned Ratio')
dispute_ax.set_xlabel('Dispute Ratio')
dispute_ax.set_ylabel('Overturned Ratio')
plt.tight_layout()
plt.show()

# Hit rate vs receiving rate
plt.figure()
receiving_ax = sns.scatterplot(
    x='hit_rate', y='receiving_rate_overall', hue='audit_bucket', palette='Set2', data=df
)
receiving_ax.set_title('Hit Rate vs Receiving Rate')
receiving_ax.set_xlabel('Hit Rate')
receiving_ax.set_ylabel('MR Receiving Rate')
plt.tight_layout()
plt.show()

# 📉 Outlier Detection using Z-scores
# nan_policy='omit' keeps one missing value from turning a whole metric's
# z-scores into NaN (the default 'propagate' behaviour), which would silently
# drop that metric from every provider's score below.
z_df = df[key_metrics].apply(zscore, nan_policy='omit')
# Score = sum of absolute z-scores across the key metrics; sum() skips the NaN
# z-scores of cells that were missing in the first place.
df['outlier_score'] = z_df.abs().sum(axis=1)
df['extreme_flag'] = df['outlier_score'].apply(lambda x: '🚩 Extreme' if x > 8 else '✅ Normal')

# Boxplot to compare hit_rate for flagged vs normal
plt.figure()
sns.boxplot(data=df, x='extreme_flag', y='hit_rate', palette='Set3')
plt.title('Hit Rate: Extreme vs Normal Providers')
plt.tight_layout()
plt.show()

# 📊 Bar plot of provider count by flag
plt.figure()
sns.countplot(data=df, x='extreme_flag', palette='Set1')
plt.title("Flagged Extreme vs Normal Providers")
plt.tight_layout()
plt.show()

# 🔍 Grouped bar chart of the per-bucket metric means (a stand-in for a radar
# chart): transposing puts metrics on the x-axis with one bar per bucket.
radar_data = df.groupby('audit_bucket')[key_metrics].mean().T
radar_data.plot(kind='bar', figsize=(12, 6), colormap='viridis')
plt.title("Average Normalized Metrics by Audit Bucket")
plt.ylabel('Mean Metric Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Plot defaults for this cell
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# `df` is expected to be loaded already; uncomment to load it here instead
# df = pd.read_csv("your_file.csv")

# Metric groups: ratio-style columns vs raw volume/dollar columns
normalized_metrics = [
    "hit_rate",
    "recovery_invoice_percent",
    "Dispute_ratio_overall",
    "Overturned_ratio_overall",
    "receiving_rate_overall",
]
volume_metrics = [
    "total_overpay",
    "total_recovery",
    "findings",
    "no_findings",
    "cancelled_count",
    "total_audits",
]

identifier_column = "providertaxid"

# 1) Histogram of each normalized metric, with its mean marked as a dashed red line
for col in normalized_metrics:
    plt.figure()
    hist_ax = sns.histplot(df[col], bins=30, kde=True)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")
    hist_ax.axvline(df[col].mean(), color='red', linestyle='--', label='Mean')
    hist_ax.legend()
    plt.show()

# Buckets in ascending volume order. Without an explicit order the boxplots
# follow the order of appearance in the data, and the grouped bar chart
# follows the lexicographic order produced by groupby
# ("0-5", "100+", "15-50", ...), which makes the charts hard to compare.
bucket_order = ["0-5", "5-15", "15-50", "50-100", "100+"]

# 2️⃣ Boxplots by Audit Bucket
for col in normalized_metrics:
    plt.figure()
    sns.boxplot(x="audit_bucket", y=col, data=df, order=bucket_order)
    plt.title(f"{col} by Audit Bucket")
    plt.xticks(rotation=45)
    plt.show()

# 3️⃣ Correlation Heatmap
plt.figure(figsize=(10, 6))
corr = df[normalized_metrics + volume_metrics].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

# 4️⃣ Provider Counts by Bucket
plt.figure()
sns.countplot(x="audit_bucket", data=df, order=bucket_order)
plt.title("Number of Providers by Audit Volume Bucket")
plt.ylabel("Provider Count")
plt.xlabel("Audit Bucket")
plt.show()

# 5️⃣ Scatter: Hit Rate vs Recovery Rate (marker size scales with total_audits)
plt.figure()
sns.scatterplot(
    x="hit_rate", y="recovery_invoice_percent", hue="audit_bucket",
    size="total_audits", sizes=(20, 200), data=df, alpha=0.7
)
plt.title("Hit Rate vs Recovery Rate by Audit Bucket")
# Anchor the legend outside the axes so it does not cover the points
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# 6️⃣ Bar: Total Recovery by Bucket
recovery_bucket = df.groupby("audit_bucket")["total_recovery"].sum().reset_index()
plt.figure()
sns.barplot(x="audit_bucket", y="total_recovery", data=recovery_bucket, order=bucket_order)
plt.title("Total Recovery by Audit Bucket")
plt.show()

# 7) Overturn vs dispute relationship, coloured by audit bucket
plt.figure()
overturn_ax = sns.scatterplot(
    data=df,
    x="Dispute_ratio_overall",
    y="Overturned_ratio_overall",
    hue="audit_bucket",
    alpha=0.7,
)
overturn_ax.set_title("Overturn Ratio vs Dispute Ratio")
plt.show()

# 8) One boxplot per volume metric across all providers
for col in volume_metrics:
    plt.figure()
    volume_ax = sns.boxplot(y=col, data=df)
    volume_ax.set_title(f"Distribution of {col}")
    plt.show()


In [None]:
# 📦 Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import squarify
import warnings
# Installs a process-wide "ignore" filter with no category restriction, so
# every warning raised after this point is hidden.
warnings.filterwarnings("ignore")

# 🛠️ Radar Chart Function
def radar_chart(df, group_col, metrics):
    """Draw one radar (polar) outline per group showing its mean of each metric.

    Args:
        df: DataFrame containing ``group_col`` and every column in ``metrics``.
        group_col: column whose distinct values define the groups.
        metrics: column names to average; each becomes one spoke.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    categories = list(metrics)

    # Labels and values both come from the aggregated frame. groupby sorts the
    # keys and drops NaN groups, so pairing its rows positionally with
    # df[group_col].unique() (appearance order, NaN included) would attach
    # the wrong label to a series or fail on a missing row.
    df_avg = df.groupby(group_col)[categories].mean()

    # Spoke angles are the same for every group; repeat the first to close the loop.
    angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
    angles += angles[:1]

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(1, 1, 1, polar=True)
    for label, group_means in df_avg.iterrows():
        values = group_means.tolist()
        values += values[:1]  # close the polygon
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=str(label))
        ax.fill(angles, values, alpha=0.1)
    plt.xticks(angles[:-1], categories, color='black', size=10)
    plt.title(f"Radar Chart by {group_col}", size=15)
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.show()

# Joint KDE of hit rate vs recovery rate, one filled density per audit bucket
sns.jointplot(
    x='hit_rate',
    y='recovery_invoice_percent',
    hue='audit_bucket',
    data=df,
    kind='kde',
    fill=True,
)
plt.suptitle("Joint KDE: Hit Rate vs Recovery Rate", y=1.02)
plt.show()

# Treemap of the 50 providers with the largest total recovery;
# tile area follows total_recovery and tiles are labelled with the provider id
top_providers = df.nlargest(50, 'total_recovery')
sizes = top_providers['total_recovery']
labels = top_providers['providertaxid'].astype(str)

plt.figure(figsize=(12, 8))
squarify.plot(sizes=sizes, label=labels, alpha=.8)
plt.title("Top 50 Providers by Total Recovery")
plt.axis('off')
plt.show()

# Interactive scatter: marker size follows total_overpay, colour follows bucket
hover_columns = ["providertaxid", "total_audits", "total_recovery"]
fig = px.scatter(
    df,
    x="hit_rate",
    y="recovery_invoice_percent",
    size="total_overpay",
    color="audit_bucket",
    hover_data=hover_columns,
)
fig.update_layout(title="Interactive Scatter: Hit Rate vs Recovery Rate")
fig.show()

# 🔍 Clustering with KMeans
cluster_metrics = [
    'hit_rate', 'recovery_invoice_percent', 'Dispute_ratio_overall',
    'Overturned_ratio_overall', 'receiving_rate_overall'
]

# Only rows with every clustering metric present take part in the fit
X = df[cluster_metrics].dropna()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=4, random_state=42)
cluster_ids = kmeans.fit_predict(X_scaled)
# Write labels back only to the rows that were clustered
df.loc[X.index, 'cluster'] = cluster_ids

# 📉 Visualizing Clusters with PCA
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

# Row k of pca_components belongs to X.iloc[k]. Keeping X's index on df_pca
# and assigning the label array positionally avoids index alignment against a
# fresh 0..n-1 index, which mis-assigns labels (or yields NaN) as soon as
# dropna() removed any row.
df_pca = pd.DataFrame(pca_components, columns=['PC1', 'PC2'], index=X.index)
df_pca['cluster'] = cluster_ids

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='cluster', palette='tab10')
plt.title("Provider Clusters (PCA)")
plt.show()

# Radar chart of the mean metric profile of each cluster
cluster_profile = df.groupby("cluster")[cluster_metrics].mean()
radar_chart(df=cluster_profile.reset_index(), group_col='cluster', metrics=cluster_metrics)

# Optional: the same profile view grouped by audit bucket
radar_chart(df=df, group_col='audit_bucket', metrics=cluster_metrics)
