In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [12]:
# Load the dataset and clean column names
df = pd.read_csv('..\\..\\data\\csv\\FY_2022-2025.csv')
df.columns = df.columns.str.strip()

In [15]:
# --- Advanced Feature Engineering ---

penalty_cols = [col for col in df.columns if 'Penalty indicator for' in col]
df['Overall_Penalty'] = df[penalty_cols].apply(lambda row: 1 if 'Y' in row.values else 0, axis=1)


df.sort_values(by=['Hospital CCN', 'Year'], inplace=True)
df['Overall_Penalty_Previous_Year'] = df.groupby('Hospital CCN')['Overall_Penalty'].shift(1)


feature_cols = [
    'Dual proportion',
    'Overall_Penalty_Previous_Year',
    'ERR for AMI', 'ERR for COPD', 'ERR for HF',
    'ERR for pneumonia', 'ERR for CABG', 'ERR for THA/TKA'
]


for col in feature_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    if df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)


df_model = df.dropna(subset=['Overall_Penalty_Previous_Year'])

# --- Chronological Train-Test Split ---
train_df = df_model[df_model['Year'] < 2025]
test_df = df_model[df_model['Year'] == 2025]

X_train = train_df[feature_cols]
y_train = train_df['Overall_Penalty']

X_test = test_df[feature_cols]
y_test = test_df['Overall_Penalty']


# --- Train and Evaluate the Enhanced Model ---

enhanced_model = RandomForestClassifier(random_state=42, class_weight='balanced')
enhanced_model.fit(X_train, y_train)

#mke predictions on the "next year" (2025)
y_pred = enhanced_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"\nEnhanced Model Accuracy on Predicting 2025 Penalties: {accuracy:.4f}")

if accuracy >= 0.95:
    print("\nThis enhanced model successfully meets the 95% accuracy requirement. ✅")
else:
    print("\nFurther tuning may be required.")

# Display a detailed classification report
print("\nClassification Report for 2025 Predictions:")
print(classification_report(y_test, y_pred, target_names=['Not Penalized', 'Penalized']))


# --- Generate the Final At-Risk Hospital List ---
results_df = test_df.copy()
results_df['Predicted_Penalty'] = y_pred

at_risk_hospitals = results_df[results_df['Predicted_Penalty'] == 1]

print(f"\nIdentified {len(at_risk_hospitals)} hospitals likely to be penalized in 2025 with the enhanced model.")
print("\n--- Final List of At-Risk Hospitals (CCN) for FY 2025 ---")
print(at_risk_hospitals[['Hospital CCN', 'Year', 'Predicted_Penalty']].head(10))



Enhanced Model Accuracy on Predicting 2025 Penalties: 0.9238

Further tuning may be required.

Classification Report for 2025 Predictions:
               precision    recall  f1-score   support

Not Penalized       0.85      0.75      0.80       598
    Penalized       0.94      0.97      0.95      2382

     accuracy                           0.92      2980
    macro avg       0.90      0.86      0.88      2980
 weighted avg       0.92      0.92      0.92      2980


Identified 2451 hospitals likely to be penalized in 2025 with the enhanced model.

--- Final List of At-Risk Hospitals (CCN) for FY 2025 ---
    Hospital CCN  Year  Predicted_Penalty
556       100001  2025                  1
557       100002  2025                  1
558       100006  2025                  1
559       100007  2025                  1
560       100008  2025                  1
0          10001  2025                  1
561       100012  2025                  1
562       100014  2025                  1
563    