In [2]:
# Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load data
# Load the Excel file
df = pd.read_excel(r'C:\Users\Raju Naik\Downloads\Sample_WBC.xlsx')    

# Optional: Clean column names by removing extra spaces
df.columns = df.columns.str.strip()

# Step 2: Convert date columns to datetime format
date_cols = [
    'Date of Service', 'Primary First Filed Date', 'Date of Batch',
    'Last Payment Date', 'Date Paid in Full'
]
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors='coerce')

# Step 3: Feature Engineering
df['days_to_paid_in_full'] = (df['Date Paid in Full'] - df['Date of Service']).dt.days
df['filed_delay'] = (df['Primary First Filed Date'] - df['Date of Service']).dt.days
df['batch_delay'] = (df['Date of Batch'] - df['Date of Service']).dt.days
df['last_payment_gap'] = (df['Last Payment Date'] - df['Date of Service']).dt.days
df['partial_payment_ratio'] = df['Total Payments Amount'] / df['CHARGE $']
df['balance_remaining'] = df['CHARGE $'] - df['Total Payments Amount']
df['payment_count'] = df[['First Payment Amount', 'Second Payment Amount', 'Third Payment Amount']].gt(0).sum(axis=1)
df['has_secondary'] = df['Secondary Insurance Code'].notnull().astype(int)

# Step 4: Select features
base_features = [
    'filed_delay', 'batch_delay', 'last_payment_gap', 'partial_payment_ratio',
    'balance_remaining', 'payment_count', 'has_secondary'
]

# Additional features to test
additional_features = [
    'CPT Code', 'Units', 'CHARGE $', 'WBC Insurance Expected\'s', 'Primary Insurance Allowed',
    'Primary Insurance Carrier Name', 'Secondary Carrier Name', 'Class',
    'Provider of Service Name', 'Referrer of Service Name', 'Location of Service Name',
    'Provider Category Code'
]

all_features = base_features + additional_features

# Step 5: Prepare model input (training data)
model_df = df.dropna(subset=all_features + ['days_to_paid_in_full']).copy()
X = model_df[all_features]
y = model_df['days_to_paid_in_full']

# Identify categorical columns
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = [f for f in all_features if f not in cat_features]

# Step 6: Build pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split data and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Evaluate
preds = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, preds)
print(f"Mean Absolute Error on test set: {mae:.2f} days")

# Step 7: Predict for current records
predict_df = df[df['days_to_paid_in_full'].isnull()].copy()
X_predict = predict_df[all_features].copy()

# Fill missing values
for col in cat_features:
    X_predict[col] = X_predict[col].fillna('Unknown')
for col in num_features:
    X_predict[col] = X_predict[col].fillna(X_train[col].median())

predicted_days = pipeline.predict(X_predict)
predict_df['Predicted Date Paid in Full'] = predict_df['Date of Service'] + pd.to_timedelta(predicted_days, unit='D')
predict_df['Predicted Date Paid in Full'] = predict_df['Predicted Date Paid in Full'].dt.date

# Output prediction sample
print(predict_df[['ACCOUNT #', 'Date of Service', 'Predicted Date Paid in Full']])


Mean Absolute Error on test set: 7.83 days
    ACCOUNT # Date of Service Predicted Date Paid in Full
1   ACC.74931      2024-01-08                  2024-04-14
15  ACC.59263      2024-01-31                  2024-05-03
17  ACC.62339      2024-01-17                  2024-04-23
20  ACC.25606      2024-01-09                  2024-04-16
25  ACC.45773      2024-01-18                  2024-04-23
26  ACC.11793      2024-01-15                  2024-04-19
36  ACC.66934      2024-01-29                  2024-05-04
40  ACC.98732      2024-01-31                  2024-05-03
44  ACC.97554      2024-01-15                  2024-04-20
46  ACC.79925      2024-01-22                  2024-04-25
49  ACC.49725      2024-01-18                  2024-04-18
52  ACC.65448      2024-01-08                  2024-04-11
54  ACC.84319      2024-01-21                  2024-04-25
58  ACC.68900      2024-01-03                  2024-04-08
59  ACC.27865      2024-01-09                  2024-04-13
64  ACC.64117      2024-01-15