Import Required Libraries

In [2]:
# Import standard libraries for data manipulation and machine learning
import pandas as pd
import numpy as np

# scikit-learn modules for model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


Load Data and Clean Column Names

In [3]:
# Load Excel file into a DataFrame
df = pd.read_excel(r'C:\Users\Raju Naik\Downloads\Sample_WBC.xlsx')

# Clean column names by stripping leading/trailing whitespaces
df.columns = df.columns.str.strip()

# Display first few rows to verify
df.head()


Unnamed: 0,ACCOUNT #,Date of Service,CPT Code,Modifier Codes,Units,CHARGE $,WBC Insurance Expected's,Primary Insurance Allowed,Total Payments Amount,Due from Patient,...,Medical Record Number,Date of Batch,Primary First Filed Date,Date Paid in Full,Last Payment Date,Provider Category Code,First Payment Amount,Second Payment Amount,Third Payment Amount,Last Payment Amount
0,ACC.60738,2024-01-20,99100,QZQSP1,8.9,5571.4,793.47,730.38,-730.38,0,...,3083012,2024-01-18,2024-01-18,2024-03-21,2024-03-21,O43,-730.38,0,0,-730.38
1,ACC.74931,2024-01-08,99100,P1,8.7,4358.7,1260.66,1168.99,-1168.99,0,...,9288595,2024-01-08,2024-01-08,NaT,NaT,O43,-1168.99,0,0,-1168.99
2,ACC.85341,2024-01-25,731,P1,2.7,1082.7,181.9,166.81,-166.81,0,...,7450913,2024-01-08,2024-01-08,2024-04-27,2024-04-27,O43,-166.81,0,0,-166.81
3,ACC.32508,2024-01-24,99100,QZQSP1,6.7,3959.7,1806.14,1718.93,-1718.93,0,...,7961927,2024-01-11,2024-01-11,2024-08-04,2024-08-04,5,-1718.93,0,0,-1718.93
4,ACC.29915,2024-01-26,731,QZQSP1,1.2,169.2,87.9,86.43,-86.43,0,...,2896099,2024-01-30,2024-01-30,2024-03-07,2024-03-07,O43,-86.43,0,0,-86.43


Convert Date Columns to datetime Format

In [4]:
# Convert relevant columns to datetime format to enable date arithmetic
date_cols = [
    'Date of Service', 'Primary First Filed Date', 'Date of Batch',
    'Last Payment Date', 'Date Paid in Full'
]

df[date_cols] = df[date_cols].apply(pd.to_datetime, errors='coerce')


Feature Engineering

In [5]:
# Create time gap features
df['days_to_paid_in_full'] = (df['Date Paid in Full'] - df['Date of Service']).dt.days
df['filed_delay'] = (df['Primary First Filed Date'] - df['Date of Service']).dt.days
df['batch_delay'] = (df['Date of Batch'] - df['Date of Service']).dt.days
df['last_payment_gap'] = (df['Last Payment Date'] - df['Date of Service']).dt.days

# Payment related features
df['partial_payment_ratio'] = df['Total Payments Amount'] / df['CHARGE $']
df['balance_remaining'] = df['CHARGE $'] - df['Total Payments Amount']
df['payment_count'] = df[['First Payment Amount', 'Second Payment Amount', 'Third Payment Amount']].gt(0).sum(axis=1)

# Insurance-related feature
df['has_secondary'] = df['Secondary Insurance Code'].notnull().astype(int)


Define Feature Set

In [6]:
# Base features used for model training
base_features = [
    'filed_delay', 'batch_delay', 'last_payment_gap', 'partial_payment_ratio',
    'balance_remaining', 'payment_count', 'has_secondary'
]

# Additional features can be added here and can be engineered if needed (e.g., categorical columns)
# additional_features = [
#     'CPT Code', 'Units', 'CHARGE $', 'WBC Insurance Expected\'s', 'Primary Insurance Allowed',
#     'Primary Insurance Carrier Name', 'Secondary Carrier Name', 'Class',
#     'Provider of Service Name', 'Referrer of Service Name', 'Location of Service Name',
#     'Provider Category Code'
# ]

additional_features = []  # Placeholder for additional features, if needed

# Combine base + additional features
all_features = base_features + additional_features


Prepare Data for Model Training

In [7]:
# Drop rows with missing target or required features
model_df = df.dropna(subset=all_features + ['days_to_paid_in_full']).copy()

# Split features and target
X = model_df[all_features]
y = model_df['days_to_paid_in_full']

# Identify categorical and numeric columns
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = [f for f in all_features if f not in cat_features]


Build the Preprocessing and Modeling Pipeline

In [8]:
# Create preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),                         # Scale numerical features
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)   # One-hot encode categorical features
])

# Combine preprocessor with Random Forest Regressor in a single pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])


Split Data and Train Model

In [9]:
# Split into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on training data
pipeline.fit(X_train, y_train)


Evaluate Model Performance

In [10]:
# Make predictions on test data
preds = pipeline.predict(X_test)

# Evaluate performance using Mean Absolute Error
mae = mean_absolute_error(y_test, preds)
print(f"Mean Absolute Error on test set: {mae:.2f} days")


Mean Absolute Error on test set: 2.47 days


Predict on Unlabeled Records

In [11]:
# Select records with missing actual payment date (target)
predict_df = df[df['days_to_paid_in_full'].isnull()].copy()
X_predict = predict_df[all_features].copy()

# Fill missing values in categorical and numerical columns
for col in cat_features:
    X_predict[col] = X_predict[col].fillna('Unknown')
for col in num_features:
    X_predict[col] = X_predict[col].fillna(X_train[col].median())


Generate Predictions

In [12]:
# Predict number of days to be paid in full
predicted_days = pipeline.predict(X_predict)

# Add predictions to DataFrame
predict_df['Predicted Date Paid in Full'] = predict_df['Date of Service'] + pd.to_timedelta(predicted_days, unit='D')
predict_df['Predicted Date Paid in Full'] = predict_df['Predicted Date Paid in Full'].dt.date


Output Final Results

In [13]:
# Display key prediction results
predict_df[['ACCOUNT #', 'Date of Service', 'Predicted Date Paid in Full']]


Unnamed: 0,ACCOUNT #,Date of Service,Predicted Date Paid in Full
1,ACC.74931,2024-01-08,2024-04-09
15,ACC.59263,2024-01-31,2024-05-02
17,ACC.62339,2024-01-17,2024-04-19
20,ACC.25606,2024-01-09,2024-04-10
25,ACC.45773,2024-01-18,2024-04-19
26,ACC.11793,2024-01-15,2024-04-17
36,ACC.66934,2024-01-29,2024-04-30
40,ACC.98732,2024-01-31,2024-05-01
44,ACC.97554,2024-01-15,2024-04-17
46,ACC.79925,2024-01-22,2024-04-23
