In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("bill charge.csv")

In [3]:
df

Unnamed: 0,Date,Patient Name,Age,Bill Charge
0,1/1/2023,Bob,33,100.5
1,1/4/2023,Bob,24,250.0
2,1/7/2023,Bob,56,75.0
3,1/7/2023,Eve,40,300.0
4,1/9/2023,Charlie,40,150.5
5,1/10/2023,Charlie,24,200.0
6,1/11/2023,Bob,40,175.0
7,1/11/2023,Eve,40,400.0
8,1/11/2023,Bob,40,120.0
9,1/12/2023,Charlie,42,180.0


In [7]:
df['Date'] = pd.to_datetime(df['Date'])
# Extract date features
df['month'] = df['Date'].dt.month
df['dayofweek'] = df['Date'].dt.dayofweek

In [8]:
df

Unnamed: 0,Date,Patient Name,Age,Bill Charge,month,dayofweek
0,2023-01-01,Bob,33,100.5,1,6
1,2023-01-04,Bob,24,250.0,1,2
2,2023-01-07,Bob,56,75.0,1,5
3,2023-01-07,Eve,40,300.0,1,5
4,2023-01-09,Charlie,40,150.5,1,0
5,2023-01-10,Charlie,24,200.0,1,1
6,2023-01-11,Bob,40,175.0,1,2
7,2023-01-11,Eve,40,400.0,1,2
8,2023-01-11,Bob,40,120.0,1,2
9,2023-01-12,Charlie,42,180.0,1,3


In [9]:
df['lag_1_bill_charge'] = df['Bill Charge'].shift(1)
df['lag_7_bill_charge'] = df['Bill Charge'].shift(7)


In [10]:
df

Unnamed: 0,Date,Patient Name,Age,Bill Charge,month,dayofweek,lag_1_bill_charge,lag_7_bill_charge
0,2023-01-01,Bob,33,100.5,1,6,,
1,2023-01-04,Bob,24,250.0,1,2,100.5,
2,2023-01-07,Bob,56,75.0,1,5,250.0,
3,2023-01-07,Eve,40,300.0,1,5,75.0,
4,2023-01-09,Charlie,40,150.5,1,0,300.0,
5,2023-01-10,Charlie,24,200.0,1,1,150.5,
6,2023-01-11,Bob,40,175.0,1,2,200.0,
7,2023-01-11,Eve,40,400.0,1,2,175.0,100.5
8,2023-01-11,Bob,40,120.0,1,2,400.0,250.0
9,2023-01-12,Charlie,42,180.0,1,3,120.0,75.0


In [11]:
df['rolling_mean_7_bill_charge'] = df['Bill Charge'].rolling(window=7, center=True).mean()
df['rolling_std_7_bill_charge'] = df['Bill Charge'].rolling(window=7, center=True).std()
df

Unnamed: 0,Date,Patient Name,Age,Bill Charge,month,dayofweek,lag_1_bill_charge,lag_7_bill_charge,rolling_mean_7_bill_charge,rolling_std_7_bill_charge
0,2023-01-01,Bob,33,100.5,1,6,,,,
1,2023-01-04,Bob,24,250.0,1,2,100.5,,,
2,2023-01-07,Bob,56,75.0,1,5,250.0,,,
3,2023-01-07,Eve,40,300.0,1,5,75.0,,178.714286,79.507786
4,2023-01-09,Charlie,40,150.5,1,0,300.0,,221.5,106.430337
5,2023-01-10,Charlie,24,200.0,1,1,150.5,,202.928571,111.833369
6,2023-01-11,Bob,40,175.0,1,2,200.0,,217.928571,98.001033
7,2023-01-11,Eve,40,400.0,1,2,175.0,100.5,187.928571,100.792869
8,2023-01-11,Bob,40,120.0,1,2,400.0,250.0,173.571429,113.384092
9,2023-01-12,Charlie,42,180.0,1,3,120.0,75.0,148.571429,125.256879


In [14]:
df['expanding_mean_bill_charge'] = df['Bill Charge'].expanding().mean()
df

Unnamed: 0,Date,Patient Name,Age,Bill Charge,month,dayofweek,lag_1_bill_charge,lag_7_bill_charge,rolling_mean_7_bill_charge,rolling_std_7_bill_charge,expanding_mean_bill_charge
0,2023-01-01,Bob,33,100.5,1,6,,,,,100.5
1,2023-01-04,Bob,24,250.0,1,2,100.5,,,,175.25
2,2023-01-07,Bob,56,75.0,1,5,250.0,,,,141.833333
3,2023-01-07,Eve,40,300.0,1,5,75.0,,178.714286,79.507786,181.375
4,2023-01-09,Charlie,40,150.5,1,0,300.0,,221.5,106.430337,175.2
5,2023-01-10,Charlie,24,200.0,1,1,150.5,,202.928571,111.833369,179.333333
6,2023-01-11,Bob,40,175.0,1,2,200.0,,217.928571,98.001033,178.714286
7,2023-01-11,Eve,40,400.0,1,2,175.0,100.5,187.928571,100.792869,206.375
8,2023-01-11,Bob,40,120.0,1,2,400.0,250.0,173.571429,113.384092,196.777778
9,2023-01-12,Charlie,42,180.0,1,3,120.0,75.0,148.571429,125.256879,195.1


In [15]:
# Fill missing values introduced by shifting/rolling
df.fillna(method='bfill', inplace=True)  # Or use 'ffill' or other methods

In [16]:
df

Unnamed: 0,Date,Patient Name,Age,Bill Charge,month,dayofweek,lag_1_bill_charge,lag_7_bill_charge,rolling_mean_7_bill_charge,rolling_std_7_bill_charge,expanding_mean_bill_charge
0,2023-01-01,Bob,33,100.5,1,6,100.5,100.5,178.714286,79.507786,100.5
1,2023-01-04,Bob,24,250.0,1,2,100.5,100.5,178.714286,79.507786,175.25
2,2023-01-07,Bob,56,75.0,1,5,250.0,100.5,178.714286,79.507786,141.833333
3,2023-01-07,Eve,40,300.0,1,5,75.0,100.5,178.714286,79.507786,181.375
4,2023-01-09,Charlie,40,150.5,1,0,300.0,100.5,221.5,106.430337,175.2
5,2023-01-10,Charlie,24,200.0,1,1,150.5,100.5,202.928571,111.833369,179.333333
6,2023-01-11,Bob,40,175.0,1,2,200.0,100.5,217.928571,98.001033,178.714286
7,2023-01-11,Eve,40,400.0,1,2,175.0,100.5,187.928571,100.792869,206.375
8,2023-01-11,Bob,40,120.0,1,2,400.0,250.0,173.571429,113.384092,196.777778
9,2023-01-12,Charlie,42,180.0,1,3,120.0,75.0,148.571429,125.256879,195.1


In [23]:
# For forecasting:
features_for_forecasting = ['lag_1_bill_charge', 'lag_7_bill_charge', 'month', 'dayofweek', 'rolling_mean_7_bill_charge']

# For prediction (assuming you have these columns in your dataset):
features_for_prediction = ['Age']

In [24]:
features_for_forecasting

['lag_1_bill_charge',
 'lag_7_bill_charge',
 'month',
 'dayofweek',
 'rolling_mean_7_bill_charge']

In [25]:
features_for_prediction

['Age']

In [27]:
# Correlation analysis
correlation_matrix = df.corr()
print(correlation_matrix['Bill Charge'].sort_values(ascending=False))

# Feature importance from models (example using Random Forest)
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor()
rf_model.fit(df[features_for_prediction], df['Bill Charge'])
feature_importances = rf_model.feature_importances_

Bill Charge                   1.000000
expanding_mean_bill_charge    0.461833
rolling_mean_7_bill_charge    0.331496
Age                           0.121833
dayofweek                     0.071484
lag_7_bill_charge             0.033085
lag_1_bill_charge            -0.135583
rolling_std_7_bill_charge    -0.231029
month                              NaN
Name: Bill Charge, dtype: float64
