In [14]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.dummy import DummyRegressor

In [15]:
df = pd.read_csv("/content/Personal_Finance_Dataset.csv")

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['Type'] = df['Type'].astype(str).str.strip().str.lower()

In [16]:
df_exp = df[df['Type'] == 'expense'].copy()

In [18]:
# Monthly expense totals per category
monthly = (
    df_exp.groupby([df_exp['Date'].dt.to_period('M'), 'Category'])
    .agg(total_amount=('Amount', 'sum'))
    .reset_index()
)

# Convert period to timestamp
monthly['Date'] = monthly['Date'].dt.to_timestamp()

In [19]:
# Lags
monthly['lag_1'] = monthly.groupby('Category')['total_amount'].shift(1)
monthly['lag_2'] = monthly.groupby('Category')['total_amount'].shift(2)
monthly['lag_3'] = monthly.groupby('Category')['total_amount'].shift(3)

# Rolling averages (only past data)
monthly['Rolling3'] = (
    monthly.groupby('Category')['total_amount']
    .transform(lambda x: x.shift(1).rolling(3, min_periods=1).mean())
)
monthly['Rolling6'] = (
    monthly.groupby('Category')['total_amount']
    .transform(lambda x: x.shift(1).rolling(6, min_periods=1).mean())
)

# Time features
monthly['month_num'] = monthly['Date'].dt.month
monthly['month_sin'] = np.sin(2*np.pi*monthly['month_num']/12)
monthly['month_cos'] = np.cos(2*np.pi*monthly['month_num']/12)

# Target = next monthâ€™s expense
monthly['target'] = monthly.groupby('Category')['total_amount'].shift(-1)

# Drop rows with NaN
data = monthly.dropna().reset_index(drop=True)

# One-hot encode categories
data = pd.get_dummies(data, columns=['Category'])

In [20]:
FEATURES = [
    'lag_1', 'lag_2', 'lag_3',
    'Rolling3', 'Rolling6',
    'month_num', 'month_sin', 'month_cos'
] + [col for col in data.columns if col.startswith('Category')]

X = data[FEATURES]
y = data['target']

# Use last 12 months as test
test_size = 12
train_X, test_X = X[:-test_size], X[-test_size:]
train_y, test_y = y[:-test_size], y[-test_size:]

In [21]:
# Linear Regression
lr = LinearRegression()
lr.fit(train_X, train_y)
pred_lr = lr.predict(test_X)
print("Linear Regression:")
print(" MAE:", mean_absolute_error(test_y, pred_lr))
print(" RMSE:", np.sqrt(mean_squared_error(test_y, pred_lr)))

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(train_X, train_y)
pred_rf = rf.predict(test_X)
print("\nRandom Forest:")
print(" MAE:", mean_absolute_error(test_y, pred_rf))
print(" RMSE:", np.sqrt(mean_squared_error(test_y, pred_rf)))

# XGBoost
xgb = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
xgb.fit(train_X, train_y)
pred_xgb = xgb.predict(test_X)
print("\nXGBoost:")
print(" MAE:", mean_absolute_error(test_y, pred_xgb))
print(" RMSE:", np.sqrt(mean_squared_error(test_y, pred_xgb)))

Linear Regression:
 MAE: 1842.4139336395037
 RMSE: 2267.8474321822005

Random Forest:
 MAE: 1882.757241666667
 RMSE: 2404.875010293373

XGBoost:
 MAE: 1915.9760892740887
 RMSE: 2415.412052231002


In [24]:
X_cv = data.drop(columns=['total_amount', 'Date', 'target'])
y_cv = data['target']

tscv = TimeSeriesSplit(n_splits=3)

mse_scores_lr = cross_val_score(lr, X_cv, y_cv, scoring='neg_mean_squared_error', cv=tscv)
mae_scores_lr = cross_val_score(lr, X_cv, y_cv, scoring='neg_mean_absolute_error', cv=tscv)

mse_scores_lr = -mse_scores_lr
mae_scores_lr = -mae_scores_lr
rmse_scores_lr = np.sqrt(mse_scores_lr)

print("\nLinear Regression Time Series CV Results:")
print(f" MAE: {mae_scores_lr.mean():.2f} (+/- {mae_scores_lr.std():.2f})")
print(f" RMSE: {rmse_scores_lr.mean():.2f} (+/- {rmse_scores_lr.std():.2f})")

mse_scores_rf = cross_val_score(rf, X_cv, y_cv, scoring='neg_mean_squared_error', cv=tscv)
mae_scores_rf = cross_val_score(rf, X_cv, y_cv, scoring='neg_mean_absolute_error', cv=tscv)

mse_scores_rf = -mse_scores_rf
mae_scores_rf = -mae_scores_rf
rmse_scores_rf = np.sqrt(mse_scores_rf)

print("\nRandom Forest Time Series CV Results:")
print(f" MAE: {mae_scores_rf.mean():.2f} (+/- {mae_scores_rf.std():.2f})")
print(f" RMSE: {rmse_scores_rf.mean():.2f} (+/- {rmse_scores_rf.std():.2f})")

mse_scores_xgb = cross_val_score(xgb, X_cv, y_cv, scoring='neg_mean_squared_error', cv=tscv)
mae_scores_xgb = cross_val_score(xgb, X_cv, y_cv, scoring='neg_mean_absolute_error', cv=tscv)

mse_scores_xgb = -mse_scores_xgb
mae_scores_xgb = -mae_scores_xgb
rmse_scores_xgb = np.sqrt(mse_scores_xgb)

print("\nXGBoost Time Series CV Results:")
print(f" MAE: {mae_scores_xgb.mean():.2f} (+/- {mae_scores_xgb.std():.2f})")
print(f" RMSE: {rmse_scores_xgb.mean():.2f} (+/- {rmse_scores_xgb.std():.2f})")


Linear Regression Time Series CV Results:
 MAE: 1551.93 (+/- 76.65)
 RMSE: 1951.82 (+/- 124.68)

Random Forest Time Series CV Results:
 MAE: 1509.51 (+/- 146.76)
 RMSE: 1886.01 (+/- 196.44)

XGBoost Time Series CV Results:
 MAE: 1598.76 (+/- 95.38)
 RMSE: 2027.01 (+/- 131.72)


In [23]:
# Baseline dummy model
dummy = DummyRegressor(strategy="mean")
scores = cross_val_score(dummy, X_cv, y_cv, scoring="neg_mean_absolute_error", cv=3)
print("\nBaseline Dummy MAE:", -scores.mean())


Baseline Dummy MAE: 1460.9876825261515
