In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import joblib
import pickle
import os
# Load processed data
df = pd.read_csv(r"../Data/processed_wallet_data.csv")
print("Dataset shape:", df.shape)
df.sample(5)


Dataset shape: (10000, 16)


Unnamed: 0,User_ID,Month,Year,total_kwh,avg_kwh,total_cost,avg_cost,avg_wallet_balance,avg_session_duration,avg_cost_efficiency,peak_hour_ratio,City,Vehicle_Type,Subscription_Type,Payment_Mode,Charger_Type
3264,U03265,6,2024,6.73,6.73,56.46,56.46,1782.96,60.0,8.389302,0.0,Ahmedabad,2W,Pay-as-you-go,UPI,Fast
5726,U05727,12,2024,38.42,38.42,289.69,289.69,719.7,60.0,7.540083,0.0,Pune,4W,Basic,Wallet,Fast
1117,U01118,12,2025,28.25,28.25,271.2,271.2,1932.23,60.0,9.6,0.0,Hyderabad,4W,Basic,UPI,Slow
2846,U02847,9,2025,17.29,17.29,128.29,128.29,1440.4,60.0,7.419896,0.0,Hyderabad,4W,Pay-as-you-go,UPI,Fast
1769,U01770,4,2025,28.09,28.09,335.11,335.11,1530.94,60.0,11.929868,0.0,Bangalore,4W,Basic,UPI,Superfast


In [7]:
# Step 2: Feature Preparation (Fixed, Enriched & Safe)

from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

target = 'total_cost'   # predicting monthly total spend

# Drop mathematically linked or leaky features
drop_cols = ['avg_cost', 'total_cost', 'avg_cost_efficiency', 'total_kwh', 'avg_kwh']
drop_cols = [c for c in drop_cols if c in df.columns]

categorical_cols = ['City', 'Vehicle_Type', 'Subscription_Type',
                    'Payment_Mode', 'Charger_Type', 'Month', 'Year', 'season']

# Build base features and target
X = df.drop(columns=['User_ID', target] + drop_cols)
y = df[target]

# -------------- ENRICH FEATURES HERE -------------- #
X['sessions_per_user_month'] = df.groupby(['User_ID', 'Month', 'Year'])['Month'].transform('count')
X['cost_per_kwh_est'] = df['avg_cost_efficiency'] * df['peak_hour_ratio'].fillna(0)
X['wallet_to_cost_ratio'] = df['avg_wallet_balance'] / (df['avg_cost'] + 1e-6)
X['vehicle_encoded'] = df['Vehicle_Type']
X['subscription_encoded'] = df['Subscription_Type']
# --------------------------------------------------- #

# Replace inf and NaN
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

# Encode all object/categorical columns safely
le_dict = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        le_dict[col] = le

# Scale numerical columns
scaler = StandardScaler()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

print("Feature preparation and enrichment complete.")
print("Final feature shape:", X.shape)
print("Target variable:", target)


Feature preparation and enrichment complete.
Final feature shape: (10000, 15)
Target variable: total_cost


In [8]:
# Step 3: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (8000, 15) Test size: (2000, 15)


In [9]:
# Step 4: Train Models (Safe version)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# Ensure clean, numeric data
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

# Split again if needed (in case you modified X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(
        n_estimators=150,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        verbosity=0,
        n_jobs=-1
    )
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Basic metrics
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    mape = mean_absolute_percentage_error(y_test, preds)

    results[name] = {"RMSE": rmse, "R2": r2, "MAPE": mape}
    print(f"{name}: RMSE={rmse:.2f}, R²={r2:.3f}, MAPE={mape:.3f}")



Training LinearRegression...
LinearRegression: RMSE=102.68, R²=0.404, MAPE=0.704

Training RandomForest...
RandomForest: RMSE=4.58, R²=0.999, MAPE=0.012

Training XGBoost...
XGBoost: RMSE=7.52, R²=0.997, MAPE=0.019


In [10]:
results_df = pd.DataFrame(results).T.sort_values("RMSE")
display(results_df)

best_model_name = results_df.index[0]
print(f"\n✅ Best Performing Model: {best_model_name}")

best_model = models[best_model_name]

# Ensure artifacts directory exists
artifacts_dir = os.path.join("..", "artifacts")
os.makedirs(artifacts_dir, exist_ok=True)

# Paths must match main.py
model_path = os.path.join(artifacts_dir, "RandomForest_wallet_model.pkl")
scaler_path = os.path.join(artifacts_dir, "scaler.pkl")

# Save model with pickle
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)

# Save scaler used to transform features
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)

print("Saved model to:", model_path)
print("Saved scaler to:", scaler_path)

Unnamed: 0,RMSE,R2,MAPE
RandomForest,4.581218,0.998813,0.012167
XGBoost,7.522683,0.996799,0.0194
LinearRegression,102.683767,0.403513,0.703888



✅ Best Performing Model: RandomForest
Saved model to: ..\artifacts\RandomForest_wallet_model.pkl
Saved scaler to: ..\artifacts\scaler.pkl


In [11]:
# Step 6: Make Sample Predictions

sample = X_test.sample(5, random_state=1)
preds = best_model.predict(sample)
pd.DataFrame({
    "Predicted_Spend": preds.round(2),
    "Actual_Spend": y_test.loc[sample.index].values.round(2)
})


Unnamed: 0,Predicted_Spend,Actual_Spend
0,293.78,293.06
1,43.52,44.27
2,241.63,239.73
3,371.19,369.15
4,103.91,104.38
