In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import joblib


In [32]:
df = pd.read_csv("spending_patterns_detailed.csv")
df.head()


Unnamed: 0,Customer ID,Category,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,CUST_0159,Groceries,Milk,1,1.28,1.28,Debit Card,Mobile App,2024-11-13
1,CUST_0017,Friend Activities,Dinner with Friends,2,74.69,149.39,Debit Card,In-store,2023-05-04
2,CUST_0094,Housing and Utilities,Water Bill,1,76.06,76.06,Digital Wallet,Mobile App,2023-12-16
3,CUST_0162,Fitness,Yoga Class,5,11.24,56.18,Cash,In-store,2024-01-28
4,CUST_0129,Gifts,Flowers,3,43.71,131.12,Debit Card,In-store,2023-07-28


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Customer ID       10000 non-null  object 
 1   Category          10000 non-null  object 
 2   Item              10000 non-null  object 
 3   Quantity          10000 non-null  int64  
 4   Price Per Unit    10000 non-null  float64
 5   Total Spent       10000 non-null  float64
 6   Payment Method    10000 non-null  object 
 7   Location          10000 non-null  object 
 8   Transaction Date  10000 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 703.3+ KB


In [33]:
df.isnull().sum()


Customer ID         0
Category            0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [34]:
df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])

df["Year"] = df["Transaction Date"].dt.year
df["Month"] = df["Transaction Date"].dt.month

df.head()

Unnamed: 0,Customer ID,Category,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Year,Month
0,CUST_0159,Groceries,Milk,1,1.28,1.28,Debit Card,Mobile App,2024-11-13,2024,11
1,CUST_0017,Friend Activities,Dinner with Friends,2,74.69,149.39,Debit Card,In-store,2023-05-04,2023,5
2,CUST_0094,Housing and Utilities,Water Bill,1,76.06,76.06,Digital Wallet,Mobile App,2023-12-16,2023,12
3,CUST_0162,Fitness,Yoga Class,5,11.24,56.18,Cash,In-store,2024-01-28,2024,1
4,CUST_0129,Gifts,Flowers,3,43.71,131.12,Debit Card,In-store,2023-07-28,2023,7


In [35]:
label_encoders = {}

for col in ["Category", "Payment Method"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


## Task1— FUTURE SPENDING PREDICTION (REGRESSION)

In [36]:
X_reg = df[["Category", "Payment Method", "Month", "Year"]]
y_reg = df["Total Spent"]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)


In [8]:
scaler_reg = StandardScaler()
X_train_r_scaled = scaler_reg.fit_transform(X_train_r)
X_test_r_scaled = scaler_reg.transform(X_test_r)


In [37]:
reg_models = {
    "Linear Regression": LinearRegression(),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42)
}

reg_results = {}

for name, model in reg_models.items():
    model.fit(X_train_r_scaled, y_train_r)
    preds = model.predict(X_test_r_scaled)
    reg_results[name] = mean_absolute_error(y_test_r, preds)

print("Regression Model Performance (MAE):")
reg_results
## Random forest performed best with lowest MAE

Regression Model Performance (MAE):


{'Linear Regression': 4985.384370992975,
 'KNN Regressor': 4437.164308,
 'Random Forest Regressor': 4415.296162434467}

In [19]:
from xgboost import XGBRegressor

xgb_reg = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42
)

xgb_reg.fit(X_train_r_scaled, y_train_r)
xgb_pred = xgb_reg.predict(X_test_r_scaled)

xgb_mae = mean_absolute_error(y_test_r, xgb_pred)
print("XGBoost Regression MAE:", xgb_mae)


XGBoost Regression MAE: 4966.3116939938345


In [10]:
best_reg_model = reg_models["Random Forest Regressor"]


In [22]:
reg_results["XGBoost Regressor"] = xgb_mae

reg_table = pd.DataFrame({
    "Model": list(reg_results.keys()),
    "MAE": list(reg_results.values())
})

reg_table


Unnamed: 0,Model,MAE
0,Linear Regression,4985.384371
1,KNN Regressor,4437.164308
2,Random Forest Regressor,4415.296162
3,XGBoost Regressor,4966.311694


## TASK 2 — OVERSPENDING RISK PREDICTION (CLASSIFICATION)

In [12]:
monthly_spend = df.groupby(["Customer ID", "Month"])["Total Spent"].sum().reset_index()

threshold = monthly_spend["Total Spent"].quantile(0.75)

monthly_spend["Overspend"] = (monthly_spend["Total Spent"] > threshold).astype(int)

monthly_spend.head()


Unnamed: 0,Customer ID,Month,Total Spent,Overspend
0,CUST_0001,1,1068.28,0
1,CUST_0001,2,442.96,0
2,CUST_0001,3,1905.73,0
3,CUST_0001,4,549.29,0
4,CUST_0001,5,404.5,0


In [13]:
X_cls = monthly_spend[["Month", "Total Spent"]]
y_cls = monthly_spend["Overspend"]

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)


In [14]:
scaler_cls = StandardScaler()
X_train_c_scaled = scaler_cls.fit_transform(X_train_c)
X_test_c_scaled = scaler_cls.transform(X_test_c)


In [15]:
cls_models = {
    "Logistic Regression": LogisticRegression(),
    "KNN Classifier": KNeighborsClassifier(n_neighbors=5),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=100, random_state=42)
}

cls_results = {}

for name, model in cls_models.items():
    model.fit(X_train_c_scaled, y_train_c)
    preds = model.predict(X_test_c_scaled)
    cls_results[name] = accuracy_score(y_test_c, preds)

print("Classification Model Accuracy:")
cls_results


Classification Model Accuracy:


{'Logistic Regression': 0.8495762711864406,
 'KNN Classifier': 0.9957627118644068,
 'Random Forest Classifier': 1.0}

In [20]:
from xgboost import XGBClassifier

xgb_cls = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.9,
    eval_metric='logloss',
    random_state=42
)

xgb_cls.fit(X_train_c_scaled, y_train_c)
xgb_pred = xgb_cls.predict(X_test_c_scaled)

xgb_accuracy = accuracy_score(y_test_c, xgb_pred)
print("XGBoost Classification Accuracy:", xgb_accuracy)


XGBoost Classification Accuracy: 1.0


In [23]:
cls_results["XGBoost Classifier"] = xgb_accuracy

cls_table = pd.DataFrame({
    "Model": list(cls_results.keys()),
    "Accuracy": list(cls_results.values())
})

cls_table


Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.849576
1,KNN Classifier,0.995763
2,Random Forest Classifier,1.0
3,XGBoost Classifier,1.0


In [16]:
best_cls_model = cls_models["Random Forest Classifier"]

y_pred_best = best_cls_model.predict(X_test_c_scaled)

print(classification_report(y_test_c, y_pred_best))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       342
           1       1.00      1.00      1.00       130

    accuracy                           1.00       472
   macro avg       1.00      1.00      1.00       472
weighted avg       1.00      1.00      1.00       472



In [17]:
joblib.dump(best_reg_model, "spendsense_spending_predictor.pkl")
joblib.dump(best_cls_model, "spendsense_overspend_classifier.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(scaler_reg, "scaler_reg.pkl")
joblib.dump(scaler_cls, "scaler_cls.pkl")

print("✅ Models saved successfully!")


✅ Models saved successfully!


In [21]:
joblib.dump(xgb_reg, "xgb_spending_model.pkl")
joblib.dump(xgb_cls, "xgb_overspend_model.pkl")


['xgb_overspend_model.pkl']