In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import os

# Load cleaned finance data
df = pd.read_csv("../outputs/cleaned_finance_data.csv")

# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Derive monthly income and expense
monthly_data = df.groupby(['Year', 'Month', 'Type'])['Absolute_Amount'].sum().reset_index()
monthly_pivot = monthly_data.pivot_table(index=['Year', 'Month'], columns='Type', values='Absolute_Amount', fill_value=0).reset_index()

# Rename columns for clarity
monthly_pivot.rename(columns={'Income': 'Monthly_Income', 'Expense': 'Monthly_Expense'}, inplace=True)

# Calculate savings and savings ratio
monthly_pivot['Savings'] = monthly_pivot['Monthly_Income'] - monthly_pivot['Monthly_Expense']
monthly_pivot['Savings_Ratio'] = monthly_pivot['Savings'] / monthly_pivot['Monthly_Income']

# Filter expense data for additional features
df_expense = df[df['Type'] == 'Expense']

# Weekend spending ratio
weekend_expense = df_expense[df_expense['Is_Weekend'] == True].groupby(['Year', 'Month'])['Absolute_Amount'].sum().reset_index(name='Weekend_Expense')
monthly_pivot = pd.merge(monthly_pivot, weekend_expense, on=['Year', 'Month'], how='left')
monthly_pivot['Weekend_Expense'] = monthly_pivot['Weekend_Expense'].fillna(0)
monthly_pivot['Weekend_Spending_Ratio'] = monthly_pivot['Weekend_Expense'] / monthly_pivot['Monthly_Expense']

# High value transaction count
threshold = df_expense['Absolute_Amount'].quantile(0.90)
df_expense['High_Value'] = df_expense['Absolute_Amount'] > threshold
high_value_count = df_expense.groupby(['Year', 'Month'])['High_Value'].sum().reset_index(name='High_Value_Count')
monthly_pivot = pd.merge(monthly_pivot, high_value_count, on=['Year', 'Month'], how='left')
monthly_pivot['High_Value_Count'] = monthly_pivot['High_Value_Count'].fillna(0)

# Category spending ratio (average of Expense_Category_Ratio)
category_ratio = df_expense.groupby(['Year', 'Month'])['Expense_Category_Ratio'].mean().reset_index(name='Category_Spending_Ratio')
monthly_pivot = pd.merge(monthly_pivot, category_ratio, on=['Year', 'Month'], how='left')
monthly_pivot['Category_Spending_Ratio'] = monthly_pivot['Category_Spending_Ratio'].fillna(0)

# Rule-based Labeling
def classify_behavior(row):
    if row['Savings_Ratio'] > 0.3 and row['High_Value_Count'] < 2:
        return 'Saver'
    elif row['Savings_Ratio'] < 0.1 and row['High_Value_Count'] > 5:
        return 'Impulsive'
    elif row['Savings_Ratio'] > 0.2:
        return 'Balanced'
    else:
        return 'Spender'

monthly_pivot['Spending_Behavior'] = monthly_pivot.apply(classify_behavior, axis=1)

# Prepare data for model training
features = ['Savings_Ratio', 'Weekend_Spending_Ratio', 'High_Value_Count', 'Category_Spending_Ratio']
X = monthly_pivot[features]
y = monthly_pivot['Spending_Behavior']

# Train-test split and scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

# Evaluate model
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

# Save model and predictions
os.makedirs("../models", exist_ok=True)
os.makedirs("../outputs", exist_ok=True)
joblib.dump(clf, "../models/classification_model.pkl")

# Save predictions
monthly_pivot['Predicted_Behavior'] = clf.predict(scaler.transform(X))
monthly_pivot.to_csv("../outputs/classified_behavior.csv", index=False)

print("Classification model and predictions saved successfully.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_expense['High_Value'] = df_expense['Absolute_Amount'] > threshold


              precision    recall  f1-score   support

     Spender       1.00      1.00      1.00        12

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12

Classification model and predictions saved successfully.
