In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd


spark = SparkSession.builder.appName("UserMonthlySpend").getOrCreate()
spark

In [24]:
df = spark.read.csv("large_expenses.csv", header=True, inferSchema=True)

df.show()

+---------+------+----------+------+----------+-----------------+
|ExpenseID|UserID|CategoryID|Amount|      Date|            Notes|
+---------+------+----------+------+----------+-----------------+
|      201|     1|         1| 300.0|2025-06-01| Fruits & Veggies|
|      202|     1|         3|2200.0|2025-06-02| Electricity Bill|
|      203|     1|         5| 500.0|2025-06-03|       Food Panda|
|      204|     1|         2| 200.0|2025-06-04|              Bus|
|      205|     1|         1| 700.0|2025-06-05|Monthly groceries|
|      206|     2|         4|5000.0|2025-06-01|           Laptop|
|      207|     2|         1|1000.0|2025-06-02|          Grocery|
|      208|     2|         5| 900.0|2025-06-03|      Pizza night|
|      209|     2|         2| 150.0|2025-06-04|         Rickshaw|
|      210|     2|         3|1900.0|2025-06-05|       Water Bill|
|      211|     1|         4|3200.0|2025-06-06|   Shopping Spree|
|      212|     3|         1| 450.0|2025-06-01|        Mini Mart|
|      213

In [25]:
df = df.withColumn("Year", year(col("Date")))
df = df.withColumn("Month", month(col("Date")))

df.show()

+---------+------+----------+------+----------+-----------------+----+-----+
|ExpenseID|UserID|CategoryID|Amount|      Date|            Notes|Year|Month|
+---------+------+----------+------+----------+-----------------+----+-----+
|      201|     1|         1| 300.0|2025-06-01| Fruits & Veggies|2025|    6|
|      202|     1|         3|2200.0|2025-06-02| Electricity Bill|2025|    6|
|      203|     1|         5| 500.0|2025-06-03|       Food Panda|2025|    6|
|      204|     1|         2| 200.0|2025-06-04|              Bus|2025|    6|
|      205|     1|         1| 700.0|2025-06-05|Monthly groceries|2025|    6|
|      206|     2|         4|5000.0|2025-06-01|           Laptop|2025|    6|
|      207|     2|         1|1000.0|2025-06-02|          Grocery|2025|    6|
|      208|     2|         5| 900.0|2025-06-03|      Pizza night|2025|    6|
|      209|     2|         2| 150.0|2025-06-04|         Rickshaw|2025|    6|
|      210|     2|         3|1900.0|2025-06-05|       Water Bill|2025|    6|

Group by user to calculate total monthly spend

In [26]:
monthly_spend = df.groupBy("UserID", "Year", "Month") \
                  .agg(_sum("Amount").alias("TotalSpend"))

monthly_spend.show()
df.show()

+------+----+-----+----------+
|UserID|Year|Month|TotalSpend|
+------+----+-----+----------+
|     2|2025|    6|   21360.0|
|     3|2025|    6|   15670.0|
|     1|2025|    6|   23900.0|
+------+----+-----+----------+

+---------+------+----------+------+----------+-----------------+----+-----+
|ExpenseID|UserID|CategoryID|Amount|      Date|            Notes|Year|Month|
+---------+------+----------+------+----------+-----------------+----+-----+
|      201|     1|         1| 300.0|2025-06-01| Fruits & Veggies|2025|    6|
|      202|     1|         3|2200.0|2025-06-02| Electricity Bill|2025|    6|
|      203|     1|         5| 500.0|2025-06-03|       Food Panda|2025|    6|
|      204|     1|         2| 200.0|2025-06-04|              Bus|2025|    6|
|      205|     1|         1| 700.0|2025-06-05|Monthly groceries|2025|    6|
|      206|     2|         4|5000.0|2025-06-01|           Laptop|2025|    6|
|      207|     2|         1|1000.0|2025-06-02|          Grocery|2025|    6|
|      208| 

In [27]:
stats = df.groupBy("UserID").agg(avg("Amount").alias("avg_amount"),stddev("Amount").alias("stddev_amount"))

stats.show()

+------+------------------+------------------+
|UserID|        avg_amount|     stddev_amount|
+------+------------------+------------------+
|     1|            1195.0| 1383.993078777948|
|     3|1044.6666666666667| 920.5810794327265|
|     2|            1424.0|1242.6975266963168|
+------+------------------+------------------+



In [28]:
df = df.join(stats, on="UserID", how="left")

df.show()

+------+---------+----------+------+----------+-----------------+----+-----+------------------+------------------+
|UserID|ExpenseID|CategoryID|Amount|      Date|            Notes|Year|Month|        avg_amount|     stddev_amount|
+------+---------+----------+------+----------+-----------------+----+-----+------------------+------------------+
|     1|      201|         1| 300.0|2025-06-01| Fruits & Veggies|2025|    6|            1195.0| 1383.993078777948|
|     1|      202|         3|2200.0|2025-06-02| Electricity Bill|2025|    6|            1195.0| 1383.993078777948|
|     1|      203|         5| 500.0|2025-06-03|       Food Panda|2025|    6|            1195.0| 1383.993078777948|
|     1|      204|         2| 200.0|2025-06-04|              Bus|2025|    6|            1195.0| 1383.993078777948|
|     1|      205|         1| 700.0|2025-06-05|Monthly groceries|2025|    6|            1195.0| 1383.993078777948|
|     2|      206|         4|5000.0|2025-06-01|           Laptop|2025|    6|    

In [29]:
df_anomalies = df.withColumn("is_anomaly", col("Amount") > col("avg_amount") + 2 * col("stddev_amount"))

df_anomalies.show()

+------+---------+----------+------+----------+-----------------+----+-----+------------------+------------------+----------+
|UserID|ExpenseID|CategoryID|Amount|      Date|            Notes|Year|Month|        avg_amount|     stddev_amount|is_anomaly|
+------+---------+----------+------+----------+-----------------+----+-----+------------------+------------------+----------+
|     1|      201|         1| 300.0|2025-06-01| Fruits & Veggies|2025|    6|            1195.0| 1383.993078777948|     false|
|     1|      202|         3|2200.0|2025-06-02| Electricity Bill|2025|    6|            1195.0| 1383.993078777948|     false|
|     1|      203|         5| 500.0|2025-06-03|       Food Panda|2025|    6|            1195.0| 1383.993078777948|     false|
|     1|      204|         2| 200.0|2025-06-04|              Bus|2025|    6|            1195.0| 1383.993078777948|     false|
|     1|      205|         1| 700.0|2025-06-05|Monthly groceries|2025|    6|            1195.0| 1383.993078777948|    

Detect unusual spikes or large one-time expenses

In [30]:
anomalies = df_anomalies.filter(col("is_anomaly") == True)

anomalies.select("ExpenseID", "UserID", "Amount", "avg_amount", "stddev_amount").show()

+---------+------+------+----------+------------------+
|ExpenseID|UserID|Amount|avg_amount|     stddev_amount|
+---------+------+------+----------+------------------+
|      247|     1|5000.0|    1195.0| 1383.993078777948|
|      232|     1|4000.0|    1195.0| 1383.993078777948|
|      206|     2|5000.0|    1424.0|1242.6975266963168|
+---------+------+------+----------+------------------+



In [31]:
anomalies.select("ExpenseID", "UserID", "Amount", "avg_amount", "stddev_amount").toPandas().to_csv("anomalies.csv")
