In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Imported after the star import so the `F` alias cannot be overwritten by it.
from pyspark.sql import functions as F
import pandas as pd

# Get the active Spark session, or create one named "Upload&Combine".
spark = (
    SparkSession.builder
    .appName("Upload&Combine")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [0]:
# Load the two cleaned expense CSVs. Both have a header row, and column
# types are inferred from the data rather than declared.
df_user = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:/Workspace/Shared/cleaned_large_expenses.csv")
)
df_expenses = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:/Workspace/Shared/expenses_cleaned.csv")
)
# Render both frames for a quick visual check.
df_user.display()
df_expenses.display()


_c0,UserID,ExpenseID,CategoryID,Amount,Date,Notes,Year,Month,avg_amount,stddev_amount,is_anomaly
0,1,201,1,300.0,2025-06-01,Fruits & Veggies,2025,6,1195.0,1383.993078777948,False
1,1,202,3,2200.0,2025-06-02,Electricity Bill,2025,6,1195.0,1383.993078777948,False
2,1,203,5,500.0,2025-06-03,Food Panda,2025,6,1195.0,1383.993078777948,False
3,1,204,2,200.0,2025-06-04,Bus,2025,6,1195.0,1383.993078777948,False
4,1,205,1,700.0,2025-06-05,Monthly groceries,2025,6,1195.0,1383.993078777948,False
5,2,206,4,5000.0,2025-06-01,Laptop,2025,6,1424.0,1242.6975266963168,True
6,2,207,1,1000.0,2025-06-02,Grocery,2025,6,1424.0,1242.6975266963168,False
7,2,208,5,900.0,2025-06-03,Pizza night,2025,6,1424.0,1242.6975266963168,False
8,2,209,2,150.0,2025-06-04,Rickshaw,2025,6,1424.0,1242.6975266963168,False
9,2,210,3,1900.0,2025-06-05,Water Bill,2025,6,1424.0,1242.6975266963168,False


ExpenseID,UserID,CategoryID,Amount,Date,Notes,Month
101,1,1,1200.0,2025-05-10,Big Bazaar purchase,2025-05-01
102,1,2,450.0,2025-05-12,Bus ticket,2025-05-01
103,1,3,2000.0,2025-05-15,Electricity bill,2025-05-01
104,2,4,3000.0,2025-05-20,Clothes at Myntra,2025-05-01
105,2,5,600.0,2025-05-25,Dinner at McDonald's,2025-05-01
106,1,1,800.0,2025-06-02,Local grocery,2025-06-01
107,1,3,1900.0,2025-06-05,Water bill,2025-06-01
108,2,2,1000.0,2025-06-08,Uber ride,2025-06-01
109,1,5,750.0,2025-06-10,Zomato lunch,2025-06-01
110,2,4,2200.0,2025-06-12,Amazon sale,2025-06-01


In [0]:
# Pass the "Date" column of both frames through to_date so they share the
# same date type before being combined.
df_user = df_user.withColumn("Date", to_date(col("Date")))
df_expenses = df_expenses.withColumn("Date", to_date(col("Date")))

# Print each schema to confirm the resulting column types.
for frame in (df_user, df_expenses):
    frame.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- UserID: integer (nullable = true)
 |-- ExpenseID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Notes: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- avg_amount: double (nullable = true)
 |-- stddev_amount: double (nullable = true)
 |-- is_anomaly: boolean (nullable = true)

root
 |-- ExpenseID: integer (nullable = true)
 |-- UserID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Date: date (nullable = true)
 |-- Notes: string (nullable = true)
 |-- Month: date (nullable = true)



Combine user and expense data

In [0]:
# Columns selected from both inputs; any other column in either frame is
# left out of the combined result.
shared_columns = ["ExpenseID", "UserID", "CategoryID", "Amount", "Date", "Notes"]

# Stack the two frames, matching columns by name rather than by position.
df_combined = df_user.select(*shared_columns).unionByName(
    df_expenses.select(*shared_columns)
)

df_combined.display()

ExpenseID,UserID,CategoryID,Amount,Date,Notes
201,1,1,300.0,2025-06-01,Fruits & Veggies
202,1,3,2200.0,2025-06-02,Electricity Bill
203,1,5,500.0,2025-06-03,Food Panda
204,1,2,200.0,2025-06-04,Bus
205,1,1,700.0,2025-06-05,Monthly groceries
206,2,4,5000.0,2025-06-01,Laptop
207,2,1,1000.0,2025-06-02,Grocery
208,2,5,900.0,2025-06-03,Pizza night
209,2,2,150.0,2025-06-04,Rickshaw
210,2,3,1900.0,2025-06-05,Water Bill


In [0]:
# Save the combined expenses as the Delta table "expenses_table",
# overwriting whatever the table held before.
(
    df_combined.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("expenses_table")
)

# Query the table back to show what was stored.
display(spark.sql("SELECT * FROM expenses_table"))

ExpenseID,UserID,CategoryID,Amount,Date,Notes
201,1,1,300.0,2025-06-01,Fruits & Veggies
202,1,3,2200.0,2025-06-02,Electricity Bill
203,1,5,500.0,2025-06-03,Food Panda
204,1,2,200.0,2025-06-04,Bus
205,1,1,700.0,2025-06-05,Monthly groceries
206,2,4,5000.0,2025-06-01,Laptop
207,2,1,1000.0,2025-06-02,Grocery
208,2,5,900.0,2025-06-03,Pizza night
209,2,2,150.0,2025-06-04,Rickshaw
210,2,3,1900.0,2025-06-05,Water Bill


Create a summary table with monthly spend, savings, and alerts

In [0]:
# Reload the saved expenses and derive integer Year and Month columns from
# "Date"; the monthly summary groups on these.
df = (
    spark.read.table("expenses_table")
    .withColumn("Year", year(col("Date")))
    .withColumn("Month", month(col("Date")))
)

df.show()

+---------+------+----------+------+----------+-----------------+----+-----+
|ExpenseID|UserID|CategoryID|Amount|      Date|            Notes|Year|Month|
+---------+------+----------+------+----------+-----------------+----+-----+
|      201|     1|         1| 300.0|2025-06-01| Fruits & Veggies|2025|    6|
|      202|     1|         3|2200.0|2025-06-02| Electricity Bill|2025|    6|
|      203|     1|         5| 500.0|2025-06-03|       Food Panda|2025|    6|
|      204|     1|         2| 200.0|2025-06-04|              Bus|2025|    6|
|      205|     1|         1| 700.0|2025-06-05|Monthly groceries|2025|    6|
|      206|     2|         4|5000.0|2025-06-01|           Laptop|2025|    6|
|      207|     2|         1|1000.0|2025-06-02|          Grocery|2025|    6|
|      208|     2|         5| 900.0|2025-06-03|      Pizza night|2025|    6|
|      209|     2|         2| 150.0|2025-06-04|         Rickshaw|2025|    6|
|      210|     2|         3|1900.0|2025-06-05|       Water Bill|2025|    6|

In [0]:
# Total spend per user per calendar month, rounded to 2 decimal places.
#
# The Spark functions are called through the `F` namespace on purpose.
# Bare `round(sum("Amount"), 2)` only works while the star import shadows
# Python's built-in `round` and `sum`; with the built-ins, `sum("Amount")`
# raises a TypeError.
summary_df = (
    df.groupBy("UserID", "Year", "Month")
    .agg(F.round(F.sum("Amount"), 2).alias("TotalSpend"))
)

summary_df.show()

+------+----+-----+----------+
|UserID|Year|Month|TotalSpend|
+------+----+-----+----------+
|     2|2025|    6|   24560.0|
|     3|2025|    6|   15670.0|
|     1|2025|    6|   27350.0|
|     1|2025|    5|    3650.0|
|     2|2025|    5|    3600.0|
+------+----+-----+----------+



In [0]:
# Fraction of a month's total spend that is reported as estimated savings.
SAVINGS_RATE = 0.30

# Add EstimatedSavings = TotalSpend * SAVINGS_RATE, rounded to 2 decimals.
# `F.round` is used explicitly so this cell does not depend on the star
# import shadowing Python's built-in `round`. Re-running the cell is safe:
# withColumn replaces an existing "EstimatedSavings" column.
summary_df = summary_df.withColumn(
    "EstimatedSavings",
    F.round(F.col("TotalSpend") * SAVINGS_RATE, 2),
)

summary_df.show()

+------+----+-----+----------+----------------+
|UserID|Year|Month|TotalSpend|EstimatedSavings|
+------+----+-----+----------+----------------+
|     2|2025|    6|   24560.0|          7368.0|
|     3|2025|    6|   15670.0|          4701.0|
|     1|2025|    6|   27350.0|          8205.0|
|     1|2025|    5|    3650.0|          1095.0|
|     2|2025|    5|    3600.0|          1080.0|
+------+----+-----+----------+----------------+



In [0]:
# Keep only the user-months whose total spend is above 20000.
alerts_df = summary_df.where(col("TotalSpend") > 20000)

display(alerts_df)

# Export the selected alert columns to a CSV file via pandas, without the
# pandas index column.
(
    alerts_df
    .select("UserID", "Year", "Month", "TotalSpend")
    .toPandas()
    .to_csv("monthly_alerts.csv", index=False)
)

UserID,Year,Month,TotalSpend,EstimatedSavings
2,2025,6,24560.0,7368.0
1,2025,6,27350.0,8205.0


In [0]:
# Save the summary in Delta format, overwriting any earlier report at
# this path.
summary_df.write.save(
    "file:/Workspace/Shared/final_report",
    format="delta",
    mode="overwrite",
)
# Disabled alternative: write the CSV with Spark instead of pandas.
#summary_df.write.format("csv").mode("overwrite").save("file:/Workspace/Shared/final_report.csv")
# Also export the summary to a CSV file via pandas (no index column).
summary_df.toPandas().to_csv("final_report.csv", index=False)
display(summary_df)

UserID,Year,Month,TotalSpend,EstimatedSavings
2,2025,6,24560.0,7368.0
3,2025,6,15670.0,4701.0
1,2025,6,27350.0,8205.0
1,2025,5,3650.0,1095.0
2,2025,5,3600.0,1080.0
