In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, month, year, when, lit, to_date

# Obtain the notebook's Spark session (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName("ExpenseMonitoringETL")
    .getOrCreate()
)


3. Create Sample User Data

In [15]:
# Sample user records: (user_id, name, email, monthly_income).
user_data = [
    (1, "John Doe", "john@example.com", 3000),   # salary/income
    (2, "Jane Smith", "jane@example.com", 4000),
    (3, "Alex Brown", "alex@example.com", 3500),
    (4, "Maria Lee", "maria@example.com", 2800),
    (5, "Chris Green", "chris@example.com", 4500)
]

# users_df must remain a Spark DataFrame: the join in step 7 passes it to
# Spark's DataFrame.join. Rebinding it to a pandas frame here would break
# that cell on a fresh top-to-bottom run.
users_df = spark.createDataFrame(user_data, ["user_id", "name", "email", "monthly_income"])

# Display only: render a pandas copy without rebinding users_df.
users_df.toPandas()


Unnamed: 0,user_id,name,email,monthly_income
0,1,John Doe,john@example.com,3000
1,2,Jane Smith,jane@example.com,4000
2,3,Alex Brown,alex@example.com,3500
3,4,Maria Lee,maria@example.com,2800
4,5,Chris Green,chris@example.com,4500


4. Create Sample Expense Data

In [14]:
# Sample expense records: (user_id, expense_date as 'yyyy-MM-dd' string, category, amount).
expense_data = [
    (1, "2025-01-05", "Food", 250.50),
    (1, "2025-01-12", "Transport", 80.00),
    (1, "2025-01-20", "Shopping", 500.00),
    (2, "2025-02-02", "Food", 300.00),
    (2, "2025-02-15", "Entertainment", 200.00),
    (3, "2025-01-10", "Food", 150.00),
    (3, "2025-01-25", "Shopping", 400.00),
    (4, "2025-02-18", "Transport", 50.00),
    (4, "2025-02-20", "Bills", 180.00),
    (5, "2025-01-28", "Entertainment", 600.00)
]

# expenses_df must remain a Spark DataFrame: step 5 calls withColumn on it,
# which a pandas DataFrame does not provide.
expenses_df = spark.createDataFrame(expense_data, ["user_id", "expense_date", "category", "amount"])

# Display only: render a pandas copy without rebinding expenses_df.
expenses_df.toPandas()


Unnamed: 0,user_id,expense_date,category,amount
0,1,2025-01-05,Food,250.5
1,1,2025-01-12,Transport,80.0
2,1,2025-01-20,Shopping,500.0
3,2,2025-02-02,Food,300.0
4,2,2025-02-15,Entertainment,200.0
5,3,2025-01-10,Food,150.0
6,3,2025-01-25,Shopping,400.0
7,4,2025-02-18,Transport,50.0
8,4,2025-02-20,Bills,180.0
9,5,2025-01-28,Entertainment,600.0


5. Transform: Add Year & Month Columns

In [13]:
# Parse the string dates into a date column, then derive calendar year and
# month from it. The result stays a Spark DataFrame because step 6 calls
# groupBy/agg on expenses_df.
expenses_df = (
    expenses_df
    .withColumn("expense_date", to_date(col("expense_date"), "yyyy-MM-dd"))
    .withColumn("year", year(col("expense_date")))
    .withColumn("month", month(col("expense_date")))
)

# Display only: render a pandas copy without rebinding expenses_df.
expenses_df.toPandas()


Unnamed: 0,user_id,expense_date,category,amount,year,month
0,1,2025-01-05,Food,250.5,2025,1
1,1,2025-01-12,Transport,80.0,2025,1
2,1,2025-01-20,Shopping,500.0,2025,1
3,2,2025-02-02,Food,300.0,2025,2
4,2,2025-02-15,Entertainment,200.0,2025,2
5,3,2025-01-10,Food,150.0,2025,1
6,3,2025-01-25,Shopping,400.0,2025,1
7,4,2025-02-18,Transport,50.0,2025,2
8,4,2025-02-20,Bills,180.0,2025,2
9,5,2025-01-28,Entertainment,600.0,2025,1


6. Aggregate Monthly Spend

In [12]:
# Total spend per user per calendar month.
# Kept as a Spark DataFrame: step 7 chains .join(...).withColumn(...) on it.
monthly_spend_df = (
    expenses_df
    .groupBy("user_id", "year", "month")
    .agg(_sum("amount").alias("total_spend"))
)

# Display in Colab as a nice table (pandas copy; monthly_spend_df is not rebound)
monthly_spend_df.toPandas()



Unnamed: 0,user_id,year,month,total_spend
0,1,2025,1,830.5
1,2,2025,2,500.0
2,3,2025,1,550.0
3,5,2025,1,600.0
4,4,2025,2,230.0


7. Join with Users to Calculate Savings & Alerts

In [11]:
# Fraction of monthly income above which a month's spend is flagged.
HIGH_SPEND_RATIO = 0.8

# Attach user attributes to each monthly total, then derive:
#   savings = monthly_income - total_spend
#   alert   = "High Spend Alert" when total_spend exceeds HIGH_SPEND_RATIO of
#             monthly_income, otherwise "Normal"
summary_df = (
    monthly_spend_df
    .join(users_df, on="user_id", how="left")
    .withColumn("savings", col("monthly_income") - col("total_spend"))
    .withColumn(
        "alert",
        when(
            col("total_spend") > (col("monthly_income") * HIGH_SPEND_RATIO),
            lit("High Spend Alert"),
        ).otherwise(lit("Normal")),
    )
)

# Convert to pandas under a separate name so summary_df stays a Spark
# DataFrame for the CSV/Parquet writes in steps 8 and 9.
summary_pd = summary_df.toPandas()

# Display in Colab as a nice table
summary_pd



Unnamed: 0,user_id,year,month,total_spend,name,email,monthly_income,savings,alert
0,5,2025,1,600.0,Chris Green,chris@example.com,4500,3900.0,Normal
1,1,2025,1,830.5,John Doe,john@example.com,3000,2169.5,Normal
2,3,2025,1,550.0,Alex Brown,alex@example.com,3500,2950.0,Normal
3,2,2025,2,500.0,Jane Smith,jane@example.com,4000,3500.0,Normal
4,4,2025,2,230.0,Maria Lee,maria@example.com,2800,2570.0,Normal


8. Save as CSV (Delta Alternative in Colab)

In [9]:
# Target directory for the CSV export.
output_path = "/content/expense_summary"

# coalesce(1) reduces the frame to one partition before writing; existing
# output at the path is overwritten and a header row is included.
csv_writer = (
    summary_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
)
csv_writer.csv(output_path)

print("ETL Output saved at:", output_path)


ETL Output saved at: /content/expense_summary


9. Optional: Save as Parquet (Delta-Like)

In [10]:
# Target directory for the Parquet export.
parquet_path = "/content/expense_summary_parquet"

# Overwrite any existing output at the path.
parquet_writer = summary_df.write.mode("overwrite")
parquet_writer.parquet(parquet_path)

print("ETL Output saved in Parquet format at:", parquet_path)


ETL Output saved in Parquet format at: /content/expense_summary_parquet
