<a href="https://colab.research.google.com/github/P-Brundha/info/blob/main/23BIT012_In_memory_data_processing_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# ⚡ In-Memory vs Disk-Based Computation using Apache Spark
# ============================================================

import pandas as pd
import random
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum as _sum

# ------------------------------------------------------------
# Step 1: Start Spark Session
# ------------------------------------------------------------
# Configure the session step by step: application name first, then the
# "local[*]" master. getOrCreate() hands back an existing session when one
# is already running instead of building a second one.
session_builder = SparkSession.builder.appName("MemoryPerformanceDemo")
session_builder = session_builder.master("local[*]")
spark = session_builder.getOrCreate()

# Limit Spark's console logging to warnings and above.
spark.sparkContext.setLogLevel("WARN")
print("🚀 Spark session initialized successfully")

# ------------------------------------------------------------
# Step 2: Generate Synthetic Transaction Dataset
# ------------------------------------------------------------
num_rows = 1_000_000  # 1 million records

# Value ranges (inclusive bounds) and categories used for the random columns.
cust_id_bounds = (1, 120000)
txn_value_bounds = (10, 1000)
age_bounds = (18, 70)
segment_pool = ("Gold", "Silver", "Bronze", "Platinum")

# Each column is drawn independently, one full column at a time, in the
# same order as the resulting DataFrame columns.
cust_id_column = [random.randint(*cust_id_bounds) for _ in range(num_rows)]
txn_value_column = [
    round(random.uniform(*txn_value_bounds), 2) for _ in range(num_rows)
]
segment_column = [random.choice(segment_pool) for _ in range(num_rows)]
age_column = [random.randint(*age_bounds) for _ in range(num_rows)]

transactions = pd.DataFrame(
    {
        "cust_id": cust_id_column,
        "txn_value": txn_value_column,
        "segment": segment_column,
        "customer_age": age_column,
    }
)

generated_rows = len(transactions)
print(f"✅ Generated synthetic dataset with {generated_rows:,} rows")

# ------------------------------------------------------------
# Step 3: Create Spark DataFrame
# ------------------------------------------------------------
# Hand the pandas frame to Spark. No explicit schema is passed, so column
# names and types are taken from the pandas data.
df = spark.createDataFrame(data=transactions)
print("📦 Spark DataFrame created and ready for processing")

# ------------------------------------------------------------
# Step 4: Run Aggregation (Uncached Baseline)
# ------------------------------------------------------------
# Per-segment average and total transaction value, computed before
# df.cache() is called. df was built from the in-driver pandas frame, so
# nothing in this run is read from disk; it is the uncached baseline.
# NOTE(review): this is the first Spark action in the script, so the
# measured time presumably also includes one-time start-up cost — confirm
# before attributing the whole gap to caching.
#
# time.perf_counter() is used for the measurement because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
summary_disk = (
    df.groupBy("segment")
      .agg(avg("txn_value").alias("avg_txn"), _sum("txn_value").alias("total_txn"))
)
summary_disk.show(4)  # show() is the action that actually runs the query
end_time = time.perf_counter()

print(f"🕒 Query time (without caching): {end_time - start_time:.2f} sec")

# ------------------------------------------------------------
# Step 5: Cache DataFrame in Memory
# ------------------------------------------------------------
# cache() only marks the DataFrame for caching and returns it; the count()
# chained onto it is the action that forces a full pass over the data so
# the cache is actually populated.
df.cache().count()
print("💾 Data cached in memory successfully")

# ------------------------------------------------------------
# Step 6: Run Aggregation (In-Memory)
# ------------------------------------------------------------
# Same per-segment aggregation as the uncached run, executed after df has
# been cached and materialized, so the two timings can be compared.
#
# time.perf_counter() is used for the measurement because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
summary_mem = (
    df.groupBy("segment")
      .agg(avg("txn_value").alias("avg_txn"), _sum("txn_value").alias("total_txn"))
)
summary_mem.show(4)  # show() is the action that actually runs the query
end_time = time.perf_counter()

print(f"⚡ Query time (with in-memory caching): {end_time - start_time:.2f} sec")

# ------------------------------------------------------------
# Step 7: Additional Analytics - Top Customers
# ------------------------------------------------------------
# Total spend per customer, sorted from highest to lowest; only the first
# five rows are displayed.
print("\n🏆 Identifying Top 5 Customers by Total Spend...")

# time.perf_counter() is used for the measurement because it is a monotonic,
# high-resolution clock; time.time() can jump if the system clock changes.
start_time = time.perf_counter()

top_spenders = (
    df.groupBy("cust_id")
      .agg(_sum("txn_value").alias("total_spent"))
      .orderBy(col("total_spent").desc())
)
top_spenders.show(5)  # show() is the action that actually runs the query

end_time = time.perf_counter()
print(f"🧮 Execution time (top customers): {end_time - start_time:.2f} sec")

# ------------------------------------------------------------
# Step 8: Wrap Up
# ------------------------------------------------------------
# Shut the session down first, then print the closing recap line by line.
spark.stop()

closing_lines = (
    "\n✅ Spark session terminated cleanly",
    "📊 Summary:",
    " - Compared runtime with and without in-memory caching",
    " - Demonstrated Spark DataFrame caching impact on performance",
    " - Extracted top 5 high-value customers successfully",
)
for closing_line in closing_lines:
    print(closing_line)


🚀 Spark session initialized successfully
✅ Generated synthetic dataset with 1,000,000 rows
📦 Spark DataFrame created and ready for processing
+--------+------------------+--------------------+
| segment|           avg_txn|           total_txn|
+--------+------------------+--------------------+
|Platinum|505.24028937682357|1.2600591769000104E8|
|  Silver|504.36755270173677|1.2599202339999925E8|
|    Gold| 503.7946137469375| 1.261269967300008E8|
|  Bronze|505.38293141036445|1.2657113364000013E8|
+--------+------------------+--------------------+

🕒 Query time (without caching): 12.84 sec
💾 Data cached in memory successfully
+--------+------------------+--------------------+
| segment|           avg_txn|           total_txn|
+--------+------------------+--------------------+
|Platinum|505.24028937682357|1.2600591769000104E8|
|  Silver|504.36755270173677|1.2599202339999925E8|
|    Gold| 503.7946137469375| 1.261269967300008E8|
|  Bronze|505.38293141036445|1.2657113364000013E8|
+--------+---