In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
import pandas as pd

In [0]:

# Tiny pandas DataFrame used as a quick sanity check of the environment
df = pd.DataFrame(
    {"name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}
)

# Preview the first two rows
df.head(2)



In [0]:
# Raw sales rows: (product_id, region, category, sale_amount, sale_date)
data = [
    ("P1", "North", "Electronics", 1200.50, "2025-04-01"),
    ("P2", "North", "Electronics", 950.75, "2025-04-01"),
    ("P3", "North", "Electronics", 850.00, "2025-04-01"),
    ("P4", "North", "Electronics", 200.00, "2025-04-01"),
    ("P1", "South", "Furniture", 500.00, "2025-04-01"),
    ("P5", "South", "Furniture", 1500.00, "2025-04-01"),
    ("P6", "South", "Furniture", 700.00, "2025-04-01"),
    ("P7", "East", "Grocery", 300.00, "2025-04-01"),
    ("P8", "East", "Grocery", 1200.00, "2025-04-01"),
    ("P9", "East", "Grocery", 1000.00, "2025-04-01"),
    ("P10", "East", "Grocery", 50.00, "2025-04-01"),
]

# Every field is nullable; sale_date is loaded as text and parsed afterwards
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("product_id", StringType()),
            ("region", StringType()),
            ("category", StringType()),
            ("sale_amount", DoubleType()),
            ("sale_date", StringType()),
        ]
    ]
)

# Build the DataFrame and turn sale_date into a real date column in one chain
sales_df = (
    spark.createDataFrame(data, schema)
    .withColumn("sale_date", to_date("sale_date"))
)

# Print the rows without truncating column values
sales_df.show(truncate=False)


In [0]:
"""
For each region and category, find:
Top 3 selling products by revenue
"""
# Total revenue of every product inside its (region, category)
aggregated_df = (
    sales_df
    .groupBy("region", "category", "product_id")
    .agg(sum("sale_amount").alias("total_revenue"))
)

# Rank products within each (region, category), highest revenue first.
# dense_rank gives tied products the same rank, so more than three rows
# can survive the filter below when revenues tie.
window_spec = (
    Window
    .partitionBy("region", "category")
    .orderBy(col("total_revenue").desc())
)
ranking_df = aggregated_df.withColumn("rank", dense_rank().over(window_spec))

# Keep ranks 1 to 3
top3_products_df = ranking_df.where(col("rank") <= 3)
top3_products_df.show()

In [0]:
"""
Average revenue of top 3 products
"""
# Revenue has to be averaged per PRODUCT, not per sale row: a product with
# several sale rows would otherwise be weighted once per row and the result
# would no longer be "average revenue of the top 3 products".
# So first collapse sales to one revenue figure per product, then keep only
# the products that made the top 3 of their (region, category).
top3_sales_df = (
    sales_df
    .groupBy("region", "category", "product_id")
    .agg(sum("sale_amount").alias("product_revenue"))
    .join(
        top3_products_df.select("region", "category", "product_id"),
        on=["region", "category", "product_id"],
        how="inner",
    )
)

# Average the per-product revenue of the top products in each region-category
avg_top3_df = top3_sales_df.groupBy("region", "category").agg(
    avg("product_revenue").alias("avg_top3_revenue")
)
avg_top3_df.show()

In [0]:
# Percentage contribution of each top product to its category-region revenue

# Revenue of every product inside its (region, category)
aggregated_df = (
    sales_df
    .groupBy("region", "category", "product_id")
    .agg(sum("sale_amount").alias("top_product_revenue"))
)

# Rank products by revenue within each (region, category) and keep ranks 1 to 3
window_spec = (
    Window
    .partitionBy("region", "category")
    .orderBy(col("top_product_revenue").desc())
)
ranking_df = aggregated_df.withColumn("rank", dense_rank().over(window_spec))
top3_products_df = ranking_df.where(col("rank") <= 3)

# Total revenue of each (region, category) across all of its products
total_reg_cate_df = (
    sales_df
    .groupBy("region", "category")
    .agg(sum("sale_amount").alias("total_revenue"))
)

# Attach the region-category total to every top product
joined_df = total_reg_cate_df.join(
    top3_products_df,
    on=["region", "category"],
    how="inner",
)

# Share of the region-category revenue earned by each top product, in percent
top_product_cont_df = joined_df.withColumn(
    "contribution",
    col("top_product_revenue") / col("total_revenue") * 100,
)
top_product_cont_df.show()

In [0]:
"""
Detect First and Last Purchase of Customer (Lifecycle Analysis)
Dataset:
orders: order_id, customer_id, order_date, order_amount

🛠️ Task:
For each customer:
Get first and last purchase date
Calculate days between first and last order
Flag if a customer is "One-Time Buyer" or "Repeat Buyer"
Also, calculate average order value
"""

In [0]:
# Schema of the 'orders' DataFrame; order_date is loaded as text and cast below
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("order_id", IntegerType()),
            ("customer_id", IntegerType()),
            ("order_date", StringType()),
            ("order_amount", FloatType()),
        ]
    ]
)

# Rows: (order_id, customer_id, order_date, order_amount)
data = [
    (1, 101, "2024-01-01", 150.0),
    (2, 101, "2024-03-15", 200.0),
    (3, 102, "2024-01-10", 300.0),
    (4, 103, "2024-01-12", 250.0),
    (5, 104, "2024-02-01", 120.0),
    (6, 104, "2024-04-10", 180.0)
]

# Build the DataFrame and cast order_date from string to date in one chain
orders_df = (
    spark.createDataFrame(data, schema)
    .withColumn("order_date", col("order_date").cast("date"))
)
orders_df.show()

In [0]:
# Get first and last purchase date per customer
first_last_df = orders_df.groupBy("customer_id").agg(
    min("order_date").alias("first_purchase_date"),
    max("order_date").alias("last_purchase_date")
)
# Calculate days between first and last order
# (0 when the first and last purchase fall on the same date)
day_diff_df = first_last_df.withColumn(
    "day_diff",
    datediff(col("last_purchase_date"), col("first_purchase_date"))
)
day_diff_df.show()

# Flag if a customer is "One-Time Buyer" or "Repeat Buyer"
customer_analysis_df = orders_df.groupBy("customer_id").agg(
    countDistinct("order_id").alias("total_orders")
)
customers_flags_df = customer_analysis_df.withColumn(
    "buyer_type",
    when(col("total_orders") > 1, "Repeat_Buyer").otherwise("One_Time_Buyer")
)
# Show the frame under the name it was assigned to; the previous
# `customer_flags_df` was never defined and raised a NameError.
customers_flags_df.show()

# Also, calculate average order value per customer
avg_order_val_df = orders_df.groupBy("customer_id").agg(
    avg("order_amount").alias("avg_order_value")
)
# Rank customers by average order value, highest first.
# The window has no partition, so all rows are ranked together.
window_spec = Window.orderBy(desc("avg_order_value"))
ranking_df = avg_order_val_df.withColumn("rank", dense_rank().over(window_spec))
ranking_df.show()

In [0]:
"""
Time Series Trend Analysis Per Product
Dataset:
daily_sales: product_id, sale_date, units_sold

🛠️ Task:
For each product:
Calculate 7-day moving average of sales
Identify upward or downward trend using 3 consecutive increases/decreases
Flag spike days (where sales > 2 * moving avg)"""

In [0]:
# Schema of daily_sales; sale_date is loaded as text and parsed below
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("product_id", IntegerType()),
            ("sale_date", StringType()),
            ("units_sold", IntegerType()),
        ]
    ]
)

# Rows: (product_id, sale_date, units_sold)
data = [
    (101, "2024-04-01", 50),
    (101, "2024-04-02", 55),
    (101, "2024-04-03", 53),
    (101, "2024-04-04", 60),
    (101, "2024-04-05", 65),
    (101, "2024-04-06", 70),
    (101, "2024-04-07", 75),
    (101, "2024-04-08", 150),  # Spike
    (101, "2024-04-09", 80),
    (102, "2024-04-01", 20),
    (102, "2024-04-02", 22),
    (102, "2024-04-03", 25),
    (102, "2024-04-04", 23),
    (102, "2024-04-05", 24),
    (102, "2024-04-06", 21),
    (102, "2024-04-07", 19),
    (102, "2024-04-08", 18),
    (102, "2024-04-09", 35)  # Spike
]

# Build the DataFrame and parse sale_date into a date column in one chain
daily_sales_df = (
    spark.createDataFrame(data, schema)
    .withColumn("sale_date", to_date("sale_date"))
)
daily_sales_df.show()

In [0]:
# All windows are partitioned by "product_id", the product column that
# daily_sales_df actually has (there is no "product" column).

# Calculate 7-day moving average of sales:
# the current row plus the six rows before it, per product, in date order.
# NOTE(review): a row-based frame equals "7 days" only if every product has
# exactly one row per calendar day - confirm for real data.
window = Window.partitionBy("product_id").orderBy("sale_date").rowsBetween(-6, 0)
moving_avg_df = daily_sales_df.withColumn(
    "moving_avg",
    avg("units_sold").over(window)
)

# Identify upward or downward trend using 3 consecutive increases/decreases
window_spec = Window.partitionBy("product_id").orderBy("sale_date")
product_trends_df = daily_sales_df.withColumn(
    "prev_day_sale",
    lag("units_sold").over(window_spec)
)
# Day-over-day direction. Unchanged sales are "flat" so they break a streak
# instead of being counted as a decrease.
sales_analysis_df = product_trends_df.withColumn(
    "direction",
    when(col("units_sold") > col("prev_day_sale"), "up")
    .when(col("units_sold") < col("prev_day_sale"), "down")
    .otherwise("flat")
)

# Identifying consecutive streaks (gaps-and-islands):
# "rn" numbers the rows of a product in date order, "rn_in_direction" numbers
# them within one direction only. Their difference stays constant while the
# direction does not change, so it identifies one uninterrupted streak.
# The first row of each product has no previous day and is dropped first so
# both row numbers are computed over the same set of rows.
windows = Window.partitionBy("product_id").orderBy("sale_date")
direction_window = Window.partitionBy("product_id", "direction").orderBy("sale_date")
random_rows_df = (
    sales_analysis_df
    .filter("prev_day_sale is not null")
    .withColumn("rn", row_number().over(windows))
    .withColumn("rn_in_direction", row_number().over(direction_window))
)
conse_grop_df = random_rows_df.withColumn(
    "grouping",
    col("rn") - col("rn_in_direction")
)

# One row per streak: its direction, date span and length in days
aggregated_df = conse_grop_df.groupBy("product_id", "direction", "grouping").agg(
    min("sale_date").alias("streak_start"),
    max("sale_date").alias("streak_end"),
    count("*").alias("streak_length")
)

# A trend is a streak of at least 3 consecutive increases or decreases
trend_df = aggregated_df.filter(
    (col("direction") != "flat") & (col("streak_length") >= 3)
).withColumn(
    "trend",
    when(col("direction") == "up", "upward").otherwise("downward")
)

# Assigning ranks to the trends of each product, longest streak first
window_rank = Window.partitionBy("product_id").orderBy(desc("streak_length"))
product_cons_rank_df = trend_df.withColumn("rank", dense_rank().over(window_rank))
product_cons_rank_df.show()

# Flag spike days (where sales > 2 * moving avg)
# Each day is compared with its own moving average. No join is needed because
# moving_avg_df already carries units_sold and moving_avg on the same row.
# Note that moving_avg includes the day itself, which raises the bar for a
# spike; use a frame such as rowsBetween(-7, -1) if the baseline should
# exclude the current day.
filtered_df = moving_avg_df.filter(col("units_sold") > 2 * col("moving_avg"))
filtered_df.show()

In [0]:
# One record per user, with a nested array of events
data = [
    {
        "user_id": "u1",
        "events": [
            {"event_type": "click", "timestamp": "2024-01-01T10:00:00"},
            {"event_type": "purchase", "timestamp": "2024-01-01T10:05:00"},
        ],
    }
]

# events is an array of structs; timestamps are loaded as strings
schema = StructType(
    [
        StructField("user_id", StringType(), True),
        StructField(
            "events",
            ArrayType(
                StructType(
                    [
                        StructField("event_type", StringType(), True),
                        StructField("timestamp", StringType(), True),
                    ]
                )
            ),
            True,
        ),
    ]
)

json_df = spark.createDataFrame(data, schema)

# Exploding the events array: one output row per (user, event)
exploded_df = json_df.withColumn("event", explode("events"))

# Flatten the event struct into top-level columns
final_df = exploded_df.select(
    col("user_id"),
    col("event.event_type").alias("event_type"),
    col("event.timestamp").alias("timestamp"),
)

# Parse the string timestamp so it can be ordered and subtracted
final_df = final_df.withColumn("timestamp", to_timestamp("timestamp"))

# For each event, calculate time since previous event (per user)
window_spec = Window.partitionBy("user_id").orderBy("timestamp")
time_diff_df = final_df.withColumn(
    "previous_event_time", lag("timestamp").over(window_spec)
).withColumn(
    "time_diff_seconds",
    unix_timestamp(col("timestamp")) - unix_timestamp(col("previous_event_time")),
).withColumn(
    "time_diff_readable",
    # Format the duration arithmetically. from_unixtime would treat the
    # number of seconds as a point in time, so the text would shift with the
    # session time zone and wrap around after 24 hours.
    # Stays null for the first event of a user (no previous event).
    when(
        col("time_diff_seconds").isNotNull(),
        format_string(
            "%02d:%02d:%02d",
            floor(col("time_diff_seconds") / 3600),
            floor((col("time_diff_seconds") % 3600) / 60),
            col("time_diff_seconds") % 60,
        ),
    ),
)
# The first event of each user has nothing to compare with, so hide it
time_diff_df.filter("previous_event_time IS Not Null").show()

In [0]:
"""
Complex Pivot with Aggregation and Ratio Calculation
Dataset:
transactions: user_id, category, amount, transaction_date

🛠️ Task:
Pivot to get total amount spent per category as separate columns
Add total amount and ratio of each category to total
"""
# Rows: (user_id, category, amount, transaction_date)
data = [
    ("u1", "grocery", 120.0, "2024-01-01"),
    ("u1", "electronics", 500.0, "2024-01-02"),
    ("u1", "grocery", 80.0, "2024-01-03"),
    ("u2", "grocery", 200.0, "2024-01-01"),
    ("u2", "fashion", 300.0, "2024-01-02"),
    ("u3", "fashion", 150.0, "2024-01-03"),
    ("u3", "electronics", 700.0, "2024-01-04")
]

# transaction_date is loaded as text and parsed into a date right after
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("user_id", StringType()),
            ("category", StringType()),
            ("amount", DoubleType()),
            ("transaction_date", StringType()),
        ]
    ]
)

# Build the DataFrame and convert transaction_date to a date column
transactions_df = (
    spark.createDataFrame(data, schema)
    .withColumn("transaction_date", to_date("transaction_date"))
)

transactions_df.show(truncate=False)

In [0]:
# One row per user, one column per category holding the summed amount
pivoted_df = (
    transactions_df
    .groupBy("user_id")
    .pivot("category")
    .sum("amount")
)

# Total amount spent by each user across all categories
aggregated_df = (
    transactions_df
    .groupBy("user_id")
    .agg(sum("amount").alias("total_amount"))
)

# Put the per-category amounts and the total side by side
joined_df = pivoted_df.join(aggregated_df, on="user_id", how="inner")
joined_df.show()

# Share of the user's total per category, rounded to 2 decimals.
# A category the user never bought is null after the pivot, so it is
# treated as 0 before dividing.
ratio_df = joined_df.select(
    "*",
    *[
        round(coalesce(col(category), lit(0)) / col("total_amount"), 2)
        .alias(f"{category}_ratio")
        for category in ("electronics", "fashion", "grocery")
    ],
)

ratio_df.show()

In [0]:
# Category ratios computed for whatever category columns the pivot produced
from pyspark.sql.functions import col, round, coalesce, lit

# Columns currently present on the joined frame
basic_columns = joined_df.columns

# Everything except the key and the total is a pivoted category column
category_columns = [
    name for name in basic_columns if name not in ("user_id", "total_amount")
]

# Output layout: user_id, the category amounts, total_amount, then one
# "<category>_ratio" column per category (null amounts count as 0)
final_cols = [
    col("user_id"),
    *[col(name) for name in category_columns],
    col("total_amount"),
    *[
        round(coalesce(col(name), lit(0)) / col("total_amount"), 2).alias(f"{name}_ratio")
        for name in category_columns
    ],
]

# Project all columns in a single select
dynamic_ratio_df = joined_df.select(*final_cols)

dynamic_ratio_df.show()


In [0]:
"""
Financial Transactions – Fraud Pattern Detection
Dataset:
transactions: transaction_id, account_id, amount, timestamp, location

🛠️ Tasks:
For each account:
Find if more than 3 transactions occur within 1 minute → mark as "suspicious burst"
Flag any transaction where location changes between two transactions < 5 minutes apart
Output flagged transactions with reason (burst, suspicious_location)
"""

In [0]:
# Schema of the transactions feed; timestamp is loaded as text and parsed below
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("transaction_id", IntegerType()),
            ("account_id", IntegerType()),
            ("amount", IntegerType()),
            ("timestamp", StringType()),
            ("location", StringType()),
        ]
    ]
)

# Rows: (transaction_id, account_id, amount, timestamp, location)
data = [
    (1, 101, 500, "2025-04-29 10:00:00", "New York"),
    (2, 101, 300, "2025-04-29 10:01:00", "New York"),
    (3, 101, 100, "2025-04-29 10:02:00", "New York"),
    (4, 101, 200, "2025-04-29 10:03:00", "New York"),
    (5, 101, 150, "2025-04-29 10:05:00", "Los Angeles"),
    (6, 102, 700, "2025-04-29 09:50:00", "Chicago"),
    (7, 102, 300, "2025-04-29 09:51:00", "Chicago"),
    (8, 102, 200, "2025-04-29 09:52:00", "Chicago"),
    (9, 102, 150, "2025-04-29 09:53:00", "Chicago"),
    (10, 102, 100, "2025-04-29 09:54:00", "Chicago"),
    (11, 103, 1000, "2025-04-29 10:30:00", "San Francisco"),
    (12, 103, 200, "2025-04-29 10:35:00", "San Francisco"),
    (13, 103, 300, "2025-04-29 10:36:00", "San Francisco"),
    (14, 103, 100, "2025-04-29 10:40:00", "Los Angeles")
]

# Build the DataFrame and parse the timestamp strings in one chain
df = (
    spark.createDataFrame(data, schema)
    .withColumn("timestamp", to_timestamp("timestamp"))
)

# Print the rows without truncating column values
df.show(truncate=False)

In [0]:
# Find if more than 3 transactions occur within 1 minute → mark as "suspicious burst"

# Converting timestamp to epoch seconds so a numeric range frame can be used
df = df.withColumn("time_in_seconds", unix_timestamp(col("timestamp")))

# Rolling 1-minute window per account: the current transaction plus every
# transaction up to 60 seconds before it.
# The frame must be ordered by the numeric time_in_seconds column: a numeric
# rangeBetween over a timestamp ordering is rejected by Spark as a data type
# mismatch between the order column and the frame boundaries.
window_spec = Window.partitionBy("account_id").orderBy("time_in_seconds").rangeBetween(-60, 0)

# Counting the transactions of the account that fall inside the rolling window
aggregated_df = df.withColumn(
    "total_transactions",
    count("transaction_id").over(window_spec)
)

# Keeping only transactions that close a window holding more than 3 transactions
suspecious_df = aggregated_df.filter(col("total_transactions") > 3)
suspecious_df = suspecious_df.withColumn("transaction_flag", lit("suspecious_burst"))

suspecious_burst_df = suspecious_df.select(
    col("transaction_id"),
    col("account_id"),
    col("timestamp"),
    col("location"),
    col("transaction_flag")
)

# Flag any transaction where location changes between two transactions < 5 minutes apart

# Window to look up the previous transaction of the same account
window = Window.partitionBy("account_id").orderBy("timestamp")

# Creating prev location and prev timestamp columns
transaction_details_df = df.withColumn(
    "prev_location",
    lag("location").over(window)
).withColumn(
    "prev_timestamp",
    lag("timestamp").over(window)
)

# Seconds elapsed since the previous transaction of the account
transaction_analysis_df = transaction_details_df.withColumn(
    "time_diff",
    (unix_timestamp(col("timestamp")) - unix_timestamp(col("prev_timestamp")))
)

# Location changed less than 300 seconds (5 minutes) after the previous
# transaction. The first transaction of an account has null prev_* values,
# so both comparisons are null and the row is not flagged.
suspecious_loc_df = transaction_analysis_df.filter(
    (col("time_diff") < 300) &
    (col("location") != col("prev_location"))
)

suspecious_loc_df = suspecious_loc_df.withColumn("transaction_flag", lit("suspecious_location"))

suspecious_loc_df = suspecious_loc_df.select(
    col("transaction_id"),
    col("account_id"),
    col("timestamp"),
    col("location"),
    col("transaction_flag")
)

# Both frames have the same columns in the same order, so a positional union
# is safe. A transaction matching both rules appears once per reason.
suspecious_trans_df = suspecious_burst_df.union(suspecious_loc_df)
suspecious_trans_df.show()

In [0]:
"""
Cart Abandonment Tracking
Dataset:
cart_events: user_id, event_type (add_to_cart, purchase), timestamp

🛠️ Tasks:
For each cart session (based on 1-hour inactivity):
Count number of items added
Whether the cart was purchased
Calculate cart abandonment rate
Final output per user per session
"""

In [0]:
# Schema of cart_events; event_type is 'add_to_cart' or 'purchase'
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("user_id", "event_type", "timestamp")
    ]
)

# Rows: (user_id, event_type, timestamp)
data = [
    ("user_1", "add_to_cart", "2025-04-29 10:00:00"),
    ("user_1", "add_to_cart", "2025-04-29 10:10:00"),
    ("user_1", "purchase",     "2025-04-29 10:20:00"),
    ("user_1", "add_to_cart", "2025-04-29 12:00:00"),
    ("user_2", "add_to_cart", "2025-04-29 11:00:00"),
    ("user_2", "add_to_cart", "2025-04-29 11:45:00"),
    ("user_2", "add_to_cart", "2025-04-29 13:00:00"),
    ("user_2", "purchase",     "2025-04-29 13:30:00"),
    ("user_3", "add_to_cart", "2025-04-29 14:00:00"),
]

# Build the DataFrame and parse the timestamp strings in one chain
df = (
    spark.createDataFrame(data, schema)
    .withColumn("timestamp", to_timestamp("timestamp"))
)

# Inspect the resulting schema and rows
df.printSchema()
df.show(truncate=False)

In [0]:
class CartAbandonmentTracking:
    """Cart abandonment analysis over cart events.

    Events need the columns user_id, event_type and timestamp. Sessions are
    split per user whenever more than 60 minutes pass between two events.
    """

    def read_file(self, path):
        """Read a CSV file with a header row.

        Args:
            path: Location of the CSV data to load.

        Returns:
            The loaded DataFrame, or None when reading fails (the error is
            printed instead of being raised).
        """
        try:
            df = spark.read.format('csv').option('header', True).load(path)
            return df
        except Exception as e:
            print(f"Error occured while file reading: {e}")
            return None

    def user_analysis(self, df):
        """Compute the cart abandonment rate per user.

        Args:
            df: DataFrame with user_id, event_type and timestamp columns.

        Returns:
            DataFrame with one row per user: user_id, total_sessions,
            abondoned_sessions and abondonment_rate (in percent).
        """
        # Ensure timestamp is in correct format
        df = df.withColumn("timestamp", col("timestamp").cast("timestamp"))

        # Create lag column to get previous event timestamp
        window_spec = Window.partitionBy("user_id").orderBy("timestamp")
        df = df.withColumn(
            "previous_timestamp",
            lag("timestamp").over(window_spec)
        )

        # Checking time difference in minutes to create sessions
        df = df.withColumn(
            "time_diff_minutes",
            (unix_timestamp(col("timestamp")) - unix_timestamp(col("previous_timestamp"))) / 60
        )

        # Flag when a new session starts: the user's first event (no previous
        # timestamp) or more than 60 minutes of inactivity
        df = df.withColumn(
            "session_flag",
            when(col("time_diff_minutes").isNull() | (col("time_diff_minutes") > 60), 1).otherwise(0)
        )

        # Generate incremental session IDs: the running sum of session starts
        # in timestamp order numbers the sessions of each user 1, 2, 3, ...
        df = df.withColumn(
            "session_number",
            sum("session_flag").over(window_spec)
        )

        # Aggregating total addtocart and purchase count per session.
        # The session column is "session_number" (created above); grouping by
        # a "sessionID" column failed because no such column exists.
        agg_df = df.groupBy("user_id", "session_number").agg(
            count(when(col("event_type") == "add_to_cart", True)).alias("total_events_add"),
            count(when(col("event_type") == "purchase", True)).alias("total_purchases")
        )

        # Count user sessions and abandoned sessions (sessions without a purchase)
        user_session_stats_df = agg_df.groupBy("user_id").agg(
            count("session_number").alias("total_sessions"),
            count(when(col("total_purchases") == 0, True)).alias("abondoned_sessions")
        )

        # Abandonment rate per user, in percent. The column read here must
        # match the "abondoned_sessions" alias defined just above.
        result_df = user_session_stats_df.withColumn(
            "abondonment_rate",
            (col("abondoned_sessions") / col("total_sessions")) * 100
        )
        return result_df
    
# Create an instance; binding the class itself would make the path argument
# fill the `self` slot and the method calls fail with a TypeError.
cart_trac_inst = CartAbandonmentTracking()

# Reading file from path (DBFS URIs use a single slash after the scheme)
# NOTE(review): path corrected from "dbfs://FileStore/..." - confirm the
# location of the cart tracking data.
df = cart_trac_inst.read_file("dbfs:/FileStore/raw/carts_tracking")

# read_file returns None when loading fails, so only analyse a loaded frame
if df is not None:
    result = cart_trac_inst.user_analysis(df)

    result.show()
else:
    print("Cart tracking data could not be loaded; skipping analysis")


In [0]:
"""
Shift Scheduling Conflicts
Dataset:
employee_shifts: emp_id, shift_start, shift_end

🛠️ Tasks:
Identify employees with overlapping shifts
Report all conflict pairs: emp_id, conflict_start, conflict_end
Use a self-join on emp_id with time-overlap logic: two shifts A and B overlap when A.shift_start < B.shift_end and B.shift_start < A.shift_end
"""

In [0]:
# Schema of employee_shifts; both shift boundaries are loaded as strings
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("emp_id", IntegerType()),
            ("shift_start", StringType()),
            ("shift_end", StringType()),
        ]
    ]
)

# Rows: (emp_id, shift_start, shift_end), timestamps as strings
data = [
    (101, "2023-05-01 09:00:00", "2023-05-01 13:00:00"),
    (101, "2023-05-01 12:00:00", "2023-05-01 16:00:00"),  # Overlap
    (101, "2023-05-01 17:00:00", "2023-05-01 20:00:00"),  # No overlap
    (102, "2023-05-02 08:00:00", "2023-05-02 12:00:00"),
    (102, "2023-05-02 11:00:00", "2023-05-02 15:00:00"),  # Overlap
    (103, "2023-05-03 10:00:00", "2023-05-03 14:00:00"),  # No conflict
]

# Load the rows with string timestamps first
df = spark.createDataFrame(data, schema)

# Re-project the frame with both boundaries parsed into real timestamps
df = df.select(
    "emp_id",
    to_timestamp(col("shift_start"), "yyyy-MM-dd HH:mm:ss").alias("shift_start"),
    to_timestamp(col("shift_end"), "yyyy-MM-dd HH:mm:ss").alias("shift_end"),
)

# Print the rows without truncating column values
df.show(truncate=False)

In [0]:
# Pair every shift with the later-starting shifts of the same employee that
# begin before it ends. The list of conditions is AND-ed together by join().
overlap_shifts_df = df.alias("df1").join(
    df.alias("df2"),
    on=[
        col("df1.emp_id") == col("df2.emp_id"),
        # strict "<" keeps each pair once and never pairs a shift with itself
        col("df1.shift_start") < col("df2.shift_start"),
        col("df1.shift_end") > col("df2.shift_start"),
        col("df1.shift_start") < col("df2.shift_end"),
    ],
    how="inner",
)

# df1 is the earlier shift; the later shift (df2) is reported as the conflict
shifts_conflicts_df = overlap_shifts_df.selectExpr(
    "df1.emp_id AS emp_id",
    "df1.shift_start AS shift1_start",
    "df1.shift_end AS shift1_end",
    "df2.shift_start AS conflict_start",
    "df2.shift_end AS conflict_end",
)

shifts_conflicts_df.show(truncate=False)

In [0]:
"""
Product Lifecycle Movement
Dataset:
product_events: product_id, event_type, event_date
(event types: launched, promoted, discounted, discontinued)

🛠️ Tasks:
For each product:
Track how long it stays in each state
Output the full lifecycle with duration
Flag products that never reached discontinued state
"""

In [0]:
# Schema of product_events; event_date is loaded as text and parsed below
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("product_id", "event_type", "event_date")
    ]
)

# Rows: (product_id, event_type, event_date)
data = [
    ("P1", "launched", "2023-01-01"),
    ("P1", "promoted", "2023-02-01"),
    ("P1", "discounted", "2023-03-01"),
    ("P1", "discontinued", "2023-04-01"),

    # P2 never reaches the discontinued state
    ("P2", "launched", "2023-01-10"),
    ("P2", "promoted", "2023-02-15"),
    ("P2", "discounted", "2023-03-15"),

    # P3 goes straight from launched to discontinued
    ("P3", "launched", "2023-05-01"),
    ("P3", "discontinued", "2023-06-01"),

    # P4 is only launched
    ("P4", "launched", "2023-07-01"),
]

# Build the DataFrame and parse event_date (yyyy-MM-dd) into a date column
df = (
    spark.createDataFrame(data, schema)
    .withColumn("event_date", to_date(col("event_date"), "yyyy-MM-dd"))
)

# Print the rows without truncating column values
df.show(truncate=False)

In [0]:
# Product lifecycle
# Window over the events of one product in chronological order
window_spec = Window.partitionBy("product_id").orderBy("event_date")

# Neighbouring event dates: the previous one for reference, the next one to
# know when the current state ended
prev_events_df = df.withColumn(
    "prev_event_date",
    lag("event_date").over(window_spec)
).withColumn(
    "next_event_date",
    lead("event_date").over(window_spec)
)

# Days the product stayed in each state: from the event that started the
# state until the next event. Measuring from the previous event would put the
# duration on the row of the following state instead.
# The latest state of a product has no next event, so its duration is null.
# datediff is used (as elsewhere in this notebook) because date_diff only
# exists in newer Spark releases.
event_durations_df = prev_events_df.withColumn(
    "duration",
    datediff(col("next_event_date"), col("event_date"))
)
event_durations_df.show()

# Filtering only discontinued events, one row per product so the join below
# cannot multiply rows
filtered_df = event_durations_df.filter(col("event_type") == "discontinued")
filtered_df = filtered_df.select(
    col("product_id"),
    col("event_type").alias("discontinued_event")
).dropDuplicates(["product_id"])

# Joining products which have a discontinued event with the original dataframe;
# the left join keeps products without one (discontinued_event is null)
joined_df = df.join(filtered_df, on="product_id", how="left")

# Flagging products which never reached the discontinued state
flagged_df = joined_df.withColumn(
    "is_discontinued",
    when(col("discontinued_event").isNull(), 'No').otherwise('Yes')
)
# The flag is identical on every event row of a product, so keep one row each
final_df = flagged_df.select("product_id", "discontinued_event", "is_discontinued").dropDuplicates(["product_id"])

final_df.show()


In [0]:
"""
Build User Graph (Advanced Joins)
Dataset:

messages: sender_id, receiver_id, message_time

🛠️ Tasks:
For each user, find:
Total number of distinct connections (sent or received)
Top 3 most messaged people
Average message frequency (messages/day)
"""

In [0]:
# No session is created here: the SparkSession builder call is left disabled
# and the existing `spark` object is used.
#spark = SparkSession.builder.appName("UserGraph").getOrCreate()

# Schema of messages; message_time is loaded as text and parsed below
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("sender_id", "receiver_id", "message_time")
    ]
)

# Rows: (sender_id, receiver_id, message_time)
data = [
    ("U1", "U2", "2023-01-01 10:00:00"),
    ("U2", "U1", "2023-01-01 11:00:00"),
    ("U1", "U3", "2023-01-02 09:30:00"),
    ("U3", "U1", "2023-01-02 10:00:00"),
    ("U1", "U2", "2023-01-03 08:00:00"),
    ("U2", "U4", "2023-01-03 09:00:00"),
    ("U4", "U2", "2023-01-03 10:00:00"),
    ("U1", "U5", "2023-01-04 12:00:00"),
    ("U1", "U2", "2023-01-05 14:00:00"),
    ("U2", "U1", "2023-01-06 16:00:00"),
    ("U5", "U1", "2023-01-07 17:00:00")
]

# Build the DataFrame and parse message_time into a timestamp in one chain
df = (
    spark.createDataFrame(data, schema)
    .withColumn("message_time", to_timestamp("message_time"))
)

# Print the rows without truncating column values
df.show(truncate=False)


In [0]:
 # For each user Total number of distinct connections (sent or received)

# Every message already links its sender to its receiver, so the connections
# can be read straight off the rows. Self-joining the table on sender_id (or
# receiver_id) yields the same (user, connection) pairs, only repeated once
# per pair of matching rows, which grows quadratically with message count.

# Connections seen from the sender's side: the people the user wrote to
sender_con_df = df.select(
    col("sender_id").alias("user_id"),
    col("receiver_id").alias("connection_id")
)

# Connections seen from the receiver's side: the people who wrote to the user
receiver_con_df = df.select(
    col("receiver_id").alias("user_id"),
    col("sender_id").alias("connection_id")
)

# Both frames share the (user_id, connection_id) layout, so a positional
# union is safe
merge_df = sender_con_df.union(receiver_con_df)

# Aggregating total connections per user; countDistinct collapses repeated
# conversations and connections that exist in both directions
agg_df = merge_df.groupBy("user_id").agg(
    countDistinct("connection_id").alias("total_connections")
)
agg_df.show()


In [0]:
# Top 3 most messaged people per user
# Number of messages each user (sender) sent to each person (receiver)
total_messages_df = df.groupBy("sender_id", "receiver_id").agg(
    count("*").alias("total_messages")
)

# Ranking receivers for each sender: "most messaged people" of a user are the
# people that user wrote to most, so the ranking is done per sender_id.
# (Partitioning by receiver_id would answer "who messages this user most".)
window_spec = Window.partitionBy("sender_id").orderBy(desc("total_messages"))
ranking_df = total_messages_df.withColumn(
    "rnk",
    dense_rank().over(window_spec)
)

# dense_rank keeps ties on the same rank, so a user can have more than
# three rows when message counts tie
filtered_df = ranking_df.filter(col("rnk") <= 3)
filtered_df.show()

In [0]:
# Average message frequency (messages/day)

# Aggregating total messages and active days per sender.
# message_time is a full timestamp, so it is reduced to its calendar date
# before counting; counting distinct timestamps would count messages rather
# than days and make the frequency 1.0 for almost every user.
total_mes_df = df.groupBy("sender_id").agg(
    count("*").alias("total_messages"),
    countDistinct(to_date("message_time")).alias("active_days")
)

# Message frequency for each user: messages sent per day with at least one
# sent message
msg_fre_df = total_mes_df.withColumn(
    "frequency",
    (col("total_messages") / col("active_days"))
)
msg_fre_df.show()


In [0]:
"""
Hierarchical Rollup Aggregation (Multilevel Grouping)
Dataset:
sales: region, country, state, product, revenue

🛠️ Tasks:
Output:
Total revenue at each level: state, country, region
Subtotals (e.g., country = All, region = All)
Include a grand total
"""

In [0]:
# Schema of sales; all fields are nullable
schema = StructType([
    StructField("region", StringType(), True),
    StructField("country", StringType(), True),
    StructField("state", StringType(), True),
    StructField("product", StringType(), True),
    StructField("revenue", DoubleType(), True)
])

# Rows: (region, country, state, product, revenue)
data = [
    ("North America", "USA", "California", "Product A", 1000.0),
    ("North America", "USA", "Texas", "Product B", 1500.0),
    ("North America", "Canada", "Ontario", "Product C", 800.0),
    ("Europe", "Germany", "Bavaria", "Product D", 1200.0),
    ("Europe", "Germany", "Berlin", "Product E", 900.0),
    ("Europe", "France", "Paris", "Product F", 1100.0)
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Show the data
df.show()


# Apply rollup on region, country, state: produces state-level rows,
# country subtotals, region subtotals and one grand total.
# grouping(<col>) is 1 on rows where that column was rolled up (a subtotal)
# and 0 where the column holds a real group value.
result = (
    df.rollup("region", "country", "state")
      .agg(
          sum("revenue").alias("total_revenue"),
          grouping("region").alias("region_rolled_up"),
          grouping("country").alias("country_rolled_up"),
          grouping("state").alias("state_rolled_up"),
      )
)

# Label subtotal levels with 'All'. The grouping flags are used instead of
# coalesce so that a genuinely missing region/country/state in the data stays
# null and is not mistaken for a subtotal row.
final_result = (
    result.select(
        when(col("region_rolled_up") == 1, lit("All")).otherwise(col("region")).alias("region"),
        when(col("country_rolled_up") == 1, lit("All")).otherwise(col("country")).alias("country"),
        when(col("state_rolled_up") == 1, lit("All")).otherwise(col("state")).alias("state"),
        "total_revenue"
    )
    .orderBy("region", "country", "state")
)

final_result.show()
