In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
import pandas as pd

In [0]:

# Tiny three-row pandas frame (name, age) used as a quick smoke test.
df = pd.DataFrame(
    data=dict(
        name=["Alice", "Bob", "Charlie"],
        age=[25, 30, 35],
    ),
    columns=["name", "age"],
)

# Preview only the first two rows.
df.head(2)



In [0]:
# Sample sales rows: (product_id, region, category, sale_amount, sale_date)
data = [
    ("P1", "North", "Electronics", 1200.50, "2025-04-01"),
    ("P2", "North", "Electronics", 950.75, "2025-04-01"),
    ("P3", "North", "Electronics", 850.00, "2025-04-01"),
    ("P4", "North", "Electronics", 200.00, "2025-04-01"),
    ("P1", "South", "Furniture", 500.00, "2025-04-01"),
    ("P5", "South", "Furniture", 1500.00, "2025-04-01"),
    ("P6", "South", "Furniture", 700.00, "2025-04-01"),
    ("P7", "East", "Grocery", 300.00, "2025-04-01"),
    ("P8", "East", "Grocery", 1200.00, "2025-04-01"),
    ("P9", "East", "Grocery", 1000.00, "2025-04-01"),
    ("P10", "East", "Grocery", 50.00, "2025-04-01"),
]

# Every field is nullable; sale_date is loaded as a plain string and
# converted to a date right after the DataFrame is created.
schema = (
    StructType()
    .add("product_id", StringType(), True)
    .add("region", StringType(), True)
    .add("category", StringType(), True)
    .add("sale_amount", DoubleType(), True)
    .add("sale_date", StringType(), True)
)

# Build the Spark DataFrame and parse sale_date into a date column
sales_df = (
    spark.createDataFrame(data, schema)
    .withColumn("sale_date", to_date(col("sale_date")))
)

# Print all rows without truncating long cell values
sales_df.show(truncate=False)


In [0]:
"""
For each region and category, find:
Top 3 selling products by revenue
"""
# Total revenue per product inside each (region, category).
# `sum` is pyspark.sql.functions.sum (star-imported above), not the builtin.
aggregated_df = (
    sales_df
    .groupBy("region", "category", "product_id")
    .agg(sum("sale_amount").alias("total_revenue"))
)

# Rank products within their (region, category), highest revenue first.
# dense_rank gives tied products the same rank, so a tie can let more than
# three products through the filter below.
window_spec = (
    Window
    .partitionBy("region", "category")
    .orderBy(col("total_revenue").desc())
)
ranking_df = aggregated_df.withColumn("rank", dense_rank().over(window_spec))

# Keep ranks 1 to 3 only
top3_products_df = ranking_df.where(col("rank") <= 3)
top3_products_df.show()

In [0]:
"""
Average revenue of top 3 products
"""
# Keep only the sale rows that belong to a top-3 product of their
# (region, category). Only the key columns of top3_products_df are used.
top3_sales_df = sales_df.join(
    top3_products_df.select("region", "category", "product_id"),
    on=["region", "category", "product_id"],
    how="inner",
)

# Roll the sale rows up to ONE revenue figure per product first. Averaging
# sale_amount directly would average individual sale rows, so a product with
# many small sales would drag the figure down and the result would no longer
# be "average revenue of the top 3 products".
top3_product_revenue_df = top3_sales_df.groupBy("region", "category", "product_id").agg(
    sum("sale_amount").alias("product_revenue")
)

# Average of the per-product revenue totals for each region-category
avg_top3_df = top3_product_revenue_df.groupBy("region", "category").agg(
    avg("product_revenue").alias("avg_top3_revenue")
)
avg_top3_df.show()

In [0]:
# Percentage contribution of each top product to its category-region revenue

# Revenue per product within each (region, category)
aggregated_df = (
    sales_df
    .groupBy("region", "category", "product_id")
    .agg(sum("sale_amount").alias("top_product_revenue"))
)

# Rank products by revenue inside their (region, category); ties share a rank
window_spec = (
    Window
    .partitionBy("region", "category")
    .orderBy(col("top_product_revenue").desc())
)
ranking_df = aggregated_df.withColumn("rank", dense_rank().over(window_spec))
top3_products_df = ranking_df.where(col("rank") <= 3)

# Total revenue of every (region, category), across all of its products
total_reg_cate_df = (
    sales_df
    .groupBy("region", "category")
    .agg(sum("sale_amount").alias("total_revenue"))
)

# Attach the region-category total to each top-3 product row
joined_df = total_reg_cate_df.join(
    top3_products_df,
    on=["region", "category"],
    how="inner",
)

# Share of the region-category total held by each top product, in percent
revenue_share = col("top_product_revenue") / col("total_revenue")
top_product_cont_df = joined_df.withColumn("contribution", revenue_share * 100)
top_product_cont_df.show()

In [0]:
"""
Detect First and Last Purchase of Customer (Lifecycle Analysis)
Dataset:
orders: order_id, customer_id, order_date, order_amount

🛠️ Task:
For each customer:
Get first and last purchase date
Calculate days between first and last order
Flag if a customer is "One-Time Buyer" or "Repeat Buyer"
Also, calculate average order value
"""

In [0]:
# Schema of the sample 'orders' DataFrame; every field is nullable and
# order_date arrives as a string.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", IntegerType()),
        ("customer_id", IntegerType()),
        ("order_date", StringType()),
        ("order_amount", FloatType()),
    )
])

# Sample rows: (order_id, customer_id, order_date, order_amount)
data = [
    (1, 101, "2024-01-01", 150.0),
    (2, 101, "2024-03-15", 200.0),
    (3, 102, "2024-01-10", 300.0),
    (4, 103, "2024-01-12", 250.0),
    (5, 104, "2024-02-01", 120.0),
    (6, 104, "2024-04-10", 180.0),
]

# Build the DataFrame, then cast the string order_date to a proper date
orders_df = (
    spark.createDataFrame(data, schema)
    .withColumn("order_date", col("order_date").cast(DateType()))
)
orders_df.show()

In [0]:
# Customer lifecycle analysis on orders_df.
# `min`, `max` and `avg` below are the pyspark.sql.functions aggregates
# (star-imported at the top of the notebook), not the Python builtins.

# Get first and last purchase date
first_last_df = orders_df.groupBy("customer_id").agg(
    min("order_date").alias("first_purchase_date"),
    max("order_date").alias("last_purchase_date")
)
# Calculate days between first and last order (0 for a single-order customer)
day_diff_df = first_last_df.withColumn(
    "day_diff",
    datediff(col("last_purchase_date"), col("first_purchase_date"))
)
day_diff_df.show()

# Flag if a customer is "One-Time Buyer" or "Repeat Buyer"
customer_analysis_df = orders_df.groupBy("customer_id").agg(
    countDistinct("order_id").alias("total_orders")
)
customers_flags_df = customer_analysis_df.withColumn(
    "buyer_type",
    when(col("total_orders") > 1, "Repeat_Buyer").otherwise("One_Time_Buyer")
)
# Show the DataFrame that was just built. The previous call referenced the
# undefined name `customer_flags_df` and raised NameError.
customers_flags_df.show()

# Also, calculate average order value
avg_order_val_df = orders_df.groupBy("customer_id").agg(
    avg("order_amount").alias("avg_order_value")
)
# No partitionBy: customers are ranked against each other globally,
# highest average order value first.
window_spec = Window.orderBy(desc("avg_order_value"))
ranking_df = avg_order_val_df.withColumn("rank", dense_rank().over(window_spec))
ranking_df.show()

In [0]:
"""
Time Series Trend Analysis Per Product
Dataset:
daily_sales: product_id, sale_date, units_sold

🛠️ Task:
For each product:
Calculate 7-day moving average of sales
Identify upward or downward trend using 3 consecutive increases/decreases
Flag spike days (where sales > 2 * moving avg)"""

In [0]:
# Schema for the daily sales sample; sale_date is read as a string first
schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("sale_date", StringType(), True)
    .add("units_sold", IntegerType(), True)
)

# Sample rows: (product_id, sale_date, units_sold)
data = [
    # product 101
    (101, "2024-04-01", 50),
    (101, "2024-04-02", 55),
    (101, "2024-04-03", 53),
    (101, "2024-04-04", 60),
    (101, "2024-04-05", 65),
    (101, "2024-04-06", 70),
    (101, "2024-04-07", 75),
    (101, "2024-04-08", 150),  # Spike
    (101, "2024-04-09", 80),
    # product 102
    (102, "2024-04-01", 20),
    (102, "2024-04-02", 22),
    (102, "2024-04-03", 25),
    (102, "2024-04-04", 23),
    (102, "2024-04-05", 24),
    (102, "2024-04-06", 21),
    (102, "2024-04-07", 19),
    (102, "2024-04-08", 18),
    (102, "2024-04-09", 35),  # Spike
]

# Build the DataFrame and parse sale_date into a date column
daily_sales_df = (
    spark.createDataFrame(data, schema)
    .withColumn("sale_date", to_date(col("sale_date")))
)
daily_sales_df.show()

In [0]:
# Time-series trend analysis per product on daily_sales_df.
# The key column is "product_id"; daily_sales_df has no "product" column, so
# every window below partitions by "product_id".

# Calculate 7-day moving average of sales: the current row plus the six rows
# before it. rowsBetween counts rows, not calendar days.
# NOTE(review): assumes one row per product per day - confirm for real data.
window = Window.partitionBy("product_id").orderBy("sale_date").rowsBetween(-6, 0)
moving_avg_df = daily_sales_df.withColumn(
    "moving_avg",
    avg("units_sold").over(window)
)

# Identify upward trend using consecutive day-over-day increases
window_spec = Window.partitionBy("product_id").orderBy("sale_date")
product_trends_df = daily_sales_df.withColumn(
    "prev_day_sale",
    lag("units_sold").over(window_spec)
)
# "Yes" only on a strict increase; the first day of a product has no previous
# sale (null comparison) and therefore falls into "No".
sales_analysis_df = product_trends_df.withColumn(
    "isHigher",
    when(col("units_sold") > col("prev_day_sale"), "Yes").otherwise("No")
)

# Identifying consecutive growth streaks.
# The row number must be assigned AFTER keeping only the growth days:
# numbering every row makes (sale_date - rn) constant for any run of
# contiguous dates, which lumps all days into one group instead of
# separating the streaks.
windows = Window.partitionBy("product_id").orderBy("sale_date")
random_rows_df = (
    sales_analysis_df
    .filter("prev_day_sale is not null")
    .filter(col("isHigher") == "Yes")
    .withColumn("rn", row_number().over(windows))
)

# Growth days that are consecutive calendar days share the same
# (sale_date - rn) value; a gap between growth days changes it.
conse_grop_df = random_rows_df.withColumn(
    "grouping",
    date_sub(col("sale_date"), col("rn"))
)
# Streak length = number of growth days in the group. A streak with
# total_growths >= 3 is an upward trend per the task statement.
# NOTE(review): downward streaks are not computed here; "No" also covers
# flat days, so it cannot be reused as-is for decreases.
aggregated_df = conse_grop_df.groupBy("product_id", "grouping").agg(
    sum(when(col("isHigher") == "Yes", 1)).alias("total_growths")
)

# Rank the streaks of each product, longest first
window_rank = Window.partitionBy("product_id").orderBy(desc("total_growths"))
product_cons_rank_df = aggregated_df.withColumn("rank", dense_rank().over(window_rank))
product_cons_rank_df.show()

# Flag spike days (where sales > 2 * moving avg)

# Take the moving average of the latest sale_date of every product. The frame
# spans the whole partition, so last() returns the value of the final row.
window_last = (
    Window.partitionBy("product_id")
    .orderBy("sale_date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
product_moving_avg_df = moving_avg_df.withColumn("last_mov_avg", last("moving_avg").over(window_last))
# Reduce to exactly one row per product. distinct() over all columns would
# keep one row per day and multiply the rows in the join below.
product_moving_avg_df = product_moving_avg_df.select("product_id", "last_mov_avg").distinct()

# Joining with original dataframe on the product key
joined_df = daily_sales_df.join(product_moving_avg_df, on="product_id", how="inner")

# Keep the days whose sales exceed twice the moving average.
# NOTE(review): every day is compared with the product's latest moving
# average, not with the moving average of that same day - confirm intent.
filtered_df = joined_df.filter(col("units_sold") > 2 * col("last_mov_avg"))
filtered_df.show()

In [0]:
# One user record holding a nested array of events
data = [
    {
        "user_id": "u1",
        "events": [
            {"event_type": "click", "timestamp": "2024-01-01T10:00:00"},
            {"event_type": "purchase", "timestamp": "2024-01-01T10:05:00"},
        ],
    }
]

# user_id plus an array of (event_type, timestamp) structs; timestamps are
# strings at this point.
schema = StructType(
    [
        StructField("user_id", StringType(), True),
        StructField(
            "events",
            ArrayType(
                StructType(
                    [
                        StructField("event_type", StringType(), True),
                        StructField("timestamp", StringType(), True),
                    ]
                )
            ),
            True,
        ),
    ]
)

json_df = spark.createDataFrame(data, schema)

# Exploding the json dataframe with explode function: one row per event
exploded_df = json_df.withColumn("event", explode("events"))

# Flatten the event struct into top-level columns
final_df = exploded_df.select(
    col("user_id"),
    col("event.event_type").alias("event_type"),
    col("event.timestamp").alias("timestamp"),
)

final_df = final_df.withColumn("timestamp", to_timestamp("timestamp"))

# For each event, calculate time since previous event (per user)
window_spec = Window.partitionBy("user_id").orderBy("timestamp")

# Render the elapsed seconds as HH:mm:ss with plain arithmetic.
# from_unixtime() would treat the duration as an epoch timestamp, so the text
# would shift with the session time zone and wrap around after 24 hours.
# The when() guard keeps the value null for the first event of a user, where
# there is no previous event.
elapsed_seconds = col("time_diff_seconds")
readable_duration = when(
    elapsed_seconds.isNotNull(),
    format_string(
        "%02d:%02d:%02d",
        floor(elapsed_seconds / 3600),
        floor((elapsed_seconds % 3600) / 60),
        elapsed_seconds % 60,
    ),
)

time_diff_df = final_df.withColumn(
    "previous_event_time", lag("timestamp").over(window_spec)
).withColumn(
    "time_diff_seconds",
    unix_timestamp(col("timestamp")) - unix_timestamp(col("previous_event_time")),
).withColumn(
    "time_diff_readable",
    readable_duration
)
# The first event of each user has nothing to compare against; hide it
time_diff_df.filter("previous_event_time IS Not Null").show()

In [0]:
"""
Complex Pivot with Aggregation and Ratio Calculation
Dataset:
transactions: user_id, category, amount, transaction_date

🛠️ Task:
Pivot to get total amount spent per category as separate columns
Add total amount and ratio of each category to total
"""
# Sample rows: (user_id, category, amount, transaction_date)
data = [
    ("u1", "grocery", 120.0, "2024-01-01"),
    ("u1", "electronics", 500.0, "2024-01-02"),
    ("u1", "grocery", 80.0, "2024-01-03"),
    ("u2", "grocery", 200.0, "2024-01-01"),
    ("u2", "fashion", 300.0, "2024-01-02"),
    ("u3", "fashion", 150.0, "2024-01-03"),
    ("u3", "electronics", 700.0, "2024-01-04"),
]

# All fields nullable; transaction_date is read as a string and converted
# to a date once the DataFrame exists.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_id", StringType()),
        ("category", StringType()),
        ("amount", DoubleType()),
        ("transaction_date", StringType()),
    )
])

# Build the DataFrame and parse transaction_date
transactions_df = (
    spark.createDataFrame(data, schema)
    .withColumn("transaction_date", to_date(col("transaction_date")))
)

transactions_df.show(truncate=False)

In [0]:
# One row per user with one column per category holding the summed amount.
# A category the user never bought from comes out as null.
pivoted_df = (
    transactions_df
    .groupBy("user_id")
    .pivot("category")
    .sum("amount")
)

# Total amount spent by each user across all categories
aggregated_df = (
    transactions_df
    .groupBy("user_id")
    .agg(sum("amount").alias("total_amount"))
)

# Put the per-category columns and the user total side by side
joined_df = pivoted_df.join(aggregated_df, on="user_id", how="inner")
joined_df.show()

# Share of each category in the user's total, rounded to 2 decimals.
# coalesce turns a missing category into 0 so its ratio is 0 rather than null.
# `round` is pyspark.sql.functions.round (star-imported), not the builtin.
user_total = col("total_amount")
ratio_df = joined_df.select(
    "*",
    round(coalesce(col("electronics"), lit(0)) / user_total, 2).alias("electronics_ratio"),
    round(coalesce(col("fashion"), lit(0)) / user_total, 2).alias("fashion_ratio"),
    round(coalesce(col("grocery"), lit(0)) / user_total, 2).alias("grocery_ratio"),
)

ratio_df.show()

In [0]:
# Dynamic way to calculate category ratios 
from pyspark.sql.functions import col, round, coalesce, lit

# Columns currently present on the joined (pivot + total) DataFrame
basic_columns = joined_df.columns

# Everything except the key and the total is a pivoted category column
category_columns = [
    name for name in basic_columns if name not in {"user_id", "total_amount"}
]

# One "<category>_ratio" expression per category: the category's share of the
# user's total, with a missing category counted as 0, rounded to 2 decimals.
ratio_columns = [
    round(coalesce(col(name), lit(0)) / col("total_amount"), 2).alias(f"{name}_ratio")
    for name in category_columns
]

# Output order: user_id, category amounts, total_amount, then the ratios
final_cols = [
    col("user_id"),
    *map(col, category_columns),
    col("total_amount"),
    *ratio_columns,
]

# Project everything in a single select
dynamic_ratio_df = joined_df.select(*final_cols)

dynamic_ratio_df.show()
