[Question 1](https://datalemur.com/questions/trade-in-payouts) [Question 2](https://datalemur.com/questions/follow-up-airpod-percentage)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType,TimestampType
from datetime import datetime


In [0]:
# Question 1 input: trade_in_transactions — one row per traded-in device,
# carrying the device model, the store that took it, and the trade-in date.
transactions_schema = StructType(
    [
        StructField(field_name, field_type, True)  # every column is nullable
        for field_name, field_type in (
            ("transaction_id", IntegerType()),
            ("model_id", IntegerType()),
            ("store_id", IntegerType()),
            ("transaction_date", DateType()),
        )
    ]
)

# Sample rows. Dates are written as text and parsed day-first ("%d/%m/%Y")
# into datetime.date objects so they match the DateType column.
transactions_data = [
    (txn_id, model_id, store_id, datetime.strptime(raw_date, "%d/%m/%Y").date())
    for txn_id, model_id, store_id, raw_date in (
        (1, 112, 512, "01/01/2022"),
        (2, 113, 512, "01/01/2022"),
    )
]

# Materialise the rows with the explicit schema and print a preview.
df_transactions = spark.createDataFrame(transactions_data, schema=transactions_schema)
df_transactions.show()

In [0]:
# Question 1 input: trade_in_payouts — the payout amount offered per device model.
payouts_schema = StructType(
    [
        StructField(field_name, field_type, True)  # every column is nullable
        for field_name, field_type in (
            ("model_id", IntegerType()),
            ("model_name", StringType()),
            ("payout_amount", IntegerType()),
        )
    ]
)

# Sample rows: (model_id, model_name, payout_amount).
payouts_data = [
    (111, "iPhone 11", 200),
    (112, "iPhone 12", 350),
    (113, "iPhone 13", 450),
    (114, "iPhone 13 Pro Max", 650),
]

# Materialise the rows with the explicit schema and print a preview.
df_payouts = spark.createDataFrame(payouts_data, schema=payouts_schema)
df_payouts.show()

In [0]:
from pyspark.sql.functions import sum as _sum

# Total trade-in payout per store (DataFrame API):
#   1. inner-join every transaction to the payout row of its model,
#   2. add up the payout amounts per store,
#   3. list the store with the largest total first.
df_joined = (
    df_transactions
    .join(
        df_payouts,
        on=df_transactions.model_id == df_payouts.model_id,
        how="inner",
    )
    .groupBy(df_transactions.store_id)
    .agg(_sum(df_payouts.payout_amount).alias("total_payout"))
    .sort("total_payout", ascending=False)
)

display(df_joined)

In [0]:
# Expose both DataFrames to Spark SQL under the table names the query below
# refers to. An existing temp view with the same name is replaced.
df_transactions.createOrReplaceTempView("trade_in_transactions")
df_payouts.createOrReplaceTempView("trade_in_payouts")

# SQL version of the DataFrame-API aggregation in the previous cell:
# inner-join each trade-in to its model's payout, total the payout per store,
# and order the stores by that total, largest first.
sql_query = """
SELECT 
  transactions.store_id, 
  SUM(payouts.payout_amount) AS total_payout
FROM trade_in_transactions AS transactions
INNER JOIN trade_in_payouts AS payouts
  ON transactions.model_id = payouts.model_id
GROUP BY transactions.store_id
ORDER BY total_payout DESC
"""

# Run the query against the temp views registered above.
df_result = spark.sql(sql_query)

# Render the per-store totals.
# NOTE(review): `display` is not defined in this file — presumably supplied by
# the notebook runtime; confirm before running elsewhere.
display(df_result)

Question 2: Follow-up AirPod percentage

In [0]:
# Question 2 input: one row per product purchase, with the buying customer and
# the purchase timestamp. This rebinds `transactions_schema`,
# `transactions_data` and `df_transactions`, which the Question 1 cells above
# also use.
transactions_schema = StructType(
    [
        StructField(field_name, field_type, True)  # every column is nullable
        for field_name, field_type in (
            ("transaction_id", IntegerType()),
            ("customer_id", IntegerType()),
            ("product_name", StringType()),
            ("transaction_timestamp", TimestampType()),
        )
    ]
)

# Sample rows. Timestamps are written as text and parsed day-first
# ("%d/%m/%Y %H:%M:%S") into datetime objects for the TimestampType column.
# NOTE(review): if the source data is month-first, "09/05/2022" etc. end up on
# different calendar dates than intended — confirm the format.
transactions_data = [
    (txn_id, customer_id, product, datetime.strptime(raw_ts, "%d/%m/%Y %H:%M:%S"))
    for txn_id, customer_id, product, raw_ts in (
        (1, 101, "iPhone", "08/08/2022 00:00:00"),
        (2, 101, "AirPods", "08/08/2022 00:00:00"),
        (5, 301, "iPhone", "09/05/2022 00:00:00"),
        (6, 301, "iPad", "09/06/2022 00:00:00"),
        (7, 301, "AirPods", "09/07/2022 00:00:00"),
    )
]

# Materialise the rows with the explicit schema.
df_transactions = spark.createDataFrame(transactions_data, schema=transactions_schema)


In [0]:
# Preview the purchase log: up to 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out).
df_transactions.show(n=20, truncate=True)

In [0]:
from pyspark.sql.window import Window
# Explicit imports instead of `import *`: the wildcard rebinds Python builtins
# such as sum/min/max/round to their Spark column functions for every later cell.
from pyspark.sql.functions import col, lead, when

# Goal: percentage of customers whose purchase immediately after an iPhone
# was AirPods.

# Walk each customer's purchases in chronological order. transaction_id is only
# a tie-breaker for purchases that share the same timestamp.
w = Window.partitionBy("customer_id").orderBy("transaction_timestamp", "transaction_id")

df = (
    df_transactions
    # `lead` returns the customer's NEXT purchase (null on their last one).
    # The column keeps its original name "previous" so existing consumers of
    # `df` continue to work, even though it holds the following product.
    .withColumn("previous", lead("product_name").over(w))
    # 1 on an iPhone row that is directly followed by AirPods, otherwise 0.
    .withColumn(
        "count",
        when(
            (col("product_name") == "iPhone") & (col("previous") == "AirPods"), 1
        ).otherwise(0),
    )
)

# Denominator: every distinct customer in the purchase log.
total_customers = df.select("customer_id").distinct().count()

# Numerator: customers with at least one iPhone -> AirPods follow-up. Each
# customer is counted once, so repeat follow-ups by the same customer cannot
# push the percentage above 100.
sum_count = df.filter(col("count") == 1).select("customer_id").distinct().count()

# An empty purchase log yields 0.0 instead of a division error.
result = (sum_count / total_customers) * 100 if total_customers else 0.0
print(result)

# Display the DataFrame and the result