<a href="https://colab.research.google.com/github/Sreekar-Kandhadai/pyspark-interview-questions/blob/main/Uber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
Problem Statement
Let's imagine we are working at Uber, and our task is to determine the most profitable location based on signup duration and transaction amounts.
We are provided with two datasets: one containing signup details (including start and stop times) and another containing transaction details (such as amounts).

Our goal is to calculate:
The average signup duration in minutes for each location.
The average transaction amount for each location.
The ratio of the average transaction amount to the average signup duration.
Sort the results by the highest ratio to identify the most profitable location.

In [5]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Obtain (or reuse) the Spark session that backs every DataFrame below.
spark = (
    SparkSession.builder
    .appName("uber")
    .getOrCreate()
)

# --- Signups: one row per signup, timestamps kept as strings ----------------
columns_signups = ["signup_id", "signup_start_date", "signup_stop_date", "plan_id", "location"]
data_signups = [
    (1, '2020-01-01 10:00:00', '2020-01-01 12:00:00', 101, 'New York'),
    (2, '2020-01-02 11:00:00', '2020-01-02 13:00:00', 102, 'Los Angeles'),
    (3, '2020-01-03 10:00:00', '2020-01-03 14:00:00', 103, 'Chicago'),
    (4, '2020-01-04 09:00:00', '2020-01-04 10:30:00', 101, 'San Francisco'),
    (5, '2020-01-05 08:00:00', '2020-01-05 11:00:00', 102, 'New York'),
]
signups_df = spark.createDataFrame(data_signups, schema=columns_signups)

signups_df.show()

# --- Transactions: zero or more rows per signup, linked by signup_id --------
columns_transactions = ["transaction_id", "signup_id", "transaction_start_date", "amt"]
data_transactions = [
    (1, 1, '2020-01-01 10:30:00', 50.00),
    (2, 1, '2020-01-01 11:00:00', 30.00),
    (3, 2, '2020-01-02 11:30:00', 100.00),
    (4, 2, '2020-01-02 12:00:00', 75.00),
    (5, 3, '2020-01-03 10:30:00', 120.00),
    (6, 4, '2020-01-04 09:15:00', 80.00),
    (7, 5, '2020-01-05 08:30:00', 90.00),
]
transactions_df = spark.createDataFrame(data_transactions, schema=columns_transactions)

transactions_df.show()

# --- Step 1: duration of every signup, in minutes ---------------------------
# unix_timestamp() turns each timestamp string into epoch seconds; the
# difference divided by 60 is the signup duration in minutes.
# NOTE(review): parsing uses the Spark session time zone — confirm that is
# acceptable if real data can span a DST change.
signups_df = signups_df.withColumn(
    "signup_duration_minutes",
    (unix_timestamp(col("signup_stop_date")) - unix_timestamp(col("signup_start_date"))) / 60,
)

signups_df.show()

# --- Step 2: average signup duration per location ---------------------------
# Aggregated from the signups table alone, so every signup counts exactly once
# regardless of how many transactions (if any) it has.
location_duration_df = signups_df.groupBy("location").agg(
    avg("signup_duration_minutes").alias("avg_signup_duration")
)

location_duration_df.show()

# --- Step 3: average transaction amount per location ------------------------
# Tag each transaction with its signup's location, then average the raw
# amounts. Averaging per signup first and then averaging those averages would
# weight signups equally instead of transactions and give a different number
# (e.g. New York: 65.0 instead of (50 + 30 + 90) / 3 = 56.67).
joined_df = transactions_df.join(
    signups_df.select("signup_id", "location"),
    "signup_id",
    "inner",
)

joined_df.show()

transaction_avg_df = joined_df.groupBy("location").agg(
    avg("amt").alias("avg_transaction_amt")
)

transaction_avg_df.show()

# --- Step 4: profitability ratio, most profitable location first ------------
# Inner join keeps only locations that have at least one transaction. The
# when/otherwise guard yields 0 instead of dividing by a zero average duration.
result_df = (
    location_duration_df
    .join(transaction_avg_df, "location", "inner")
    .withColumn(
        "ratio",
        when(
            col("avg_signup_duration") != 0,
            col("avg_transaction_amt") / col("avg_signup_duration"),
        ).otherwise(0),
    )
    .orderBy(col("ratio").desc())
)

result_df.show()

+---------+-------------------+-------------------+-------+-------------+
|signup_id|  signup_start_date|   signup_stop_date|plan_id|     location|
+---------+-------------------+-------------------+-------+-------------+
|        1|2020-01-01 10:00:00|2020-01-01 12:00:00|    101|     New York|
|        2|2020-01-02 11:00:00|2020-01-02 13:00:00|    102|  Los Angeles|
|        3|2020-01-03 10:00:00|2020-01-03 14:00:00|    103|      Chicago|
|        4|2020-01-04 09:00:00|2020-01-04 10:30:00|    101|San Francisco|
|        5|2020-01-05 08:00:00|2020-01-05 11:00:00|    102|     New York|
+---------+-------------------+-------------------+-------+-------------+

+--------------+---------+----------------------+-----+
|transaction_id|signup_id|transaction_start_date|  amt|
+--------------+---------+----------------------+-----+
|             1|        1|   2020-01-01 10:30:00| 50.0|
|             2|        1|   2020-01-01 11:00:00| 30.0|
|             3|        2|   2020-01-02 11:30:00|100.