# Ride-Sharing Trips Analytics (Uber / Ola–style)

# Create the Dataset in PySpark

In [1]:
data = [
("T001","Amit","Hyderabad","Ramesh","Sedan",12.5,320,28,"UPI","Completed"),
("T002","Neha","Bangalore","Suresh","Mini",8.2,210,22,"Card","Completed"),
("T003","Rahul","Delhi","Anil","Bike",5.1,120,15,"Cash","Completed"),
("T004","Pooja","Mumbai","Vikas","SUV",18.0,560,45,"UPI","Cancelled"),
("T005","Arjun","Chennai","Kumar","Mini",7.8,200,20,"UPI","Completed"),
("T006","Sneha","Hyderabad","Ramesh","Sedan",14.2,360,32,"Card","Completed"),
("T007","Karan","Delhi","Anil","Bike",6.3,140,18,"UPI","Completed"),
("T007","Karan","Delhi","Anil","Bike",6.3,140,18,"UPI","Completed"),
("T008","Riya","Bangalore","Suresh","Sedan",11.0,300,27,"Wallet","Completed"),
("T009","Vikas","Mumbai","Vijay","SUV",20.5,650,50,"Card","Completed"),
("T010","Anjali","Chennai","Kumar","Bike",4.9,110,14,"Cash","Complete"),
("T011","Farhan","Delhi","Anil","Mini",9.6,240,25,"UPI","Completed"),
("T012","Megha","Hyderabad","Ramesh","SUV",19.2,610,48,"Card","Cancelled"),
("T013","Suresh","Bangalore","Suresh","Sedan",13.0,340,30,"UPI","Completed"),
("T014","Divya","Mumbai","Vikas","Mini",10.2,260,26,"Wallet","Completed"),
("T015","Nikhil","Delhi","Anil","Sedan",15.5,390,34,"UPI","Completed"),
("T016","Kavya","Chennai","Kumar","Sedan",12.1,315,29,"UPI","Completed"),
("T017","Rohit","Hyderabad","Ramesh","SUV",22.0,700,55,"Card","Completed"),
("T018","Simran","Bangalore","Suresh","Bike",5.8,130,16,"Cash","Completed"),
("T019","Ayesha","Mumbai","Vijay","Mini",9.9,250,24,"UPI","Completed"),
("T020","Manish","Delhi","Anil","Bike",6.0,135,17,"Wallet","Completed"),
("T021","Priya","Hyderabad","Ramesh","Sedan",14.8,380,33,"Card","Completed"),
("T022","Yash","Chennai","Kumar","SUV",21.3,680,52,"UPI","Completed"),
("T023","Naina","Bangalore","Suresh","Mini",10.7,270,28,"UPI","Completed"),
("T024","Sameer","Mumbai","Vikas","Sedan",13.9,350,31,"Wallet","Completed"),
("T025","Ritika","Delhi","Anil","Bike",5.4,125,16,"Cash","Completed"),
("T026","Gopal","Hyderabad","Ramesh","Mini",8.9,225,23,"UPI","Completed"),
("T027","Tina","Bangalore","Suresh","Sedan",12.6,330,29,"Card","Completed"),
("T028","Irfan","Mumbai","Vijay","SUV",23.4,740,58,"Card","Completed"),
("T029","Sahil","Chennai","Kumar","Mini",9.4,235,24,"UPI","Completed"),
("T030","Lavanya","Delhi","Anil","Sedan",14.1,365,32,"Wallet","Completed"),
("T031","Deepak","Hyderabad","Ramesh","Bike",6.7,150,18,"Cash","Completed"),
("T032","Shweta","Bangalore","Suresh","Mini",10.0,255,26,"UPI","Completed"),
("T033","Aman","Mumbai","Vikas","Sedan",15.8,395,35,"Card","Completed"),
("T034","Rekha","Chennai","Kumar","Sedan",13.5,345,30,"UPI","Completed"),
("T035","Zubin","Delhi","Anil","SUV",24.0,760,60,"Card","Completed"),
("T036","Pallavi","Hyderabad","Ramesh","Mini",9.1,230,23,"Wallet","Completed"),
("T037","Naveen","Bangalore","Suresh","Bike",5.9,135,17,"UPI","Completed"),
("T038","Sonia","Mumbai","Vijay","SUV",21.7,690,54,"Card","Completed"),
("T039","Harish","Chennai","Kumar","Mini",8.5,215,21,"Cash","Completed"),
("T040","Kriti","Delhi","Anil","Sedan",14.6,375,33,"UPI","Completed"),
("T041","Apoorva","Hyderabad","Ramesh","Sedan",13.2,335,30,"Card","Completed"),
("T042","Mohit","Bangalore","Suresh","SUV",19.9,620,49,"UPI","Completed"),
("T043","Tanvi","Mumbai","Vikas","Mini",10.4,265,27,"Wallet","Completed"),
("T044","Rakesh","Chennai","Kumar","Bike",6.2,140,18,"Cash","Completed"),
("T045","Isha","Delhi","Anil","Mini",9.7,245,25,"UPI","Completed")
]
columns = [
"trip_id","rider_name","city","driver_name","vehicle_type",
"distance_km","trip_fare","trip_duration_minutes",
"payment_mode","trip_status"
]


# EXERCISES — MEDIUM LEVEL

# CSV, JSON, PARQUET (Ride-Sharing Use Case)

# SECTION A — CSV

# Exercise 1

Write the full dataset to CSV with header enabled.
Output:
trips_csv/

In [2]:
from pyspark.sql import SparkSession

# Initialize SparkSession if not already initialized
spark = SparkSession.builder.appName("RideSharingAnalytics").getOrCreate()

# Create DataFrame
df = spark.createDataFrame(data, schema=columns)

# Write DataFrame to CSV with header enabled
df.write.option("header", "true").csv("trips_csv/")

print("Dataset successfully written to trips_csv/ with header enabled.")

Dataset successfully written to trips_csv/ with header enabled.


# Exercise 2

Read the CSV and filter:
trip_fare > 400
trip_status = "Completed"

In [3]:
from pyspark.sql.functions import col

# Read the CSV file with header enabled
df_read = spark.read.option("header", "true").csv("trips_csv/")

# Filter the DataFrame
filtered_df = df_read.filter((col("trip_fare") > 400) & (col("trip_status") == "Completed"))

# Show the filtered data
filtered_df.show()

+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T028|     Irfan|   Mumbai|      Vijay|         SUV|       23.4|      740|                   58|        Card|  Completed|
|   T035|     Zubin|    Delhi|       Anil|         SUV|       24.0|      760|                   60|        Card|  Completed|
|   T038|     Sonia|   Mumbai|      Vijay|         SUV|       21.7|      690|                   54|        Card|  Completed|
|   T042|     Mohit|Bangalore|     Suresh|         SUV|       19.9|      620|                   49|         UPI|  Completed|
|   T009|     Vikas|   Mumbai|      Vijay|         SUV|       20.5|      650|                   50|        Card|  Completed|


# Exercise 3

From CSV, select:
trip_id
city
vehicle_type
trip_fare
Sort by trip_fare descending.

In [4]:
from pyspark.sql.functions import col, desc

selected_and_sorted_df = df_read.select("trip_id", "city", "vehicle_type", col("trip_fare").cast("double")) \
                                  .orderBy(desc("trip_fare"))


selected_and_sorted_df.show()

+-------+---------+------------+---------+
|trip_id|     city|vehicle_type|trip_fare|
+-------+---------+------------+---------+
|   T035|    Delhi|         SUV|    760.0|
|   T028|   Mumbai|         SUV|    740.0|
|   T017|Hyderabad|         SUV|    700.0|
|   T038|   Mumbai|         SUV|    690.0|
|   T022|  Chennai|         SUV|    680.0|
|   T009|   Mumbai|         SUV|    650.0|
|   T042|Bangalore|         SUV|    620.0|
|   T012|Hyderabad|         SUV|    610.0|
|   T004|   Mumbai|         SUV|    560.0|
|   T033|   Mumbai|       Sedan|    395.0|
|   T015|    Delhi|       Sedan|    390.0|
|   T021|Hyderabad|       Sedan|    380.0|
|   T040|    Delhi|       Sedan|    375.0|
|   T030|    Delhi|       Sedan|    365.0|
|   T006|Hyderabad|       Sedan|    360.0|
|   T024|   Mumbai|       Sedan|    350.0|
|   T034|  Chennai|       Sedan|    345.0|
|   T013|Bangalore|       Sedan|    340.0|
|   T041|Hyderabad|       Sedan|    335.0|
|   T027|Bangalore|       Sedan|    330.0|
+-------+--

# Exercise 4

Write only Bike trips to CSV using delimiter | .

In [5]:
from pyspark.sql.functions import col

bike_trips_df = df_read.filter(col("vehicle_type") == "Bike")
bike_trips_df.write.option("header", "true").option("sep", "|").csv("bike_trips_csv/")
print("Bike trips successfully written to bike_trips_csv/ with '|' delimiter.")

Bike trips successfully written to bike_trips_csv/ with '|' delimiter.


# SECTION B — JSON

# Exercise 5

Write only trips from Mumbai to JSON.
Output:
mumbai_trips_json/

In [6]:
from pyspark.sql.functions import col

mumbai_trips_df = df_read.filter(col("city") == "Mumbai")

mumbai_trips_df.write.json("mumbai_trips_json/")

print("Mumbai trips successfully written to mumbai_trips_json/")

Mumbai trips successfully written to mumbai_trips_json/


# Exercise 5

Write only trips from Mumbai to JSON.
Output:
mumbai_trips_json/

In [7]:
from pyspark.sql.functions import col

mumbai_trips_df = df_read.filter(col("city") == "Mumbai")

mumbai_trips_df.write.mode("overwrite").json("mumbai_trips_json/")

print("Mumbai trips successfully written to mumbai_trips_json/ (overwriting existing files if any).")

Mumbai trips successfully written to mumbai_trips_json/ (overwriting existing files if any).


# Exercise 7

Filter JSON data:
payment_mode = "Card"
vehicle_type = "SUV"

In [8]:
from pyspark.sql.functions import col

df_json = spark.read.json("mumbai_trips_json/")

filtered_json_df = df_json.filter((col("payment_mode") == "Card") & (col("vehicle_type") == "SUV"))

filtered_json_df.show()

+------+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+
|  city|distance_km|driver_name|payment_mode|rider_name|trip_duration_minutes|trip_fare|trip_id|trip_status|vehicle_type|
+------+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+
|Mumbai|       23.4|      Vijay|        Card|     Irfan|                   58|      740|   T028|  Completed|         SUV|
|Mumbai|       21.7|      Vijay|        Card|     Sonia|                   54|      690|   T038|  Completed|         SUV|
|Mumbai|       20.5|      Vijay|        Card|     Vikas|                   50|      650|   T009|  Completed|         SUV|
+------+-----------+-----------+------------+----------+---------------------+---------+-------+-----------+------------+



# Exercise 8

Force JSON output into a single partition and observe the output structure.

In [9]:

mumbai_trips_df.repartition(1).write.mode("overwrite").json("mumbai_trips_json_single_partition/")

print("Mumbai trips successfully written to mumbai_trips_json_single_partition/ in a single partition.")

Mumbai trips successfully written to mumbai_trips_json_single_partition/ in a single partition.


# SECTION C — PARQUET

# Exercise 9

Convert full dataset to Parquet.
Output:
trips_parquet/

In [10]:
df.write.parquet("trips_parquet/")

print("Full dataset successfully written to trips_parquet/ in Parquet format.")

Full dataset successfully written to trips_parquet/ in Parquet format.


# Exercise 10

Read Parquet and filter:
trip_duration_minutes > 45

In [11]:
from pyspark.sql.functions import col

df_parquet = spark.read.parquet("trips_parquet/")

filtered_parquet_df = df_parquet.filter(col("trip_duration_minutes").cast("int") > 45)

filtered_parquet_df.show()

+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T028|     Irfan|   Mumbai|      Vijay|         SUV|       23.4|      740|                   58|        Card|  Completed|
|   T035|     Zubin|    Delhi|       Anil|         SUV|       24.0|      760|                   60|        Card|  Completed|
|   T038|     Sonia|   Mumbai|      Vijay|         SUV|       21.7|      690|                   54|        Card|  Completed|
|   T042|     Mohit|Bangalore|     Suresh|         SUV|       19.9|      620|                   49|         UPI|  Completed|
|   T009|     Vikas|   Mumbai|      Vijay|         SUV|       20.5|      650|                   50|        Card|  Completed|


# Exercise 11

Sort Parquet data by distance_km descending and write top 10 trips back to Parquet.

In [12]:
from pyspark.sql.functions import desc

top_10_trips_df = df_parquet.orderBy(desc("distance_km")).limit(10)

top_10_trips_df.write.mode("overwrite").parquet("top_10_trips_parquet/")

print("Top 10 trips by distance_km successfully written to top_10_trips_parquet/.")

top_10_trips_df.show()

Top 10 trips by distance_km successfully written to top_10_trips_parquet/.
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|trip_id|rider_name|     city|driver_name|vehicle_type|distance_km|trip_fare|trip_duration_minutes|payment_mode|trip_status|
+-------+----------+---------+-----------+------------+-----------+---------+---------------------+------------+-----------+
|   T035|     Zubin|    Delhi|       Anil|         SUV|       24.0|      760|                   60|        Card|  Completed|
|   T028|     Irfan|   Mumbai|      Vijay|         SUV|       23.4|      740|                   58|        Card|  Completed|
|   T017|     Rohit|Hyderabad|     Ramesh|         SUV|       22.0|      700|                   55|        Card|  Completed|
|   T038|     Sonia|   Mumbai|      Vijay|         SUV|       21.7|      690|                   54|        Card|  Completed|
|   T022|      Yash|  Chennai|      Kumar|        

# Exercise 12

Compare storage size of:
CSV
JSON
Parquet
Answer which is smallest and why.

In [13]:
import os

csv_dir = "trips_csv/"
csv_size = 0

if os.path.exists(csv_dir) and os.path.isdir(csv_dir):
    for filename in os.listdir(csv_dir):
        filepath = os.path.join(csv_dir, filename)
        if os.path.isfile(filepath):
            csv_size += os.path.getsize(filepath)
    print(f"Total size of files in {csv_dir}: {csv_size} bytes")
else:
    print(f"Directory {csv_dir} does not exist or is not a directory.")

Total size of files in trips_csv/: 2910 bytes


In [14]:
import os

json_dir = "mumbai_trips_json_single_partition/"
json_size = 0

if os.path.exists(json_dir) and os.path.isdir(json_dir):
    for filename in os.listdir(json_dir):
        filepath = os.path.join(json_dir, filename)
        if os.path.isfile(filepath):
            json_size += os.path.getsize(filepath)
    print(f"Total size of files in {json_dir}: {json_size} bytes")
else:
    print(f"Directory {json_dir} does not exist or is not a directory.")

Total size of files in mumbai_trips_json_single_partition/: 1978 bytes


In [15]:
import os

parquet_dir = "trips_parquet/"
parquet_size = 0

if os.path.exists(parquet_dir) and os.path.isdir(parquet_dir):
    for filename in os.listdir(parquet_dir):
        filepath = os.path.join(parquet_dir, filename)
        if os.path.isfile(filepath):
            parquet_size += os.path.getsize(filepath)
    print(f"Total size of files in {parquet_dir}: {parquet_size} bytes")
else:
    print(f"Directory {parquet_dir} does not exist or is not a directory.")

Total size of files in trips_parquet/: 8025 bytes


# SECTION D — FORMAT CONVERSION

# Exercise 13

Convert:
CSV → Parquet
JSON → Parquet

In [16]:
print("Converting CSV to Parquet...")
df_read.write.mode("overwrite").parquet("csv_to_parquet/")
print("CSV data successfully converted to Parquet in 'csv_to_parquet/'.")

print("Converting JSON to Parquet...")
df_json.write.mode("overwrite").parquet("json_to_parquet/")
print("JSON data successfully converted to Parquet in 'json_to_parquet/'.")

Converting CSV to Parquet...
CSV data successfully converted to Parquet in 'csv_to_parquet/'.
Converting JSON to Parquet...
JSON data successfully converted to Parquet in 'json_to_parquet/'.


# Exercise 14

Read Parquet and write it back as CSV with header and delimiter ,

In [17]:
df_parquet.write.mode("overwrite").option("header", "true").option("sep", ",").csv("parquet_to_csv/")

print("Parquet data successfully written back to CSV in 'parquet_to_csv/'.")

Parquet data successfully written back to CSV in 'parquet_to_csv/'.


# ANALYTICS THINKING QUESTIONS

# Exercise 15

Which city generates the highest total trip_fare?

In [18]:
from pyspark.sql.functions import col, sum

total_fare_by_city = df_read.groupBy("city")\
                             .agg(sum(col("trip_fare").cast("int")).alias("total_trip_fare"))

total_fare_by_city.orderBy(col("total_trip_fare").desc()).show(1)

+------+---------------+
|  city|total_trip_fare|
+------+---------------+
|Mumbai|           4160|
+------+---------------+
only showing top 1 row


# Exercise 16

Which vehicle_type has the highest average fare?

In [19]:
from pyspark.sql.functions import col, avg

average_fare_by_vehicle = df_read.groupBy("vehicle_type")\
                                    .agg(avg(col("trip_fare").cast("double")).alias("average_trip_fare"))

average_fare_by_vehicle.orderBy(col("average_trip_fare").desc()).show(1)

+------------+-----------------+
|vehicle_type|average_trip_fare|
+------------+-----------------+
|         SUV|667.7777777777778|
+------------+-----------------+
only showing top 1 row


# Exercise 17

Which driver has completed the most trips?

In [20]:
from pyspark.sql.functions import col, count

completed_trips_by_driver = df_read.filter(col("trip_status") == "Completed")\
                                     .groupBy("driver_name")\
                                     .agg(count("trip_id").alias("completed_trips_count"))

completed_trips_by_driver.orderBy(col("completed_trips_count").desc()).show(1)

+-----------+---------------------+
|driver_name|completed_trips_count|
+-----------+---------------------+
|       Anil|                   11|
+-----------+---------------------+
only showing top 1 row


# Exercise 18

Why is Parquet preferred for analytics dashboards and aggregations?

Parquet is preferred for analytics dashboards and aggregations for several key reasons, primarily stemming from its columnar storage format and optimized design for big data processing:

Columnar Storage: Unlike row-oriented formats (like CSV or JSON), Parquet stores data column by column. This means that all values for a specific column are stored together. For analytical queries that often select only a subset of columns (e.g., SUM(trip_fare)), this is highly efficient as only the required columns need to be read from disk, significantly reducing I/O operations.

Efficient Compression and Encoding:

Better Compression: Columnar storage allows for more effective compression because data within a single column is typically of the same data type and often has similar values. This homogeneity leads to higher compression ratios, resulting in smaller file sizes on disk.
Various Encoding Schemes: Parquet supports different encoding schemes (e.g., Run Length Encoding, Dictionary Encoding) that are chosen based on the data type and distribution within each column, further optimizing storage and retrieval.
Predicate Pushdown (Filter Pushdown): Parquet files store metadata about each column, including statistics like min/max values. Query engines (like Spark) can use this metadata to skip reading entire blocks or files that do not contain data relevant to a query's filters. This significantly speeds up query execution, especially for large datasets.

Schema Evolution: Parquet supports schema evolution, allowing users to add new columns or modify existing ones without rewriting the entire dataset. This flexibility is crucial in dynamic data environments.

Optimized for Analytical Workloads: Aggregations (like SUM, AVG, COUNT) and scans are inherently faster on columnar data. When performing an aggregation on a column, the engine can quickly access all values for that column without having to read through irrelevant data from other columns.

Binary Format: While not human-readable (unlike CSV or JSON), the binary nature of Parquet makes it highly efficient for machine processing and serialization/deserialization.

Interoperability: Parquet is widely supported across various data processing frameworks and tools (e.g., Apache Spark, Apache Hive, Apache Impala), making it a versatile choice for a diverse ecosystem.

In summary, Parquet's columnar nature, combined with advanced compression, encoding, and optimization techniques, makes it ideal for handling large-scale analytical workloads, enabling faster query performance and reduced storage costs compared to row-oriented or text-based formats.



# OPTIONAL CHALLENGE

# Challenge 1
Repartition the dataset into 4 partitions and write to Parquet.

In [21]:
df.repartition(4).write.mode("overwrite").parquet("repartitioned_trips_parquet/")

print("Dataset successfully repartitioned into 4 partitions and written to 'repartitioned_trips_parquet/'.")

Dataset successfully repartitioned into 4 partitions and written to 'repartitioned_trips_parquet/'.


# Challenge 2
Create a summary dataset with:
city
total_trips
total_revenue

average_trip_duration
Write it to Parquet.

In [22]:
from pyspark.sql.functions import col, count, sum, avg

summary_df = df_read.groupBy("city").agg(
    count("trip_id").alias("total_trips"),
    sum(col("trip_fare").cast("double")).alias("total_revenue"),
    avg(col("trip_duration_minutes").cast("double")).alias("average_trip_duration")
)

summary_df.write.mode("overwrite").parquet("summary_trips_parquet/")

print("Summary dataset successfully created and written to 'summary_trips_parquet/'.")

summary_df.show()

Summary dataset successfully created and written to 'summary_trips_parquet/'.
+---------+-----------+-------------+---------------------+
|     city|total_trips|total_revenue|average_trip_duration|
+---------+-----------+-------------+---------------------+
|Bangalore|          9|       2590.0|    27.11111111111111|
|  Chennai|          8|       2240.0|                 26.0|
|   Mumbai|          9|       4160.0|   38.888888888888886|
|    Delhi|         11|       3035.0|   26.636363636363637|
|Hyderabad|          9|       3310.0|    32.22222222222222|
+---------+-----------+-------------+---------------------+

