In [0]:
from datetime import datetime
from pyspark.sql import Row
web_data = [
Row(UserID=1, Page="Home", Timestamp="2024-04-10 10:00:00", Duration=35,
Device="Mobile", Country="India"),
Row(UserID=2, Page="Products", Timestamp="2024-04-10 10:02:00", Duration=120,
Device="Desktop", Country="USA"),
Row(UserID=3, Page="Cart", Timestamp="2024-04-10 10:05:00", Duration=45,
Device="Tablet", Country="UK"),
Row(UserID=1, Page="Checkout", Timestamp="2024-04-10 10:08:00", Duration=60,
Device="Mobile", Country="India"),
Row(UserID=4, Page="Home", Timestamp="2024-04-10 10:10:00", Duration=15,
Device="Mobile", Country="Canada"),
Row(UserID=2, Page="Contact", Timestamp="2024-04-10 10:15:00", Duration=25,
Device="Desktop", Country="USA"),
Row(UserID=5, Page="Products", Timestamp="2024-04-10 10:20:00", Duration=90,
Device="Desktop", Country="India"),
]
df_web = spark.createDataFrame(web_data)
df_web.show(truncate=False)

+------+--------+-------------------+--------+-------+-------+
|UserID|Page    |Timestamp          |Duration|Device |Country|
+------+--------+-------------------+--------+-------+-------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |
+------+--------+-------------------+--------+-------+-------+



In [0]:
# 1. Display the schema of web_traffic_data .
df_web.printSchema()


root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)



In [0]:

# 2. Convert the Timestamp column to a proper timestamp type.

from pyspark.sql.functions import to_timestamp

df_web = df_web.withColumn("Timestamp", to_timestamp("Timestamp", "yyyy-MM-dd HH:mm:ss"))


In [0]:
# 3. Add a new column SessionMinute by extracting the minute from the Timestamp.
from pyspark.sql.functions import minute
df_web = df_web.withColumn("SessionMinute", minute("Timestamp"))
df_web.show(truncate=False)


+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|Page    |Timestamp          |Duration|Device |Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |0            |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |2            |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |5            |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |8            |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |10           |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |15           |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |20           |
+------+--------+-------------------+--------+-------+-------+-------------+



In [0]:
# Filtering and Conditions
# 4. Filter users who used a "Mobile" device and visited the "Checkout" page.
df_web.filter((df_web.Device == "Mobile") & (df_web.Page == "Checkout")).show(truncate=False)

# 5. Show all entries with a Duration greater than 60 seconds.
df_web.filter(df_web.Duration > 60).show(truncate=False)

# 6. Find all users from India who visited the "Products" page.
df_web.filter((df_web.Country == "India") & (df_web.Page == "Products")).show(truncate=False)

+------+--------+-------------------+--------+------+-------+-------------+
|UserID|Page    |Timestamp          |Duration|Device|Country|SessionMinute|
+------+--------+-------------------+--------+------+-------+-------------+
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile|India  |8            |
+------+--------+-------------------+--------+------+-------+-------------+

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|Page    |Timestamp          |Duration|Device |Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |2            |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |20           |
+------+--------+-------------------+--------+-------+-------+-------------+

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|Page    |Timestamp          |Duration|Device |Country|SessionMinute|
+-

In [0]:
# Aggregation and Grouping
# 7. Get the average duration per device type.
from pyspark.sql.functions import avg

df_web.groupBy("Device").agg(avg("Duration").alias("AvgDuration")).show(truncate=False)

# 8. Count the number of sessions per country.
df_web.groupBy("Country").count().withColumnRenamed("count", "SessionCount").show(truncate=False)

# 9. Find the most visited page overall.

from pyspark.sql.functions import desc

df_web.groupBy("Page").count().orderBy(desc("count")).limit(1).show(truncate=False)


+-------+------------------+
|Device |AvgDuration       |
+-------+------------------+
|Mobile |36.666666666666664|
|Tablet |45.0              |
|Desktop|78.33333333333333 |
+-------+------------------+

+-------+------------+
|Country|SessionCount|
+-------+------------+
|India  |3           |
|USA    |2           |
|UK     |1           |
|Canada |1           |
+-------+------------+

+----+-----+
|Page|count|
+----+-----+
|Home|2    |
+----+-----+



In [0]:
# Window Functions
# 10. Rank each user’s pages by timestamp (oldest to newest).
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

window_spec = Window.partitionBy("UserID").orderBy("Timestamp")
df_web.withColumn("PageRank", rank().over(window_spec)).show(truncate=False)

# 11. Find the total duration of all sessions per user using groupBy .
from pyspark.sql.functions import sum as _sum

df_web.groupBy("UserID").agg(_sum("Duration").alias("TotalDuration")).show(truncate=False)


+------+--------+-------------------+--------+-------+-------+-------------+--------+
|UserID|Page    |Timestamp          |Duration|Device |Country|SessionMinute|PageRank|
+------+--------+-------------------+--------+-------+-------+-------------+--------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |0            |1       |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |8            |2       |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |2            |1       |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |15           |2       |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |5            |1       |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |10           |1       |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |20           |1       |
+------+--------+-------------------+--------+-------+-------+-------------+--------+

+------+-------------+
|UserID|TotalDuration|
+------

In [0]:
# Spark SQL Tasks
# 12. Create a temporary view called traffic_view .
df_web.createOrReplaceTempView("traffic_view")

# 13. Write a SQL query to get the top 2 longest sessions by duration.
spark.sql("""
    SELECT * FROM traffic_view
    ORDER BY Duration DESC
    LIMIT 2
""").show(truncate=False)

# 14. Get the number of unique users per page using SQL.
spark.sql("""
    SELECT Page, COUNT(DISTINCT UserID) AS UniqueUsers
    FROM traffic_view
    GROUP BY Page
""").show(truncate=False)


+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|Page    |Timestamp          |Duration|Device |Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |2            |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |20           |
+------+--------+-------------------+--------+-------+-------+-------------+

+--------+-----------+
|Page    |UniqueUsers|
+--------+-----------+
|Cart    |1          |
|Home    |2          |
|Checkout|1          |
|Products|2          |
|Contact |1          |
+--------+-----------+



In [0]:
# Export & Save
# 15. Save the final DataFrame to CSV.
df_web.write.mode("overwrite").option("header", True).csv("output/web_traffic_csv")

# 16. Save partitioned by Country in Parquet format.
df_web.write.mode("overwrite").partitionBy("Country").parquet("output/web_traffic_parquet")
