In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Set-2").getOrCreate()
spark 

In [0]:
from datetime import datetime
from pyspark.sql import Row

web_data = [
    Row(UserID=1, Page="Home", Timestamp="2024-04-10 10:00:00", Duration=35,
Device="Mobile", Country="India"),
    Row(UserID=2, Page="Products", Timestamp="2024-04-10 10:02:00", Duration=120,
Device="Desktop", Country="USA"),
    Row(UserID=3, Page="Cart", Timestamp="2024-04-10 10:05:00", Duration=45,
Device="Tablet", Country="UK"),
    Row(UserID=1, Page="Checkout", Timestamp="2024-04-10 10:08:00", Duration=60,
Device="Mobile", Country="India"),
    Row(UserID=4, Page="Home", Timestamp="2024-04-10 10:10:00", Duration=15,
Device="Mobile", Country="Canada"),
    Row(UserID=2, Page="Contact", Timestamp="2024-04-10 10:15:00", Duration=25,
Device="Desktop", Country="USA"),
    Row(UserID=5, Page="Products", Timestamp="2024-04-10 10:20:00", Duration=90,
Device="Desktop", Country="India"),
]
df_web = spark.createDataFrame(web_data)
df_web.show(truncate=False)

+------+--------+-------------------+--------+-------+-------+
|UserID|Page    |Timestamp          |Duration|Device |Country|
+------+--------+-------------------+--------+-------+-------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |
+------+--------+-------------------+--------+-------+-------+



**Data Exploration & Preparation**

In [0]:
# 1. Display the schema of web_traffic_data .

df_web.printSchema()

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# 2. Convert the Timestamp column to a proper timestamp type.

from pyspark.sql.functions import col, to_timestamp

df_web = df_web.withColumn("Timestamp",to_timestamp("Timestamp","yyyy-MM-dd HH:mm:ss"))
df_web.show()

+------+--------+-------------------+--------+-------+-------+
|UserID|    Page|          Timestamp|Duration| Device|Country|
+------+--------+-------------------+--------+-------+-------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|
|     4|    Home|2024-04-10 10:10:00|      15| Mobile| Canada|
|     2| Contact|2024-04-10 10:15:00|      25|Desktop|    USA|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|
+------+--------+-------------------+--------+-------+-------+



In [0]:
# 3. Add a new column SessionMinute by extracting the minute from the Timestamp .
from pyspark.sql.functions import minute

df_web = df_web.withColumn("SessionMinute",minute("Timestamp"))
df_web.show()

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|            0|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|            5|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|            8|
|     4|    Home|2024-04-10 10:10:00|      15| Mobile| Canada|           10|
|     2| Contact|2024-04-10 10:15:00|      25|Desktop|    USA|           15|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+



**Filtering and Conditions**

In [0]:
# 4. Filter users who used a "Mobile" device and visited the "Checkout" page.

df_web.filter((df_web.Device == "Mobile") & (df_web.Page == "Checkout")).show()

+------+--------+-------------------+--------+------+-------+-------------+
|UserID|    Page|          Timestamp|Duration|Device|Country|SessionMinute|
+------+--------+-------------------+--------+------+-------+-------------+
|     1|Checkout|2024-04-10 10:08:00|      60|Mobile|  India|            8|
+------+--------+-------------------+--------+------+-------+-------------+



In [0]:
# 5. Show all entries with a Duration greater than 60 seconds.

df_web.filter(df_web.Duration > 60).show()

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+



In [0]:
# 6. Find all users from India who visited the "Products" page.

df_web.filter((df_web.Country == "India") & (df_web.Page == "Products")).show()

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+



**Aggregation and Grouping**

In [0]:
# 7. Get the average duration per device type.
from pyspark.sql.functions import avg

df_web.groupBy("Device").agg(avg("Duration")).show()

+-------+------------------+
| Device|     avg(Duration)|
+-------+------------------+
| Mobile|36.666666666666664|
| Tablet|              45.0|
|Desktop| 78.33333333333333|
+-------+------------------+



In [0]:
# 8. Count the number of sessions per country.
from pyspark.sql.functions import count

df_web.groupBy("Country").agg(count("UserId")).show()

+-------+-------------+
|Country|count(UserId)|
+-------+-------------+
|  India|            3|
|    USA|            2|
|     UK|            1|
| Canada|            1|
+-------+-------------+



In [0]:
# 9. Find the most visited page overall.
from pyspark.sql.functions import max

df_web.groupBy("Page").agg(max("Duration")).show()

+--------+-------------+
|    Page|max(Duration)|
+--------+-------------+
|    Home|           35|
|    Cart|           45|
|Products|          120|
|Checkout|           60|
| Contact|           25|
+--------+-------------+



**Window Functions**

In [0]:
# 10. Rank each user’s pages by timestamp (oldest to newest).

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

user_window = Window.partitionBy("UserID").orderBy("Timestamp")
df_web = df_web.withColumn("rank",row_number().over(user_window))
df_ranked = df_web.withColumn("Ranked",row_number().over(user_window))
df_ranked.show()

+------+--------+-------------------+--------+-------+-------+-------------+----+------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|rank|Ranked|
+------+--------+-------------------+--------+-------+-------+-------------+----+------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|            0|   1|     1|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|            8|   2|     2|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|   1|     1|
|     2| Contact|2024-04-10 10:15:00|      25|Desktop|    USA|           15|   2|     2|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|            5|   1|     1|
|     4|    Home|2024-04-10 10:10:00|      15| Mobile| Canada|           10|   1|     1|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|   1|     1|
+------+--------+-------------------+--------+-------+-------+-------------+----+------+



In [0]:
# 11. Find the total duration of all sessions per user using groupBy .
from pyspark.sql.functions import sum

df_ranked.groupBy("UserID").agg(sum("Duration")).show()

+------+-------------+
|UserID|sum(Duration)|
+------+-------------+
|     1|           95|
|     3|           45|
|     2|          145|
|     4|           15|
|     5|           90|
+------+-------------+



**Spark SQL Tasks**

In [0]:
# 12. Create a temporary view called traffic_view .

df_ranked.createOrReplaceTempView("traffic_view")

In [0]:
# 13. Write a SQL query to get the top 2 longest sessions by duration.

spark.sql("select * from traffic_view order by Duration desc limit 2").show()

+------+--------+-------------------+--------+-------+-------+-------------+----+------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|rank|Ranked|
+------+--------+-------------------+--------+-------+-------+-------------+----+------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|   1|     1|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|   1|     1|
+------+--------+-------------------+--------+-------+-------+-------------+----+------+



In [0]:
# 14. Get the number of unique users per page using SQL.

spark.sql("select page,count(distinct UserID) as uniqueuser from traffic_view group by page").show()

+--------+----------+
|    page|uniqueuser|
+--------+----------+
|    Cart|         1|
|    Home|         2|
|Checkout|         1|
|Products|         2|
| Contact|         1|
+--------+----------+



**Export & Save**

In [0]:
# 15. Save the final DataFrame to CSV.
df_web.write.mode("overwrite").option("header", True).csv("/dbfs/FileStore/shared/converted_csv_output")

In [0]:
# 16. Save partitioned by Country in Parquet format.
df_web.write .mode("overwrite").partitionBy("Country").parquet("/dbfs/FileStore/shared/partitioned_parquet_output")