In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import date

In [2]:
spark = (SparkSession
         .builder
         .master("local[*]")
         .appName("user_activity")
         .getOrCreate()
        )

Создание и проверка структуры DataFrame

In [23]:
#Набор данных
data = [
    (101, date(2025, 1, 1), {"mobile": 3, "desktop": 1}, ["/home", "/products", "/cart"], 4.5),
    (102, date(2025, 1, 1), {"desktop": 2}, ["/home", "/about"], 3.0),
    (101, date(2025, 1, 2), {"mobile": 2}, ["/products", "/checkout"], None),
    (103, date(2025, 1, 2), {"tablet": 1, "mobile": 1}, ["/blog", "/contact"], 5.0), 
    (104, date(2025, 1, 3), {"desktop": 4}, ["/dashboard"], 3.5),
    (101, date(2025, 1, 3), {"mobile": 1, "desktop": 1}, ["/home", "/products"], 4.0),
    (105, date(2025, 1, 4), {"mobile": 5}, ["/faq"], None),
    (102, date(2025, 1, 4), {"desktop": 1, "mobile": 1}, ["/settings"], 3.8),
    (103, date(2025, 1, 5), {"tablet": 2}, ["/products"], 4.2), 
    (106, date(2025, 1, 5), {"desktop": 3, "mobile": 2}, ["/login", "/profile", "/home"], 4.7),
    (101, date(2025, 1, 6), {"mobile": 1}, ["/cart", "/checkout"], 4.0),
    (104, date(2025, 1, 6), {"desktop": 2, "tablet": 1}, ["/contact"], None),
    (105, date(2025, 1, 7), {"mobile": 3, "desktop": 1}, ["/pricing"], 4.1),
    (106, date(2025, 1, 7), {"desktop": 1}, ["/home", "/about"], 3.9),
    (107, date(2025, 1, 8), {"mobile": 4, "tablet": 2}, ["/products", "/blog"], 4.9) 
]

In [24]:
#Создание схемы для DataFrame
user_activity_schema=StructType([StructField("user_id", IntegerType() ,True),
                                   StructField("activity_date", DateType() ,True),
                                   StructField("session_by_device", MapType(StringType(), IntegerType()) ,True),
                                   StructField("visited_pages", ArrayType(StringType()) ,True),
                                   StructField("usability_rating", DoubleType() ,True)
                                  ])

In [25]:
#Создаем DataFrame
df_user_activity = spark.createDataFrame(data, schema=user_activity_schema)

In [26]:
#Показать 3 строчки
df_user_activity.show(3)

+-------+-------------+--------------------+--------------------+----------------+
|user_id|activity_date|   session_by_device|       visited_pages|usability_rating|
+-------+-------------+--------------------+--------------------+----------------+
|    101|   2025-01-01|{mobile -> 3, des...|[/home, /products...|             4.5|
|    102|   2025-01-01|      {desktop -> 2}|     [/home, /about]|             3.0|
|    101|   2025-01-02|       {mobile -> 2}|[/products, /chec...|            NULL|
+-------+-------------+--------------------+--------------------+----------------+
only showing top 3 rows



In [27]:
#Схема DataFrame
df_user_activity.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- activity_date: date (nullable = true)
 |-- session_by_device: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)
 |-- visited_pages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- usability_rating: double (nullable = true)



Расчет построчных показателей

In [28]:
df_user_activity = (df_user_activity.withColumn("total_sessions_count",
    (F.coalesce(F.col("session_by_device").mobile, F.lit(0)) +
    F.coalesce(F.col("session_by_device").desktop, F.lit(0)) +
    F.coalesce(F.col("session_by_device").tablet, F.lit(0))))
)
df_user_activity.show()

+-------+-------------+--------------------+--------------------+----------------+--------------------+
|user_id|activity_date|   session_by_device|       visited_pages|usability_rating|total_sessions_count|
+-------+-------------+--------------------+--------------------+----------------+--------------------+
|    101|   2025-01-01|{mobile -> 3, des...|[/home, /products...|             4.5|                   4|
|    102|   2025-01-01|      {desktop -> 2}|     [/home, /about]|             3.0|                   2|
|    101|   2025-01-02|       {mobile -> 2}|[/products, /chec...|            NULL|                   2|
|    103|   2025-01-02|{mobile -> 1, tab...|   [/blog, /contact]|             5.0|                   2|
|    104|   2025-01-03|      {desktop -> 4}|        [/dashboard]|             3.5|                   4|
|    101|   2025-01-03|{mobile -> 1, des...|  [/home, /products]|             4.0|                   2|
|    105|   2025-01-04|       {mobile -> 5}|              [/faq]

In [31]:
df_user_activity=df_user_activity.withColumn(
    "mobile_sessions", 
    F.coalesce(F.col("session_by_device").mobile, F.lit(0))
)

df_user_activity.show()

+-------+-------------+--------------------+--------------------+----------------+--------------------+---------------+
|user_id|activity_date|   session_by_device|       visited_pages|usability_rating|total_sessions_count|mobile_sessions|
+-------+-------------+--------------------+--------------------+----------------+--------------------+---------------+
|    101|   2025-01-01|{mobile -> 3, des...|[/home, /products...|             4.5|                   4|              3|
|    102|   2025-01-01|      {desktop -> 2}|     [/home, /about]|             3.0|                   2|              0|
|    101|   2025-01-02|       {mobile -> 2}|[/products, /chec...|            NULL|                   2|              2|
|    103|   2025-01-02|{mobile -> 1, tab...|   [/blog, /contact]|             5.0|                   2|              1|
|    104|   2025-01-03|      {desktop -> 4}|        [/dashboard]|             3.5|                   4|              0|
|    101|   2025-01-03|{mobile -> 1, des

In [39]:
total_sessions_all_time=(df_user_activity.select("user_id", "total_sessions_count")
                         .groupBy("user_id")
                         .agg(F.sum("total_sessions_count").alias("total_sessions_all_time"))
                         .orderBy(F.desc("total_sessions_all_time"))

                        )
total_sessions_all_time.show()

+-------+-----------------------+
|user_id|total_sessions_all_time|
+-------+-----------------------+
|    101|                      9|
|    105|                      9|
|    104|                      7|
|    106|                      6|
|    107|                      6|
|    102|                      4|
|    103|                      4|
+-------+-----------------------+



In [56]:
unique_visited_pages_all_time=(df_user_activity.select("user_id", F.explode_outer("visited_pages").alias("page"))
                               .groupBy("user_id")
                               .agg(F.collect_set("page").alias("unique_visited_pages_all_time"))
                               .orderBy("user_id")
                              )

unique_visited_pages_all_time.show()

+-------+-----------------------------+
|user_id|unique_visited_pages_all_time|
+-------+-----------------------------+
|    101|         [/products, /chec...|
|    102|         [/about, /setting...|
|    103|         [/products, /cont...|
|    104|         [/dashboard, /con...|
|    105|             [/pricing, /faq]|
|    106|         [/profile, /about...|
|    107|           [/products, /blog]|
+-------+-----------------------------+



In [61]:
df_user_activity=df_user_activity.where(F.col("usability_rating") > 3.5).orderBy(F.desc("usability_rating"))

df_user_activity.show()

+-------+-------------+--------------------+--------------------+----------------+--------------------+---------------+
|user_id|activity_date|   session_by_device|       visited_pages|usability_rating|total_sessions_count|mobile_sessions|
+-------+-------------+--------------------+--------------------+----------------+--------------------+---------------+
|    103|   2025-01-02|{mobile -> 1, tab...|   [/blog, /contact]|             5.0|                   2|              1|
|    107|   2025-01-08|{mobile -> 4, tab...|  [/products, /blog]|             4.9|                   6|              4|
|    106|   2025-01-05|{mobile -> 2, des...|[/login, /profile...|             4.7|                   5|              2|
|    101|   2025-01-01|{mobile -> 3, des...|[/home, /products...|             4.5|                   4|              3|
|    103|   2025-01-05|       {tablet -> 2}|         [/products]|             4.2|                   2|              0|
|    105|   2025-01-07|{mobile -> 3, des

In [62]:
spark.stop()