# Most Visited Floor

In [0]:
-- Point the session at the `workspace` catalog.
USE CATALOG workspace;

-- Make sure the practice schema is present (no-op when it already exists).
CREATE SCHEMA IF NOT EXISTS sql_pyspark_practice;

-- Make it the current schema so unqualified table names resolve inside it.
USE SCHEMA sql_pyspark_practice;

In [0]:
-- Sample data for the "most visited floor" exercise: one row per entry
-- (visit) made by a person, with the floor visited and the resource used.
create or replace table entries (
  name varchar(20),
  address varchar(20),
  email varchar(20),
  floor int,
  resources varchar(10)
);

-- Explicit column list so the values stay bound to the right columns even if
-- the table definition is later reordered or extended.
insert into entries (name, address, email, floor, resources)
values
  ('A', 'Bangalore', 'A@gmail.com', 1, 'CPU'),
  ('A', 'Bangalore', 'A1@gmail.com', 1, 'CPU'),
  ('A', 'Bangalore', 'A2@gmail.com', 2, 'DESKTOP'),
  ('B', 'Bangalore', 'B@gmail.com', 2, 'DESKTOP'),
  ('B', 'Bangalore', 'B1@gmail.com', 2, 'DESKTOP'),
  ('B', 'Bangalore', 'B2@gmail.com', 1, 'MONITOR');

-- Inspect the loaded rows.
select
  name,
  address,
  email,
  floor,
  resources
from entries;

### Steps
- Count the floor visits by grouping it by Name and floor and counting all the rows in each group
- Upon counting it we can rank it based on the count 
- This will give us the floor visit counts
- Then the total visits are calculated
- Group by name and count all the rows this gives the total visits
- `string_agg` concatenates the values of all rows in each group; `distinct` filters out the duplicate values
- Converting these queries into CTEs and combining them in a single final query gives the required output table

In [0]:
%python
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Assuming 'entries' is your source table
df = spark.table("entries")

# -------------------------------
# Step 1️⃣ Floor visit counts + rank
# -------------------------------
window_rank = Window.partitionBy("name").orderBy(F.desc("floor_visits"))

floor_visit = (
    df.groupBy("name", "floor")
      .agg(F.count("*").alias("floor_visits"))
      .withColumn("rn", F.rank().over(window_rank))
)

# -------------------------------
# Step 2️⃣ Total visits + distinct resources
# -------------------------------
total_visits = (
    df.groupBy("name")
      .agg(
          F.count("*").alias("total_visits"),
          F.concat_ws(",", F.collect_set("resources")).alias("resources_used")
      )
)

# -------------------------------
# Step 3️⃣ Join and filter for most visited floor
# -------------------------------
result = (
    floor_visit.join(total_visits, on="name", how="inner")
               .filter(F.col("rn") == 1)
               .select(
                   "name",
                   F.col("floor").alias("most_visited_floor"),
                   "total_visits",
                   "resources_used"
               )
)

# -------------------------------
# Step 4️⃣ Show or display the final result
# -------------------------------
display(result)


In [0]:
-- Most visited floor per person, with total visit count and the distinct
-- resources used.
with floor_visit as (
  -- Visits per (name, floor), ranked within each name by visit count.
  -- rank() gives tied floors the same rank, so a name whose top floors are
  -- tied yields one output row per tied floor.
  select
    name,
    floor,
    count(*) as floor_visits,
    rank() over (partition by name order by count(*) desc) as rn
  from entries
  group by name, floor
),
total_visits as (
  -- Total visits per name and a comma-separated list of distinct resources.
  -- The within group ordering makes the list deterministic.
  select
    name,
    count(*) as total_visits,
    string_agg(distinct resources, ',') within group (order by resources) as resources_used
  from entries
  group by name
)

select
  fv.name,
  fv.floor as most_visited_floor,
  tv.total_visits,
  tv.resources_used
from floor_visit as fv
inner join total_visits as tv
  on fv.name = tv.name
where fv.rn = 1
order by fv.name, fv.floor;

