In [None]:
%pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import desc, rank
from pyspark.sql.window import Window

# Per (city, business_id) aggregates over joined_df:
#   rating_count  - rows in the group with a non-null business_id
#   avg_rating    - mean of the 'stars' column
#   checkin_count - rows in the group with a non-null 'checkin_dates'
# The aggregates are referenced through the F namespace so this paragraph
# runs even if no earlier cell imported count/avg into the notebook scope.
# NOTE(review): joined_df is built elsewhere; if the join fans out rows
# (e.g. one row per review x check-in) these counts are inflated - confirm.
grouped_df = joined_df.groupby('city', 'business_id').agg(
    F.count('business_id').alias('rating_count'),
    F.avg('stars').alias('avg_rating'),
    F.count('checkin_dates').alias('checkin_count'),
)

# Rank businesses inside each city: most ratings first, then highest
# average rating, then most check-ins as the final tie-breaker.
windowSpec = Window.partitionBy('city').orderBy(
    desc('rating_count'),
    desc('avg_rating'),
    desc('checkin_count'),
)

ranked_df = grouped_df.withColumn('rank', rank().over(windowSpec))

# Keep ranks 1..5 per city. rank() gives tied rows the same rank, so a city
# can contribute more than five rows when there is a tie at the cutoff.
top_5_df = ranked_df.filter(ranked_df['rank'] <= 5)

# Render the result as a Zeppelin table.
z.show(top_5_df)

%pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import desc, rank
from pyspark.sql.window import Window

# Per (city, business_id) aggregates over joined_df:
#   avg_rating         - mean of the 'stars' column
#   total_review_count - sum of the 'review_count' column
# F.sum / F.avg are used explicitly: a bare sum('review_count') resolves to
# Python's builtin sum() unless an earlier cell shadowed it, and the builtin
# raises TypeError when handed a column name string.
# NOTE(review): if review_count is already a per-business total repeated on
# every joined row, summing it over-counts - confirm against joined_df.
grouped_df = joined_df.groupby('city', 'business_id').agg(
    F.avg('stars').alias('avg_rating'),
    F.sum('review_count').alias('total_review_count'),
)

# Rank businesses inside each city: highest review total first, with the
# average rating breaking ties.
windowSpec = Window.partitionBy('city').orderBy(
    desc('total_review_count'),
    desc('avg_rating'),
)

ranked_df = grouped_df.withColumn('rank', rank().over(windowSpec))

# Keep ranks 1..5 per city. rank() gives tied rows the same rank, so a city
# can contribute more than five rows when there is a tie at the cutoff.
top_5_df = ranked_df.filter(ranked_df['rank'] <= 5)

# Render the result as a Zeppelin table.
z.show(top_5_df)


%pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import desc, rank
from pyspark.sql.window import Window

# Per (city, business_id, name) aggregates over joined_df; 'name' is part of
# the grouping key so it is carried through to the displayed result.
#   avg_rating         - mean of the 'stars' column
#   total_review_count - sum of the 'review_count' column
# F.sum / F.avg are used explicitly: a bare sum('review_count') resolves to
# Python's builtin sum() unless an earlier cell shadowed it, and the builtin
# raises TypeError when handed a column name string.
# NOTE(review): if review_count is already a per-business total repeated on
# every joined row, summing it over-counts - confirm against joined_df.
grouped_df = joined_df.groupby('city', 'business_id', 'name').agg(
    F.avg('stars').alias('avg_rating'),
    F.sum('review_count').alias('total_review_count'),
)

# Rank businesses inside each city: highest review total first, with the
# average rating breaking ties.
windowSpec = Window.partitionBy('city').orderBy(
    desc('total_review_count'),
    desc('avg_rating'),
)

ranked_df = grouped_df.withColumn('rank', rank().over(windowSpec))

# Keep ranks 1..5 per city. rank() gives tied rows the same rank, so a city
# can contribute more than five rows when there is a tie at the cutoff.
top_5_df = ranked_df.filter(ranked_df['rank'] <= 5)

# Render the result as a Zeppelin table. The former Truncate=False keyword
# was dropped: it is not a z.show option (DataFrame.show spells it
# lowercase 'truncate'), so it was either ignored or rejected with a
# TypeError depending on the Zeppelin version.
z.show(top_5_df)

