# Data Wrangling with DataFrames Coding Quiz

Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import asc, desc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import count as Fcount
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType

from src.spark_lakehouse import get_spark_session
# Relative path of the small Sparkify JSON log that this notebook analyses.
path = "data/sparkify_log_small.json"

# Obtain the Spark session from the project helper `get_spark_session`.
spark = get_spark_session("Wrangling Data Quiz")
# NOTE(review): bare attribute access whose value is discarded (it is not the
# last expression of the cell) -- confirm whether showing the config was intended.
spark.conf.getAll

# Read the JSON log into a DataFrame; the later cells all start from `user_log`.
user_log = spark.read.json(path)

# Question 1

Which page did user id "" (empty string) NOT visit?

**ANSWER:** The empty-string user id did NOT visit any of the following pages:
- Submit Upgrade
- Upgrade
- Submit Downgrade
- Error
- Downgrade
- Settings
- Save Settings
- Logout
- NextSong

In [None]:
# Collect every distinct page name that occurs anywhere in the log.
all_pages = user_log.select("page").distinct().collect()
print("All pages in the dataset:")
# Sort before printing so the listing does not depend on the order in which
# Spark happens to return the distinct rows.
print("\n".join(sorted(row.page for row in all_pages)))

# Distinct pages recorded for the empty-string user id.
visited_pages = user_log.where(user_log.userId == "").select("page").distinct().collect()

# Set difference of the collected rows: pages present in the log that the
# empty-string user never hit.
unvisited_pages = set(all_pages) - set(visited_pages)
print("\nPages not visited by the empty string user id:")
# Set iteration order is arbitrary; sort so the answer prints reproducibly.
print("\n".join(sorted(row.page for row in unvisited_pages)))

# Question 2 - Reflect

What type of user does the empty string user id most likely refer to?

**ANSWER:** The empty string user id most likely refers to a user who is not logged in or a guest user.

In [None]:
# Restrict to the events whose userId is the empty string and keep a few
# columns that help characterise that user.
blank_user_rows = user_log.where(user_log.userId == "").select(
    "userId", "firstname", "page", "song"
)

# toLocalIterator() streams the rows instead of collecting everything at once,
# which lowers the risk of running out of memory on the driver.
for row in blank_user_rows.toLocalIterator():
    print(row)

# Question 3

How many female users do we have in the data set?

**ANSWER:** 462

In [None]:
# Keep only events whose gender is "F", then reduce to the unique user ids.
female_user_ids = user_log.where(user_log.gender == "F").select("userId").distinct()

# One row per distinct user id, so the row count is the number of female users.
n_female_users = female_user_ids.count()
print(f"Number of female users: {n_female_users}")

# Question 4

How many songs were played by the most played artist?

In [None]:
# Count "NextSong" events per artist.
plays_per_artist = user_log.filter(user_log.page == "NextSong").groupBy("artist").count()

# Take the row with the highest play count. Despite its name, this variable
# holds the whole top row (artist and count), not just the number.
n_songs_most_played_artist = plays_per_artist.orderBy(desc("count")).first()
print(f"Most played artist: {n_songs_most_played_artist.artist} with {n_songs_most_played_artist['count']} plays")

# Question 5 (challenge)

How many songs do users listen to, on average, between visits to our home page? Please round your answer to the closest integer.



In [None]:
# Discard events without a usable user: rows where userId or sessionId is null,
# and rows where userId is the empty string.
user_log_valid = user_log.dropna(how="any", subset=["userId", "sessionId"]).filter(
    "userId != ''"
)

# Mark each event with 1 when its page is "Home" and 0 otherwise.
flag_homepage_visit = udf(lambda page: int(page == "Home"), IntegerType())
user_log_homepage = user_log_valid.withColumn("homepage_visit", flag_homepage_visit("page"))

# Per-user running window: events ordered by ascending timestamp, spanning
# everything from the user's first event up to the current timestamp.
windowval = (
    Window.partitionBy("userId")
    .orderBy("ts")
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)

# The running total of Home visits labels the stretch of activity an event
# belongs to: the value increases by one at every Home visit.
user_log_valid_home = user_log_homepage.withColumn(
    "session", Fsum("homepage_visit").over(windowval)
)

# Spot-check the labelling by printing the events of userId 1138 in time order.
sample_user_events = (
    user_log_valid_home.where(user_log_valid_home.userId == "1138")
    .select("userId", "firstname", "ts", "page", "session")
    .sort("ts")
)
# toLocalIterator() streams the rows rather than collecting them all at once.
for row in sample_user_events.toLocalIterator():
    print(row)

In [None]:
# Number of songs ("NextSong" events) in each (user, session) group.
song_plays = user_log_valid_home.filter(user_log_valid_home.page == "NextSong")
user_log_valid_home_n_songs = song_plays.groupBy("userId", "session").agg(
    Fcount("*").alias("n_songs")
)
user_log_valid_home_n_songs.show(20)

In [None]:
# Per user: the total number of songs and the count of session rows, taken
# from the per-(user, session) song counts.
user_song_totals = user_log_valid_home_n_songs.groupBy("userId").agg(
    Fsum("n_songs").alias("total_songs"),
    Fcount("session").alias("total_sessions"),
)

# Each user's average is simply total songs divided by total sessions.
songs_per_session = user_song_totals["total_songs"] / user_song_totals["total_sessions"]
user_avg_songs_per_session = user_song_totals.withColumn(
    "avg_songs_per_session", songs_per_session
)
user_avg_songs_per_session.show(20)

In [None]:
# Overall figure = mean of the per-user averages, so every user carries the
# same weight regardless of how many sessions they had.
# The sum of the averages and the number of users are computed in the same
# aggregation, which avoids a separate count() action on the DataFrame.
# Note: the "avg" field keeps its original name but holds the SUM of the
# per-user averages; "n_users" is the number of rows (one per user).
overall_avg_songs_per_session = user_avg_songs_per_session.agg(
    Fsum("avg_songs_per_session").alias("avg"),
    Fcount("*").alias("n_users"),
).first()
mean_songs_per_session = (
    overall_avg_songs_per_session["avg"] / overall_avg_songs_per_session["n_users"]
)
# round() rather than int(): the question asks for the closest integer, and
# int() would truncate toward zero (e.g. 6.9 -> 6).
print(f"Overall average number of songs played per session across all users: {round(mean_songs_per_session)}")
