# Top 10 Songs in Top 50 Longest Sessions

## Problem Statement
What are the top 10 songs played in the top 50 longest sessions, where session length is measured by the number of tracks played?

## Session Definition
A user "session" consists of one or more songs played by a given user, where each song is started within 20 minutes of the previous song's start time.

In [None]:
import sys
sys.path.append('..')
import os
from src.common.definition import (
    create_spark_session,
    load_track_data,
    add_sessions_id_columns,
)
from src.analysis_tracks_by_sessions import (
    top_tracks_from_longest_sessions,
    compute_session_duration
)

# Configuration
# Location of the input data, taken from the environment (None when unset).
DATA_PATH = os.environ.get("DATA_PATH")
# How many tracks to report, and from how many of the longest sessions.
TOP_N_TRACKS = 10
TOP_N_SESSIONS = 50
# Session gap threshold in seconds: 60 seconds * 20 minutes.
SESSION_GAP_SEC = 60 * 20

In [None]:
# Create the `spark` handle that the later cells pass to the loader and stop
# during cleanup. The string argument is presumably the Spark application
# name -- confirm against create_spark_session in src.common.definition.
spark = create_spark_session("exercise_1_top_songs")

In [None]:
# Load the listening records found at DATA_PATH.
track_list = load_track_data(spark, DATA_PATH)

# Report how many records were loaded, then print the column layout.
total_records = track_list.count()
print(f"Total records loaded: {total_records:,}")
track_list.printSchema()

                                                                                

In [None]:
# Attach session ids using the configured gap threshold and keep only the
# four columns used by the rest of the notebook.
df_sessions = add_sessions_id_columns(track_list, SESSION_GAP_SEC).select(
    "userid", "timestamp", "track_name", "session_id"
)

# Sessions are counted as distinct (userid, session_id) pairs.
session_keys = df_sessions.select("userid", "session_id").distinct()
print(f"Total sessions created: {session_keys.count():,}")
df_sessions.show(10)

In [None]:
# NOTE(review): the problem statement ranks sessions by track count, while
# this cell inspects session_duration_sec -- it looks exploratory; confirm.
session_durations = compute_session_duration(df_sessions)

# Show the 20 sessions with the largest session_duration_sec first.
longest_first = session_durations.orderBy("session_duration_sec", ascending=False)
longest_first.show(20)

# Summary statistics for the duration column alone.
print("\nSession Duration Statistics:")
duration_only = session_durations.select("session_duration_sec")
duration_only.describe().show()

In [None]:
# Compute the final answer from the sessionized plays, using the session and
# track limits defined in the configuration cell.
top_10_tracks = top_tracks_from_longest_sessions(
    df_sessions, top_n_sessions=TOP_N_SESSIONS, top_n_tracks=TOP_N_TRACKS
)

# Print a heading, then the result with untruncated column values.
heading = f"\nTop {TOP_N_TRACKS} songs played in the top {TOP_N_SESSIONS} longest sessions:\n"
print(heading)
top_10_tracks.show(truncate=False)


In [None]:
# Export the final result as a tab-separated file under ../answers.
output_path = "../answers/top_10_tracks.tsv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists; otherwise the write fails on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Collect the result to the driver as a pandas DataFrame before writing it.
top_10_tracks_pd = top_10_tracks.toPandas()
top_10_tracks_pd.to_csv(output_path, sep='\t', index=False)

## Cleanup

In [None]:
# Stop the `spark` handle created earlier in the notebook; later cells that
# use it would need a new session. Assumes `spark` is a SparkSession -- confirm
# against create_spark_session.
spark.stop()