In [1]:
! pip install pyspark 

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812363 sha256=42143e1e4c8d3edcc49a1f2b03235d40232ee7efcd2eea0332c2b656164bcd62
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Music Streaming") \
    .getOrCreate()

In [4]:
#1. Calculate the Total Listening Time for Each User
from pyspark.sql.functions import col, sum
df_music = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/content/sample_data/music_data.csv")
total_listening_time_per_user = df_music.groupBy("user_id").agg(sum("duration_seconds").alias("total_listening_time"))
total_listening_time_per_user.show()

+-------+--------------------+
|user_id|total_listening_time|
+-------+--------------------+
|      1|                 630|
|      3|                 610|
|      2|                 680|
+-------+--------------------+



In [5]:
#2. Filter Songs Streamed for More Than 200 Seconds
df_long_songs = df_music.filter(col("duration_seconds") > 200)
df_long_songs.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+



In [6]:
#3. Find the Most Popular Artist (by Total Streams)
from pyspark.sql.functions import count
streams_per_artist = df_music.groupBy("artist").agg(count("song_title").alias("total_streams"))
most_popular_artist = streams_per_artist.orderBy(col("total_streams").desc()).limit(1)
most_popular_artist.show()

+--------+-------------+
|  artist|total_streams|
+--------+-------------+
|Dua Lipa|            3|
+--------+-------------+



In [7]:
#4. Identify the Song with the Longest Duration
from pyspark.sql.functions import max
longest_duration = df_music.select(max("duration_seconds").alias("max_duration"))
song_with_longest_duration = df_music.join(longest_duration, col("duration_seconds") == col("max_duration"))
song_with_longest_duration.show()

+-------+----------+----------+----------------+-------------------+-----------+------------+
|user_id|song_title|    artist|duration_seconds|     streaming_time|   location|max_duration|
+-------+----------+----------+----------------+-------------------+-----------+------------+
|      2|   Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|         250|
+-------+----------+----------+----------------+-------------------+-----------+------------+



In [8]:
#5. Calculate the Average Song Duration by Artist
from pyspark.sql.functions import avg
average_duration_per_artist = df_music.groupBy("artist").agg(avg("duration_seconds").alias("average_duration"))
average_duration_per_artist.show()

+----------+------------------+
|    artist|  average_duration|
+----------+------------------+
|  Dua Lipa|203.33333333333334|
|Ed Sheeran|226.66666666666666|
|The Weeknd|             210.0|
+----------+------------------+



In [9]:
#6. Find the Top 3 Most Streamed Songs per User
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
song_streams_per_user = df_music.groupBy("user_id", "song_title").count()
window_spec = Window.partitionBy("user_id").orderBy(col("count").desc())
ranked_songs_per_user = song_streams_per_user.withColumn("rank", row_number().over(window_spec))
top_3_songs_per_user = ranked_songs_per_user.filter(col("rank") <= 3)
top_3_songs_per_user.show()

+-------+---------------+-----+----+
|user_id|     song_title|count|rank|
+-------+---------------+-----+----+
|      1|Blinding Lights|    1|   1|
|      1|        Starboy|    1|   2|
|      1|Save Your Tears|    1|   3|
|      2|    Galway Girl|    1|   1|
|      2|   Shape of You|    1|   2|
|      2|        Perfect|    1|   3|
|      3|Don't Start Now|    1|   1|
|      3|      New Rules|    1|   2|
|      3|     Levitating|    1|   3|
+-------+---------------+-----+----+



In [10]:
#7. Calculate the Total Number of Streams per Day
from pyspark.sql.functions import to_date, count
df_music = df_music.withColumn("date", to_date(col("streaming_time")))
streams_per_day = df_music.groupBy("date").agg(count("song_title").alias("total_streams"))
streams_per_day.show()

+----------+-------------+
|      date|total_streams|
+----------+-------------+
|2023-09-01|            5|
|2023-09-02|            4|
+----------+-------------+



In [11]:
#8. Identify Users Who Streamed Songs from More Than One Artist
from pyspark.sql.functions import countDistinct
distinct_artists_per_user = df_music.groupBy("user_id").agg(countDistinct("artist").alias("distinct_artists"))
users_multiple_artists = distinct_artists_per_user.filter(col("distinct_artists") > 1)
users_multiple_artists.show()

+-------+----------------+
|user_id|distinct_artists|
+-------+----------------+
+-------+----------------+



In [12]:
#9. Calculate the Total Streams for Each Location
total_streams_per_location = df_music.groupBy("location").agg(count("song_title").alias("total_streams"))
total_streams_per_location.show()

+-----------+-------------+
|   location|total_streams|
+-----------+-------------+
|Los Angeles|            3|
|     London|            3|
|   New York|            3|
+-----------+-------------+



In [13]:
#10. Create a New Column to Classify Long and Short Songs
from pyspark.sql.functions import when
df_music = df_music.withColumn("song_length", when(col("duration_seconds") > 200, "Long").otherwise("Short"))
df_music.show()

+-------+---------------+----------+----------------+-------------------+-----------+----------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|      date|song_length|
+-------+---------------+----------+----------------+-------------------+-----------+----------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|2023-09-01|      Short|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|2023-09-01|       Long|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|2023-09-01|      Short|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|2023-09-01|       Long|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|2023-09-01|       Long|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|2023-09-02|      Short|
|      1|Save Your 