<a href="https://colab.research.google.com/github/TanishqLambhate/Data-Science-Training/blob/pyspark_coding_challenge/Music_Streaming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
spark=SparkSession.builder.appName('Coding Challenge').getOrCreate()
csv_path="/content/Music_Streaming.csv"
df=spark.read.csv(csv_path,header=True,inferSchema=True)
df.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|
|      3|      New Rules|  Dua Lipa|             230|2

In [7]:
# Exercises:
# 1. Calculate the Total Listening Time for Each User
# Group the data by user_id and calculate the total time spent streaming
# (in seconds) for each user.
df.groupBy("user_id").agg(sum("duration_seconds").alias("total_listening_time")).show()

# 2. Filter Songs Streamed for More Than 200 Seconds
# Filter the dataset to show only the songs where the duration_seconds is
# greater than 200.
df.filter(df.duration_seconds > 200).show()

# 3. Find the Most Popular Artist (by Total Streams)
# Group the data by artist and find the artist with the most streams
# (i.e., the highest number of song plays).
df.groupBy("artist").agg(count("*").alias("total_streams")).orderBy(desc("total_streams")).show(1)

# 4. Identify the Song with the Longest Duration
# Identify the song with the longest duration in the dataset.
df.orderBy(desc("duration_seconds")).show(1)

# 5. Calculate the Average Song Duration by Artist
# Group the data by artist and calculate the average song duration for
# each artist.
df.groupBy("artist").agg(avg("duration_seconds").alias("average_song_duration")).show()

+-------+--------------------+
|user_id|total_listening_time|
+-------+--------------------+
|      1|                 630|
|      3|                 610|
|      2|                 680|
+-------+--------------------+

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+

+------

In [16]:
# 6. Find the Top 3 Most Streamed Songs per User
# For each user, find the top 3 most-streamed songs (i.e., songs they
# played most frequently).
windowSpec = Window.partitionBy('user_id').orderBy(desc('duration_seconds'))
df_with_rank = df.withColumn('rank', row_number().over(windowSpec))
df_top_songs = df_with_rank.filter(col('rank') <= 3)
df_top_songs.show()

# 7. Calculate the Total Number of Streams per Day
# Group the data by streaming_time (by extracting the date) and calculate
# the total number of streams for each day.
df_streams=df.withColumn("streaming_time",to_date(col("streaming_time")))
df_streams.groupBy("streaming_time").agg(count("*").alias("total_streams")).show()

# 8. Identify Users Who Streamed Songs from More Than One Artist
# Find users who listened to songs by more than one artist.
df.groupBy("user_id").agg(countDistinct("artist").alias("unique_artists")).filter(col("unique_artists") > 1).show()

# 9. Calculate the Total Streams for Each Location
# Group the data by location and calculate the total number of streams
# for each location.
df.groupBy("location").agg(count("*").alias("total_streams")).show()

# 10. Create a New Column to Classify Long and Short Songs
# Add a new column song_length that classifies a song as "Long" if
# duration_seconds > 200 , otherwise classify it as "Short."
df.withColumn("song_length", when(col("duration_seconds") > 200, "Long").otherwise("Short")).show()


+-------+---------------+----------+----------------+-------------------+-----------+----+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|rank|
+-------+---------------+----------+----------------+-------------------+-----------+----+
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|   1|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|   2|
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|   3|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|   1|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|   2|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|   3|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|   1|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|   2|