In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook via the builder's getOrCreate().
session_builder = SparkSession.builder.appName("User Behavior Insights")
spark = session_builder.getOrCreate()

# Load the music dataset: the first row is treated as the header and column
# types are inferred from the data.
# The path is a raw string because "\B" in a normal literal is an invalid
# escape sequence (it triggers a warning on current Python versions); the
# resulting path value is exactly the same as before.
df = spark.read.csv(r"C:\Big_Data/dataset_music.csv", header=True, inferSchema=True)


In [None]:
# Print the schema of the loaded frame, then preview its first rows.
df.printSchema()
df.show(n=20, truncate=True)  # explicit form of show()'s default arguments


In [4]:
# Narrow the frame to the five columns the later cells work with.
selected_fields = ["user_id", "track_genre", "artists", "track_id", "duration_listened"]
relevant_columns = df.select(*selected_fields)


In [7]:
# Discard every row that has a null in user_id, track_genre, or artists.
required_fields = ["user_id", "track_genre", "artists"]
df_cleaned = relevant_columns.na.drop(how="any", subset=required_fields)


In [8]:
from pyspark.sql.types import IntegerType

# Replace duration_listened with the same column cast to IntegerType.
listened_as_int = df_cleaned["duration_listened"].cast(IntegerType())
df_cleaned = df_cleaned.withColumn("duration_listened", listened_as_int)


In [13]:
# Aggregation with PySpark: per-user play counts by genre
from pyspark.sql.functions import count

# Count non-null track_id values for each (user_id, track_genre) pair.
# NOTE(review): both sort keys are descending, so user_id is ordered from
# highest to lowest as well -- confirm that is the intended ordering.
genre_popularity = (
    df_cleaned
    .groupBy("user_id", "track_genre")
    .agg(count("track_id").alias("genre_play_count"))
    .orderBy("user_id", "genre_play_count", ascending=[False, False])
)


In [30]:
# Count non-null track_id values for each (user_id, artists) pair.
# NOTE(review): both sort keys are descending, so user_id is ordered from
# highest to lowest as well -- confirm that is the intended ordering.
artist_popularity = (
    df_cleaned
    .groupBy("user_id", "artists")
    .agg(count("track_id").alias("artist_play_count"))
    .orderBy("user_id", "artist_play_count", ascending=[False, False])
)


In [31]:
# Count non-null track_id values per genre across all users, largest first.
genre_totals = df_cleaned.groupBy("track_genre").agg(
    count("track_id").alias("total_genre_count")
)
overall_genre_popularity = genre_totals.orderBy(
    genre_totals["total_genre_count"].desc()
)


In [32]:
# Count non-null track_id values per artists value across all users, largest first.
artist_totals = df_cleaned.groupBy("artists").agg(
    count("track_id").alias("total_artist_count")
)
overall_artist_popularity = artist_totals.orderBy(
    artist_totals["total_artist_count"].desc()
)


In [33]:
# Export the per-user genre counts as CSV with a header row, replacing any
# existing output at the target path.
genre_writer = genre_popularity.write.mode("overwrite")
genre_writer.csv(r"C:\Big_Data\user_genre_popularity.csv", header=True)


In [34]:
# Export the remaining result frames as CSV with a header row, in the same
# order as before, replacing any existing output at each target path.
for result_frame, target_path in (
    (artist_popularity, r"C:\Big_Data\user_artist_popularity.csv"),
    (overall_genre_popularity, r"C:\Big_Data\overall_genre_popularity.csv"),
    (overall_artist_popularity, r"C:\Big_Data\overall_artist_popularity.csv"),
):
    result_frame.write.mode("overwrite").csv(target_path, header=True)
