In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()
df = spark.read.csv("dataset_music.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)


root
 |-- user_id: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_listened: integer (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)

+-------+--------------------+--------------------+--------------------+--------------------+----------+-----------------+----

In [2]:
from pyspark.sql.functions import regexp_replace, lower

df = df.withColumn("cleaned_genre", lower(regexp_replace("track_genre", "[^a-zA-Z0-9 ]", "")))
df.select("track_genre", "cleaned_genre").show(5)


+-----------+-------------+
|track_genre|cleaned_genre|
+-----------+-------------+
|   acoustic|     acoustic|
|   acoustic|     acoustic|
|   acoustic|     acoustic|
|   acoustic|     acoustic|
|   acoustic|     acoustic|
+-----------+-------------+
only showing top 5 rows



In [3]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="cleaned_genre", outputCol="words")
tokenized_df = tokenizer.transform(df)
tokenized_df.select("cleaned_genre", "words").show(5, truncate=False)


+-------------+----------+
|cleaned_genre|words     |
+-------------+----------+
|acoustic     |[acoustic]|
|acoustic     |[acoustic]|
|acoustic     |[acoustic]|
|acoustic     |[acoustic]|
|acoustic     |[acoustic]|
+-------------+----------+
only showing top 5 rows



In [6]:
from pyspark.ml.feature import HashingTF, IDF

# Convert text to raw features using HashingTF
hashing_tf = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=500)
tf_df = hashing_tf.transform(tokenized_df)

# Use IDF to compute feature importance
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(tf_df)
vectorized_df = idf_model.transform(tf_df)

vectorized_df.select("words", "features").show(5)


+----------+--------------------+
|     words|            features|
+----------+--------------------+
|[acoustic]|(500,[468],[2.302...|
|[acoustic]|(500,[468],[2.302...|
|[acoustic]|(500,[468],[2.302...|
|[acoustic]|(500,[468],[2.302...|
|[acoustic]|(500,[468],[2.302...|
+----------+--------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.clustering import KMeans

# Apply K-Means clustering
kmeans = KMeans(k=4, seed=42, featuresCol="features", predictionCol="mood_cluster")
kmeans_model = kmeans.fit(vectorized_df)
clustered_df = kmeans_model.transform(vectorized_df)

clustered_df.select("track_genre", "mood_cluster").show(10)


+-----------+------------+
|track_genre|mood_cluster|
+-----------+------------+
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
|   acoustic|           0|
+-----------+------------+
only showing top 10 rows



In [8]:
from pyspark.sql.functions import when

clustered_df = clustered_df.withColumn(
    "mood",
    when(clustered_df.mood_cluster == 0, "Happy")
    .when(clustered_df.mood_cluster == 1, "Sad")
    .when(clustered_df.mood_cluster == 2, "Energetic")
    .otherwise("Calm")
)
clustered_df.select("track_genre", "mood").show(10)


+-----------+-----+
|track_genre| mood|
+-----------+-----+
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
|   acoustic|Happy|
+-----------+-----+
only showing top 10 rows



In [None]:
from pyspark.sql.functions import collect_list

recommendations = clustered_df.groupBy("mood").agg(collect_list("track_genre").alias("songs"))
recommendations.show(truncate=False)


In [None]:
from pyspark.sql.functions import concat_ws

# Convert array of songs into a single string
recommendations = recommendations.withColumn("songs", concat_ws(",", recommendations["songs"]))

# Save to CSV
recommendations.write.csv(r"C:\Big_Data\mood_based_recommendations.csv", header=True)
