In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

In [2]:
# Stop any Spark session left over from an earlier run of this cell, so the
# builder below starts a new session instead of getOrCreate() reusing the old one.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Cleanup stays best-effort: report the problem and carry on building a new
    # session. Unlike a bare `except:`, this does not swallow KeyboardInterrupt
    # or SystemExit.
    print(f"Warning: could not stop the existing Spark session: {exc!r}")

# Optimized for i5 11th gen + 16GB RAM
spark = (
    SparkSession.builder
    .appName("Music_Classifier")
    .master("local[2]")  # run Spark locally with 2 worker threads
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    # Arrow-based pandas conversion is explicitly turned off.
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .getOrCreate()
)

In [None]:
# Load the training CSV: the first row provides the column names and the
# column types are inferred from the data.
df_train = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("./notebook_data/Mendeley_cleaned_train.csv")
)

In [4]:
# Stage 1: turn the string "genre" column into a numeric "label" column.
genre_indexer = StringIndexer(
    inputCol="genre",
    outputCol="label",
)

In [5]:
# Stage 2: split the raw "lyrics" text into a list of tokens ("words").
tokenizer = Tokenizer(
    inputCol="lyrics",
    outputCol="words",
)

# Stage 3: drop stop words from "words", producing "filtered_words".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

In [6]:
# Stage 4 (TF): hash the filtered tokens into a 10000-dimensional
# term-frequency vector stored in "raw_features".
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="filtered_words",
    outputCol="raw_features",
)

# Stage 5 (IDF): rescale "raw_features" by inverse document frequency,
# writing the result to "features".
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

In [8]:
# Preview the first 5 rows of the raw training data.
df_train.show(n=5)

+--------------------+------+
|              lyrics| genre|
+--------------------+------+
|aaaaaaaaaaah yeah...|reggae|
|aaahhh animal bea...| blues|
|aaaow microphone ...|   pop|
|aaaphrodisiac non...|   pop|
|abcdeg learn spel...|  jazz|
+--------------------+------+
only showing top 5 rows



In [9]:
# Chain the feature stages in the order their columns depend on each other:
# label indexing -> tokenizing -> stop-word removal -> TF hashing -> IDF.
pipeline = Pipeline(
    stages=[
        genre_indexer,
        tokenizer,
        remover,
        hashingTF,
        idf,
    ]
)

# Fit the stages on the training data, then apply the fitted pipeline to
# that same data to add the label and feature columns.
pipeline_model = pipeline.fit(df_train)
df_transformed = pipeline_model.transform(df_train)

In [10]:
# Preview the first 5 rows, now including the columns added by the pipeline.
df_transformed.show(n=5)

+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+
|              lyrics| genre|label|               words|      filtered_words|        raw_features|            features|
+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+
|aaaaaaaaaaah yeah...|reggae|  5.0|[aaaaaaaaaaah, ye...|[aaaaaaaaaaah, ye...|(10000,[57,1073,1...|(10000,[57,1073,1...|
|aaahhh animal bea...| blues|  2.0|[aaahhh, animal, ...|[aaahhh, animal, ...|(10000,[57,99,137...|(10000,[57,99,137...|
|aaaow microphone ...|   pop|  0.0|[aaaow, microphon...|[aaaow, microphon...|(10000,[150,166,3...|(10000,[150,166,3...|
|aaaphrodisiac non...|   pop|  0.0|[aaaphrodisiac, n...|[aaaphrodisiac, n...|(10000,[1210,1268...|(10000,[1210,1268...|
|abcdeg learn spel...|  jazz|  4.0|[abcdeg, learn, s...|[abcdeg, learn, s...|(10000,[393,453,4...|(10000,[393,453,4...|
+--------------------+------+-----+-----

In [None]:
# Persist the fitted feature pipeline to disk, overwriting any earlier copy
# saved at the same path.
(
    pipeline_model.write()
    .overwrite()
    .save("./notebook_data/feature_pipeline_lyrics_only")
)