In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# Start (or reuse) the Spark session used for this feature-engineering step.
spark = SparkSession.builder.appName("FeatureEngineering").getOrCreate()

# Load the cleaned dataset; the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv("../data/Mendeley_cleaned.csv", header=True, inferSchema=True)

# Drop rows that are missing either column the pipeline consumes.
# StringIndexer (default handleInvalid="error") rejects a null "genre" at
# transform time, and Tokenizer fails on a null "lyrics" string, so a single
# incomplete row would otherwise abort the fit/transform below. For input
# without nulls in these columns this is a no-op.
df = df.na.drop(subset=["genre", "lyrics"])

# Encode genre labels: map each distinct genre string to a numeric "label".
genre_indexer = StringIndexer(inputCol="genre", outputCol="label")

# Tokenize and clean lyrics: split into lowercase words, then drop stop words.
tokenizer = Tokenizer(inputCol="lyrics", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# TF-IDF: hash the filtered words into a fixed-size term-frequency vector,
# then rescale it by inverse document frequency into the final "features".
# NOTE(review): Spark's docs advise a power-of-two numFeatures for an even
# bucket mapping; 10000 is kept so the saved pipeline's output dimension does
# not change for existing consumers.
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")

# Build pipeline: stages run in list order, each reading the column the
# previous one produced. Fitting learns the genre index and the IDF weights.
pipeline = Pipeline(stages=[genre_indexer, tokenizer, remover, hashingTF, idf])
pipeline_model = pipeline.fit(df)
df_transformed = pipeline_model.transform(df)

# Preview a few rows of the input columns next to the derived label/features.
df_transformed.select("lyrics", "genre", "label", "features").show(5)

# Save pipeline: persist the fitted model, replacing any previous copy at
# this path.
pipeline_model.write().overwrite().save("../data/feature_pipeline_lyrics_only")
