In [1]:
# Import Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Create Spark session
try:
    spark.stop()
except:
    pass

# Optimized for i5 11th gen + 16GB RAM
spark = SparkSession.builder \
    .appName("Music_Classifier") \
    .master("local[2]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "2g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "false") \
    .getOrCreate()

In [4]:
# Load raw dataset
df = spark.read.csv("./notebook_data/Mendeley_dataset.csv", header=True, inferSchema=True)
df.show(5)

+---+--------------------+--------------------+------------+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+----------+---+
|_c0|         artist_name|          track_name|release_date|genre|              lyrics|len|              dating|            violence|          world/life|          night/time|  shake the audience|       family/gospel|            romantic|       communication|             obscene|               music|     movement/places|light/visual perceptions|    family/spiritual|          like/girls|             sadness|            feelings| 

In [4]:
df = df.dropDuplicates()

In [5]:
df = df.select("lyrics", "genre")
df = df.dropna(subset=["lyrics", "genre"])

In [6]:
df.printSchema()
df.show(5)

root
 |-- lyrics: string (nullable = true)
 |-- genre: string (nullable = true)

+--------------------+-----+
|              lyrics|genre|
+--------------------+-----+
|bible thousand ye...|  pop|
|know fine wanna t...|  pop|
|little funny feel...|  pop|
|read read birth s...|  pop|
|lonely place musi...|  pop|
+--------------------+-----+
only showing top 5 rows



In [7]:
# Split the data into training and testing sets
(train_df, test_df) = df.randomSplit([0.8, 0.2], seed=42)

print(f"Training data count: {train_df.count()}")
print(f"Test data count: {test_df.count()}")

Training data count: 22813
Test data count: 5559


In [None]:
# Save cleaned dataset
train_df.write.mode("overwrite").csv("./notebook_data/Mendeley_cleaned_train.csv", header=True)
test_df.write.mode("overwrite").csv("./notebook_data/Mendeley_cleaned_test.csv", header=True)