In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import (
    RandomForestClassifier, 
    LogisticRegression, 
    NaiveBayes,
)
#from sparkxgb.classifier import XGBoostClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel
from functools import reduce
from pyspark.sql import DataFrame

In [None]:
import os

# Make the Spark workers and the driver use the same Python interpreter.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook on another machine or environment.
python_executable = "C:/Users/rwkos/miniconda3/envs/music_classifier/python.exe"
for env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[env_var] = python_executable

In [None]:
# Stop any Spark session left over from an earlier run of this cell, so the
# builder below creates a session with the configuration given here rather
# than handing back the old one.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been created yet, so there is nothing to stop.
    pass
except Exception as stop_error:
    # Best effort: a failed shutdown must not prevent creating a new session,
    # but report it instead of silently hiding it.
    print(f"Could not stop the existing Spark session: {stop_error!r}")

# Local session with two worker threads; Arrow-based pandas conversion is
# switched off explicitly.
spark = (
    SparkSession.builder
    .appName("Music_Classifier")
    .master("local[2]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .getOrCreate()
)

In [None]:
# Load the cleaned train/test splits. Both CSV files are read with a header
# row and with column types inferred by Spark.
csv_options = {"header": True, "inferSchema": True}
df_train = spark.read.csv("./notebook_data/Mendeley_cleaned_train.csv", **csv_options)
df_test = spark.read.csv("./notebook_data/Mendeley_cleaned_test.csv", **csv_options)

In [None]:
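# Undersampling of the training data (currently disabled: everything below is
# commented out, so the later cells use the full, unbalanced df_train)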

# min_count = df_train.groupBy("genre").count().agg({"count": "min"}).collect()[0][0]
# print(f"The smallest class has {min_count} songs. Sample other classes to match this.")

# genres = [row['genre'] for row in df_train.select('genre').distinct().collect()]

# balanced_df_list = []
# for genre in genres:
#     subset = df_train.where(df_train.genre == genre)
    
#     # If the class is larger than the smallest class, sample it down
#     if subset.count() > min_count:
#         # Calculate the fraction needed for sampling
#         sample_fraction = min_count / subset.count()
#         sampled_subset = subset.sample(withReplacement=False, fraction=sample_fraction, seed=42)
#         balanced_df_list.append(sampled_subset)
#     else:
#         # If it's the smallest class, keep all of it
#         balanced_df_list.append(subset)

# # Combine the balanced subsets into a single DataFrame
# df_train_balanced = reduce(DataFrame.unionAll, balanced_df_list)

# print("New balanced dataset counts:")
# df_train_balanced.groupBy("genre").count().show()

In [None]:
# Load the previously fitted feature-engineering pipeline from disk
pipeline_model = PipelineModel.load("./notebook_data/feature_pipeline_lyrics_only")

# Run both splits through the same pipeline.
# (To train on the undersampled data instead, transform df_train_balanced
# in place of df_train.)
df_train_transformed, df_test_transformed = (
    pipeline_model.transform(split) for split in (df_train, df_test)
)

# Model Training And Evaluation

In [None]:
# Candidate classifiers, keyed by the display name used when reporting scores.
# Every model reads the "features" column and is trained against "label".
column_args = {"featuresCol": "features", "labelCol": "label"}
models = {
    "Random Forest": RandomForestClassifier(seed=42, **column_args),
    "Logistic Regression": LogisticRegression(**column_args),
    "Naive Bayes": NaiveBayes(**column_args),
}

In [None]:
# Scores a predictions DataFrame by comparing "prediction" against "label"
# using the evaluator's "f1" metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

In [None]:
# Fit every candidate on the training split and score it on the test split.
# Fitted models and their scores are kept in dictionaries (keyed by model
# name) so they can be compared or saved afterwards instead of being
# discarded at the end of each iteration.
fitted_models = {}
f1_scores = {}

for name, model in models.items():
    print("---" * 15)
    print(f"Training {name}...")

    # Train on the transformed training data. This is the full df_train:
    # the undersampling step is commented out, so the classes are NOT balanced.
    fitted_model = model.fit(df_train_transformed)
    fitted_models[name] = fitted_model

    # Make predictions on the untouched test data
    preds = fitted_model.transform(df_test_transformed)

    # Calculate the F1 score
    f1_score = evaluator.evaluate(preds)
    f1_scores[name] = f1_score

    print(f" F1-Score for {name}: {f1_score}\n")

In [None]:
import os
from pyspark.sql import SparkSession, DataFrame
from functools import reduce

# Import PySpark tools and the native GBTClassifier
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier, OneVsRest

from pyspark.sql.functions import col

# --- Class weights -----------------------------------------------------------
# Inverse-frequency weighting: weight = total_samples / (num_classes * count),
# so labels with fewer training rows receive a larger weight.
class_counts = df_train_transformed.groupBy("label").count()

total_samples = df_train_transformed.count()
num_classes = class_counts.count()

class_weights = class_counts.withColumn("weight", total_samples / (num_classes * col("count")))

# Attach each row's class weight. Only "label" and "weight" are joined in so
# the per-class "count" column does not end up in the training frame.
train_with_weights = df_train_transformed.join(
    class_weights.select("label", "weight"), "label", "left"
)

# --- Model -------------------------------------------------------------------
# The GBT classifier is wrapped in One-vs-Rest so it can be applied to the
# multi-class "label" column.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    seed=42
)

ovr = OneVsRest(
    classifier=gbt,
    weightCol="weight"
)

# "\n" (not "\\n") so a real blank line is printed rather than a literal backslash-n
print("\n--- Training One-vs-Rest with GBTClassifier and Class Weights ---")

ovr_model = ovr.fit(train_with_weights)

preds = ovr_model.transform(df_test_transformed)

# Same metric settings as the evaluator used for the other models, so the
# scores are directly comparable.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(preds)

print(f"F1-Score for GBTClassifier (One-vs-Rest): {f1:.4f}\n")

# Shut the Spark session down. Any cell run after this point needs the
# session-creation cell to be executed again first.
spark.stop()

In [None]:
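# Save model (disabled).
# NOTE(review): `rf_model` is never assigned in the cells above, and the Spark
# session is stopped at the end of the previous cell; keep a reference to the
# fitted Random Forest model and an active session before enabling this line.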
# rf_model.save("./notebook_data/rf_7class_model_lyrics_only")