
## Machine Learning Model Training (Multilabel Classification)

In [0]:
# Load the preprocessed dataset from the parquet checkpoint and cache it,
# because the cells below read model_df more than once.
checkpoint_path = "/tmp/project/preprocessed_data"
model_df = spark.read.parquet(checkpoint_path).cache()

display(model_df)

### Model hyperparameter tuning

In [0]:
from pyspark.ml.classification import LogisticRegression

# Hyperparameters for the logistic regression model
maxIter = 100
regParam = 0.0             # regularization didn't improve performance
elasticNetParam = 0.0      # regularization didn't improve performance

# 90/10 train/test split with a fixed seed; only the training part is cached
splits = model_df.randomSplit([0.9, 0.1], seed=32)
train_data = splits[0].cache()
test_data = splits[1]

# Configure the estimator, fit it on the training split and score the held-out split
lr = LogisticRegression(
    maxIter=maxIter,
    regParam=regParam,
    elasticNetParam=elasticNetParam,
)
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)


### Preparing labels for multi-label classification

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# Define a UDF to extract labels with probabilities greater than some threshold
@udf(ArrayType(StringType()))
def extract_labels(probabilities, threshold=0.2):
    """Return the positions of all probabilities that exceed ``threshold``.

    Args:
        probabilities: Per-row sequence of probabilities; the position of each
            value is used as its label.
        threshold: Exclusive lower bound a probability must exceed to be kept.

    Returns:
        List of the selected positions rendered as strings, so the values match
        the declared ``ArrayType(StringType())`` return type.
    """
    # enumerate() already yields int positions; convert them to str explicitly
    # instead of relying on Spark to coerce ints into the declared string type.
    return [str(index) for index, prob in enumerate(probabilities) if prob > threshold]

# Find predicted labels based on probability threshold
predictions_df = predictions.withColumn("predicted_labels", extract_labels(predictions.probability))

# Create a label dataframe containing lists of true and predicted labels for comparison
labels_df = predictions_df.select("true_labels", "predicted_labels")

mlb = MultiLabelBinarizer()


def _to_int_labels(label_lists):
    """Cast every label in each row's list to int so true and predicted labels share one type."""
    return label_lists.apply(lambda lst: [int(label) for label in lst])


# Convert to Pandas dataframes to use with MultiLabelBinarizer.
# Only the label columns are collected: toPandas() brings every selected column to the
# driver, and nothing in this cell reads any column other than the labels.
model_pd = model_df.select("true_labels").toPandas()
predictions_pd = labels_df.toPandas()

# Learning classes and creating a class mapping from the labels of the full dataset
mlb.fit(_to_int_labels(model_pd["true_labels"]))

# Finding labels as a binarized list of all labels for each post
true_labels_mlb = mlb.transform(_to_int_labels(predictions_pd["true_labels"]))
predicted_labels_mlb = mlb.transform(_to_int_labels(predictions_pd["predicted_labels"]))

### Model evaluation

In [0]:
"""
Evaluation method 2:
Accuracy is not considered a good metric for multi-label classification. Hamming loss is another metric that suits this problem better.
Small values of the Hamming loss metric (near zero) indicate good performance. Here, precision and recall are also used to give a broader 
perspective on the model performance.
"""
# Compute the metrics on the binarized test-set labels (precision/recall are micro-averaged).
accuracy = accuracy_score(true_labels_mlb, predicted_labels_mlb)
precision = precision_score(true_labels_mlb, predicted_labels_mlb, average='micro')
recall = recall_score(true_labels_mlb, predicted_labels_mlb, average='micro')
# Keep the result under its own name: assigning it to `hamming_loss` would rebind the
# imported sklearn function, and running this cell a second time would then fail.
hamming_loss_value = hamming_loss(true_labels_mlb, predicted_labels_mlb)

print("Accuracy on testing dataset:", accuracy)
print("Precision on testing dataset:", precision)
print("Recall on testing dataset:", recall)
print("Hamming Loss on testing dataset:", hamming_loss_value)

In [0]:
# Show true and predicted labels for visual comparison
display(labels_df)

### Model training on full dataset for deployment

In [0]:
from pyspark.ml.classification import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

# Reload the preprocessed dataset from the parquet checkpoint and cache it
model_df = spark.read.parquet("/tmp/project/preprocessed_data").cache()

# Hyperparameters
maxIter, regParam, elasticNetParam = 100, 0.0, 0.0

# Build the logistic regression estimator and fit it on the complete dataset
lr = LogisticRegression(
    maxIter=maxIter,
    regParam=regParam,
    elasticNetParam=elasticNetParam,
)
lr_model = lr.fit(model_df)


In [0]:
# Save models temporarily for use in deployment notebook.
# save() raises if the target path already exists, so go through the writer with
# overwrite() to keep this cell re-runnable.
lr_model.write().overwrite().save('/mnt/bd-project/Models/lr_model')