In [0]:
df = spark.read.table("default.full_data")


In [0]:
from pyspark.sql.functions import col, max
df = df.withColumn("frontpage", col("frontpage").cast("integer"))
df.groupBy('frontpage').count().show()
# Group by 'aid' and find the maximum values for 'comment', 'vote', and 'frontpage'
max_values_df = df.groupBy("aid").agg(
    max("comments").alias("max_comments"),
    max("votes").alias("max_votes"),
    max("frontpage").alias("max_frontpage")
)

# Join the original DataFrame with max_values_df
joined_df = df.join(max_values_df, "aid")

# Filter out rows where 'comment', 'vote', or 'frontpage' are lower than the maximum values
filtered_df = joined_df.filter(
    (col("comments") == col("max_comments")) &
    (col("votes") == col("max_votes")) &
    (col("frontpage") == col("max_frontpage"))
)

df = filtered_df.dropDuplicates(["aid"])

df.show()
# Show the resulting DataFrame
df.groupBy('frontpage').count().show()

+---------+-----+
|frontpage|count|
+---------+-----+
|        1| 2129|
|        0| 9252|
+---------+-----+

+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------+-----+------------+---------+-------------+
|     aid|comments|              domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|         user|votes|max_comments|max_votes|max_frontpage|
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------+-----+------------+---------+-------------+
|39949266|       0|       economist.com|        0|2024-04-06 01:40:43|The AI doctor wil...|The AI doctor wil...|The AI doctor wil...|https://www.econo...|        jdkee|    1|           0|        1|            0|
|39949302|       2|       bloomberg.com|   

In [0]:
from pyspark.sql.functions import col, hour, udf
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Imputer, StopWordsRemover
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Convert posted_at to timestamp
df = df.withColumn("posted_at", col("posted_at").cast("timestamp"))
df = df.withColumn("frontpage", col("frontpage").cast("integer"))

# Extract hour from posted_at
hour_udf = udf(lambda x: x.hour if x else None, IntegerType())
df = df.withColumn("posted_hour", hour_udf(col("posted_at")))
df = df.fillna({"title": "", "source_text": "", "posted_at": ""})

# Impute missing values for numerical columns
imputer_numeric = Imputer(inputCols=["comments","votes"], outputCols=["comments_imputed","votes_imputed"], strategy="mean")

# Standardize numerical columns
assembler_numeric = VectorAssembler(inputCols=["comments_imputed", "votes_imputed"], outputCol="numeric_features")
scaler = StandardScaler(inputCol="numeric_features", outputCol="numeric_features_scaled")

#Text-mining
tokenizer_source_text = Tokenizer(inputCol="source_text", outputCol="source_text_words")
remover_source_text = StopWordsRemover(inputCol="source_text_words", outputCol="source_text_filtered")
hashingTF_source_text = HashingTF(inputCol="source_text_filtered", outputCol="source_text_tf", numFeatures=1024)
idf_source_text = IDF(inputCol="source_text_tf", outputCol="source_text_tfidf")

tokenizer_title = Tokenizer(inputCol="title", outputCol="title_words")
remover_title = StopWordsRemover(inputCol="title_words", outputCol="title_filtered")
hashingTF_title = HashingTF(inputCol="title_filtered", outputCol="title_tf", numFeatures=1024)
idf_title = IDF(inputCol="title_tf", outputCol="title_tfidf")

# Assemble all features
assembler_all = VectorAssembler(inputCols=["posted_hour", "numeric_features_scaled", "source_text_tfidf", "title_tfidf"],
                                outputCol="features")

# Create the pipeline
pipeline = Pipeline(stages=[imputer_numeric, assembler_numeric, scaler] + 
                             [tokenizer_source_text, remover_source_text, hashingTF_source_text, idf_source_text,
                              tokenizer_title, remover_title, hashingTF_title, idf_title,
                              assembler_all])


# Split the data into training, validation, and test sets
train_pre_mod, test = df.randomSplit([0.7, 0.3], seed=42)

# Split the DataFrame into majority class (frontpage=0) and minority class (frontpage=1)
majority_df = train_pre_mod.filter(col("frontpage") == 0)
minority_df = train_pre_mod.filter(col("frontpage") == 1)

# Sample the majority class DataFrame to match the size of the minority class DataFrame
undersampled_majority_df = majority_df.sample(withReplacement=False, fraction=1/4, seed=42)

# Concatenate the sampled majority class DataFrame with the minority class DataFrame
train = undersampled_majority_df.union(minority_df)

# Define models
lr = LogisticRegression(featuresCol="features", labelCol="frontpage")
rf = RandomForestClassifier(featuresCol="features", labelCol="frontpage")
gbt = GBTClassifier(featuresCol="features", labelCol="frontpage")

# Create param grids for hyperparameter tuning
# Create param grids for hyperparameter tuning
paramGridLR = (ParamGridBuilder()
               .addGrid(lr.regParam, [0.1, 0.01])
               .addGrid(lr.elasticNetParam, [0.0, 0.5])  # Adding elasticNetParam
               .addGrid(lr.maxIter, [10, 20])
               .build())

paramGridRF = (ParamGridBuilder()
               .addGrid(rf.numTrees, [10, 20])  # Adding numTrees
               .addGrid(rf.maxDepth, [5, 10])        # Adding maxDepth
               .build())

paramGridGBT = (ParamGridBuilder()
                .addGrid(gbt.maxIter, [10, 20])   # Adding maxIter
                .addGrid(gbt.maxDepth, [5, 10])       # Adding maxDepth
                .build())

# Define evaluators
#evaluator = BinaryClassificationEvaluator(labelCol="frontpage")
# Define evaluators
evaluator = MulticlassClassificationEvaluator(labelCol='frontpage', predictionCol='prediction', metricName='f1')
# Define cross-validators
crossvalLR = CrossValidator(estimator=lr,
                            estimatorParamMaps=paramGridLR,
                            evaluator=evaluator,
                            numFolds=5)

crossvalRF = CrossValidator(estimator=rf,
                            estimatorParamMaps=paramGridRF,
                            evaluator=evaluator,
                            numFolds=5)

crossvalGBT = CrossValidator(estimator=gbt,
                             estimatorParamMaps=paramGridGBT,
                             evaluator=evaluator,
                             numFolds=5)

# Create pipeline models with cross-validation
pipelineLR = Pipeline(stages=[pipeline, crossvalLR])
pipelineRF = Pipeline(stages=[pipeline, crossvalRF])
pipelineGBT = Pipeline(stages=[pipeline, crossvalGBT])

# Train models
modelLR = pipelineLR.fit(train)
modelRF = pipelineRF.fit(train)
modelGBT = pipelineGBT.fit(train)

# Evaluate models
predictionsLR = modelLR.transform(train)
predictionsRF = modelRF.transform(train)
predictionsGBT = modelGBT.transform(train)

aucLR = evaluator.evaluate(predictionsLR)
aucRF = evaluator.evaluate(predictionsRF)
aucGBT = evaluator.evaluate(predictionsGBT)

print(f"Logistic Regression AUC: {aucLR}")
print(f"Random Forest AUC: {aucRF}")
print(f"Gradient Boosted Trees AUC: {aucGBT}")

test_predictionsLR = modelLR.transform(test)
test_predictionsRF = modelRF.transform(test)
test_predictionsGBT = modelGBT.transform(test)

test_aucLR = evaluator.evaluate(test_predictionsLR)
test_aucRF = evaluator.evaluate(test_predictionsRF)
test_aucGBT = evaluator.evaluate(test_predictionsGBT)

print(f"Test Logistic Regression AUC: {test_aucLR}")
print(f"Test Random Forest AUC: {test_aucRF}")
print(f"Test Gradient Boosted Trees AUC: {test_aucGBT}")

Downloading artifacts:   0%|          | 0/61 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/66 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/66 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Logistic Regression AUC: 0.8875060736195266
Random Forest AUC: 0.9229624065596371
Gradient Boosted Trees AUC: 0.9568264803470583
Test Logistic Regression AUC: 0.9173395194263229
Test Random Forest AUC: 0.923716143076252
Test Gradient Boosted Trees AUC: 0.9425614765118919


In [0]:
# Save the model to DBFS
model_path_LR = "/dbfs/models/logistic_regression_model"
model_path_RF = "/dbfs/models/random_forest_model"
model_path_GBT = "/dbfs/models/gradient_boosted_model"
modelLR.write().overwrite().save(model_path_LR)
modelRF.write().overwrite().save(model_path_RF)
modelGBT.write().overwrite().save(model_path_GBT)


In [0]:
# Move model from DBFS to workspace directory
dbutils.fs.cp(model_path_LR, "dbfs:/FileStore/models/logistic_regression_model", recurse=True)
dbutils.fs.cp(model_path_RF, "dbfs:/FileStore/models/random_forest_model", recurse=True)
dbutils.fs.cp(model_path_GBT, "dbfs:/FileStore/models/gradient_boosted_model", recurse=True)


True

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
multi_evaluator = MulticlassClassificationEvaluator(labelCol='frontpage', predictionCol='prediction')
evaluator=BinaryClassificationEvaluator(labelCol='frontpage', metricName='areaUnderROC')
# Calculate metrics
def evaluate_model(predictions, model_name):
    auc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
    accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: 'accuracy'})
    f1 = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: 'f1'})
    recall = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: 'recallByLabel'})
    return {
        'Model': model_name,
        'AUC': auc,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall
    }

# Evaluate all models
metricsLR = evaluate_model(test_predictionsLR, 'Logistic Regression')
metricsRF = evaluate_model(test_predictionsRF, 'Random Forest')
metricsGBT = evaluate_model(test_predictionsGBT, 'Gradient-Boosted Trees')

# Display comparison
import pandas as pd

results = pd.DataFrame([metricsLR, metricsRF, metricsGBT])
print(results)

# Show confusion matrix for each model
from sklearn.metrics import confusion_matrix

def compute_confusion_matrix(predictions):
    y_true = predictions.select("frontpage").rdd.flatMap(lambda x: x).collect()
    y_pred = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
    return confusion_matrix(y_true, y_pred)

cmLR = compute_confusion_matrix(test_predictionsLR)
cmRF = compute_confusion_matrix(test_predictionsRF)
cmGBT = compute_confusion_matrix(test_predictionsGBT)

print(f"Confusion Matrix for Logistic Regression:\n{cmLR}")
print(f"Confusion Matrix for Random Forest:\n{cmRF}")
print(f"Confusion Matrix for Gradient-Boosted Trees:\n{cmGBT}")


                    Model       AUC  Accuracy  F1 Score    Recall
0     Logistic Regression  0.961797  0.914480  0.917340  0.921646
1           Random Forest  0.974520  0.919457  0.923716  0.905862
2  Gradient-Boosted Trees  0.982456  0.939819  0.942561  0.926156
Confusion Matrix for Logistic Regression:
[[1635  139]
 [  50  386]]
Confusion Matrix for Random Forest:
[[1607  167]
 [  11  425]]
Confusion Matrix for Gradient-Boosted Trees:
[[1643  131]
 [   2  434]]
