In [1]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking the Jupyter kernel
        
class StreamingThread(threading.Thread):
    """Thread wrapper that drives a streaming context off the main thread.

    The wrapped object only needs ``start()``, ``awaitTermination()`` and
    ``stop(stopSparkContext=..., stopGraceFully=...)``.
    """

    def __init__(self, ssc):
        """Keep a reference to the streaming context ``ssc`` to drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice, then stop the context gracefully without stopping the SparkContext."""
        print('----- Stopping... this may take a few seconds -----')
        shutdown_options = {"stopSparkContext": False, "stopGraceFully": True}
        self.ssc.stop(**shutdown_options)

In [2]:
# Echo `sc` so the notebook displays it; it is later passed to StreamingContext(sc, 10).
# NOTE(review): `sc` is never defined in this notebook -- presumably the SparkContext
# injected by the PySpark kernel; confirm before running on a plain Python kernel.
sc

In [3]:
# Echo `spark` so the notebook displays it; process() later calls spark.read.json(...) on it.
# NOTE(review): `spark` is never defined in this notebook -- presumably the SparkSession
# injected by the PySpark kernel; confirm before running on a plain Python kernel.
spark

In [4]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [16]:
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.types import IntegerType

# Load the saved pipeline that process() applies to every micro-batch.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where
# that directory exists; consider a configurable model directory.
pipeline_model = PipelineModel.load(r"C:\Users\Romain\databricks\gbt_modelw2v")

def process(time, rdd):
    """Score one micro-batch of JSON records with ``pipeline_model`` and print the result.

    Written as a ``DStream.foreachRDD`` callback.

    Args:
        time: Batch time handed over by Spark Streaming; only used in the printed header.
        rdd: RDD whose elements are parsed as JSON records. Empty batches are skipped.

    Returns:
        None. The selected columns, including ``prediction``, are printed via ``show()``.
    """
    if rdd.isEmpty():
        return

    # Local import: `hour` is not among the functions imported in the notebook's import cells.
    from pyspark.sql.functions import hour

    batch_df = spark.read.json(rdd)
    print("========= %s =========" % str(time))

    # Cast the raw JSON fields: posted_at -> timestamp, frontpage -> integer.
    typed_df = (
        batch_df
        .withColumn("posted_at", col("posted_at").cast("timestamp"))
        .withColumn("frontpage", col("frontpage").cast("integer"))
    )

    # Extract the hour with Spark's built-in hour() instead of building a Python UDF on
    # every batch: same integer result, null for a null timestamp, and no per-row
    # round trip through a Python worker.
    # NOTE(review): hour() is evaluated in the Spark session time zone, whereas the former
    # UDF used the Python worker's local time zone -- expected to be identical on a default
    # local setup; confirm spark.sql.session.timeZone is not overridden.
    featured_df = typed_df.withColumn("posted_hour", hour(col("posted_at")))

    # Replace missing text fields with empty strings before they reach the pipeline.
    # NOTE(review): posted_at is a timestamp at this point, so its "" replacement most
    # likely has no effect -- confirm and drop that entry if so.
    featured_df = featured_df.fillna({"title": "", "source_text": "", "posted_at": ""})

    # The loaded pipeline performs the remaining preprocessing and the prediction.
    # (presumably the same stages that were fitted at training time -- TODO confirm)
    df_with_preds = pipeline_model.transform(featured_df)
    df_with_preds.select('aid','comments','votes','source_text','title','frontpage','prediction').show()

In [17]:
# Streaming context on top of the existing SparkContext; the second argument is the
# batch interval in seconds, i.e. one micro-batch every 10 s.
ssc = StreamingContext(sc, 10)

In [18]:
# Read text lines from the TCP source at seppe.net:7778 and hand each
# micro-batch RDD to process().
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [19]:
# Run the streaming context on a helper thread so this cell returns immediately;
# the tables printed below are emitted by process() from that thread.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------+--------+-----+--------------------+--------------------+---------+----------+
|     aid|comments|votes|         source_text|               title|frontpage|prediction|
+--------+--------+-----+--------------------+--------------------+---------+----------+
|40472361|       0|    1|GitHub - wuyasong...|Show HN: Image To...|        0|       0.0|
+--------+--------+-----+--------------------+--------------------+---------+----------+

+--------+--------+-----+--------------------+--------------------+---------+----------+
|     aid|comments|votes|         source_text|               title|frontpage|prediction|
+--------+--------+-----+--------------------+--------------------+---------+----------+
|40472364|       0|    2|macOS 15 to bring...|Apple plans macOS...|        0|       0.0|
|40472374|       0|    5|Optimizing your t...|Optimize Your Tal...|        1|       1.0|
+--------+--------+-----+--------------------+--------------------+---------+----------+

+--------+--------+

+--------+--------+-----+--------------------+--------------------+---------+----------+
|     aid|comments|votes|         source_text|               title|frontpage|prediction|
+--------+--------+-----+--------------------+--------------------+---------+----------+
|40472994|       0|    2|GitHub - sinaatal...|RenderCV – A Late...|        0|       0.0|
|40473000|       0|    1|Hacking Hard-work...|   Hacking Hard-Work|        0|       0.0|
+--------+--------+-----+--------------------+--------------------+---------+----------+

+--------+--------+-----+--------------------+--------------------+---------+----------+
|     aid|comments|votes|         source_text|               title|frontpage|prediction|
+--------+--------+-----+--------------------+--------------------+---------+----------+
|40473012|       0|    3|MPs urge under-16...|MPs urge under-16...|        0|       1.0|
|40473014|       0|    2|Angle of repose -...|     Angle of Repose|        0|       0.0|
+--------+--------+-

+--------+--------+-----+--------------------+--------------------+---------+----------+
|     aid|comments|votes|         source_text|               title|frontpage|prediction|
+--------+--------+-----+--------------------+--------------------+---------+----------+
|40473618|       0|    2|Your parents are ...|Parents are getti...|        0|       0.0|
|40473619|       0|    1|BenjiPortheault's...|          Discipline|        0|       0.0|
+--------+--------+-----+--------------------+--------------------+---------+----------+

+--------+--------+-----+--------------------+--------------------+---------+----------+
|     aid|comments|votes|         source_text|               title|frontpage|prediction|
+--------+--------+-----+--------------------+--------------------+---------+----------+
|40473656|       0|    2|Three New Superco...|Three New Superco...|        0|       0.0|
|40473685|       0|    1|Amelia Wattenberg...|What makes a coun...|        0|       0.0|
|40473704|       0| 

In [15]:
# Stop the streaming job; StreamingThread.stop() requests a graceful stop and keeps
# the SparkContext alive.
# NOTE(review): this cell's execution count (15) is lower than that of the cells above
# (16-19), and the saved output below shows different columns than process() selects --
# it apparently comes from an earlier run. Re-run the notebook top to bottom before sharing.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------------------+---------+----------+
|            features|frontpage|prediction|
+--------------------+---------+----------+
|[6.0,4.1520477254...|        1|       1.0|
|[6.0,0.0,0.971620...|        1|       1.0|
|(259,[0,2,3,4,5,6...|        0|       0.0|
|[6.0,0.0,0.097162...|        0|       0.0|
+--------------------+---------+----------+

+--------------------+---------+----------+
|            features|frontpage|prediction|
+--------------------+---------+----------+
|(259,[0,2,3,4,5,6...|        1|       1.0|
+--------------------+---------+----------+



Feature Importance

In [9]:
import numpy as np
import pandas as pd
from pyspark.ml import PipelineModel

def _load_best_model(model_path):
    """Load the PipelineModel saved at ``model_path`` and return ``bestModel`` of its last stage."""
    return PipelineModel.load(model_path).stages[-1].bestModel


# Load the models
# NOTE(review): absolute, machine-specific Windows paths -- consider a configurable model directory.
model_GBT_tf_idf = _load_best_model(r"C:\Users\Romain\databricks\gbt_model")
model_GBT_w2v = _load_best_model(r"C:\Users\Romain\databricks\gbt_modelw2v")
model_LR_tf_idf = _load_best_model(r"C:\Users\Romain\databricks\logistic_model")
model_LR_w2v = _load_best_model(r"C:\Users\Romain\databricks\logistic_modelw2v")
model_RF_tf_idf = _load_best_model(r"C:\Users\Romain\databricks\rf_model")
model_RF_w2v = _load_best_model(r"C:\Users\Romain\databricks\rf_modelw2v")

# Get feature importances
feature_importances_GBT_tf_idf = model_GBT_tf_idf.featureImportances
feature_importances_GBT_w2v = model_GBT_w2v.featureImportances
feature_importances_RF_tf_idf = model_RF_tf_idf.featureImportances
feature_importances_RF_w2v = model_RF_w2v.featureImportances
# Sanity check: print the sum of the absolute GBT (TF-IDF) importances.
print(np.sum(np.abs(model_GBT_tf_idf.featureImportances)))

# Normalizing the logistic regression coefficients by the sum of their absolute values.
# The sign is kept, so a normalized coefficient may be negative.
feature_importances_LR_tf_idf = model_LR_tf_idf.coefficients / np.sum(np.abs(model_LR_tf_idf.coefficients))
feature_importances_LR_w2v = model_LR_w2v.coefficients / np.sum(np.abs(model_LR_w2v.coefficients))

# Display labels for the leading feature-vector entries; "posted_hour" matches the column
# name built for the pipeline (the previous label read "poster_hour").
# NOTE(review): assumes the vector starts with posted_hour, comments, votes in this order --
# confirm against the feature assembler used at training time.
features = ["'posted_hour'", "'comments'", "'votes'"]

# A feature is reported when at least one model gives it this much (absolute) weight.
IMPORTANCE_THRESHOLD = 0.04

# Column label -> importance vector, in the column order of the resulting table.
importances_by_model = {
    "GBT (TF-IDF)": feature_importances_GBT_tf_idf,
    "GBT (Word2Vec)": feature_importances_GBT_w2v,
    "LR (TF-IDF)": feature_importances_LR_tf_idf,
    "LR (Word2Vec)": feature_importances_LR_w2v,
    "RF (TF-IDF)": feature_importances_RF_tf_idf,
    "RF (Word2Vec)": feature_importances_RF_w2v,
}

# Collect feature importances in a list of dictionaries
feature_importances_data = []

# zip() stops at the shortest vector, so indices beyond the smallest model's feature
# count are never inspected.
# NOTE(review): past the first len(features) entries, index i presumably refers to
# different text features in the TF-IDF and Word2Vec models -- confirm before comparing
# those rows across columns.
for idx, importances in enumerate(zip(*importances_by_model.values())):
    # Compare magnitudes: normalized LR coefficients are signed, and a strongly negative
    # coefficient is as influential as a strongly positive one.
    if any(abs(float(value)) >= IMPORTANCE_THRESHOLD for value in importances):
        feature_name = features[idx] if idx < len(features) else f"text-mining:{idx}"

        row = {"Feature": feature_name}
        row.update(zip(importances_by_model, importances))
        feature_importances_data.append(row)

# Create a DataFrame from the list of dictionaries
df_feature_importances = pd.DataFrame(feature_importances_data)

# Print the DataFrame
print(df_feature_importances)

1.0000000000000002


'\n# Normalizing the logistic regression coefficients\nfeature_importances_LR_tf_idf = model_LR_tf_idf.coefficients / np.sum(np.abs(model_LR_tf_idf.coefficients))\nfeature_importances_LR_w2v = model_LR_w2v.coefficients / np.sum(np.abs(model_LR_w2v.coefficients))\n\nfeatures = ["\'poster_hour\'", "\'comments\'", "\'votes\'"]\n\n# Collect feature importances in a list of dictionaries\nfeature_importances_data = []\n\nfor idx, (importance_GBT_tf_idf, importance_GBT_w2v, \n          importance_LR_tf_idf, importance_LR_w2v, \n          importance_RF_tf_idf, importance_RF_w2v) in enumerate(zip(feature_importances_GBT_tf_idf, feature_importances_GBT_w2v, \n                                                                     feature_importances_LR_tf_idf, feature_importances_LR_w2v, \n                                                                     feature_importances_RF_tf_idf, feature_importances_RF_w2v)):\n    if (float(importance_GBT_tf_idf) >= 0.04 or float(importance_GBT_w2v) >= 0.04