In [1]:
# Step 1: Set up Blob Storage Config
# SECURITY: the storage account key must never be hardcoded in the notebook.
# It is fetched at runtime from Azure Key Vault instead. The key that was
# previously committed here should be considered leaked and rotated.
from notebookutils import mssparkutils

# NOTE(review): placeholder vault/secret names -- replace with the real Key Vault
# and secret that hold the team12storage account key before running.
KEY_VAULT_NAME = "team12-keyvault"
STORAGE_KEY_SECRET_NAME = "team12storage-account-key"

spark.conf.set(
  "fs.azure.account.key.team12storage.blob.core.windows.net",
  mssparkutils.credentials.getSecret(KEY_VAULT_NAME, STORAGE_KEY_SECRET_NAME)
)

StatementMeta(stockPredict, 39, 2, Finished, Available, Finished)

In [2]:
# Step 2: Read Stream Data From Blob Storage
from pyspark.sql.functions import lit
from datetime import datetime, timedelta
from notebookutils import mssparkutils

# Take a single timestamp so the file date and the processing_time column
# are derived from the same instant.
run_time = datetime.now()

# The stream file to score is named after yesterday's date (cluster-local time).
start_date = (run_time - timedelta(days=1)).strftime("%Y-%m-%d")
#start_date = "2025-04-28"
input_path = f"wasbs://team12blobcontainer@team12storage.blob.core.windows.net/Medallion/Gold/Stream/{start_date}.json"

# mssparkutils.notebook.exit() is documented to stop an interactive run by
# throwing an exception. Calling it inside the try block would therefore let the
# broad `except Exception` below catch it and report a misleading "Failed to
# read stream data" error. Record the reason here and exit once, after the
# try/except.
exit_message = None

try:
    # First verify file exists
    if mssparkutils.fs.exists(input_path):
        streaming_df = spark.read.option("multiline", "true").json(input_path)
        print(f"Successfully read streaming data from: {input_path}")

        # Add processing timestamp column
        streaming_df = streaming_df.withColumn("processing_time", lit(run_time))
    else:
        print(f"File not found: {input_path}")
        exit_message = "Input file not found - stopping execution"

except Exception as e:
    print(f"Error reading stream data: {str(e)}")
    exit_message = f"Failed to read stream data: {str(e)}"

# End notebook execution if the file is missing or could not be read
if exit_message is not None:
    mssparkutils.notebook.exit(exit_message)

StatementMeta(stockPredict, 39, 3, Finished, Available, Finished)

Successfully read streaming data from: wasbs://team12blobcontainer@team12storage.blob.core.windows.net/Medallion/Gold/Stream/2025-04-28.json


In [3]:
# Step 3: Load Preprocessing Pipeline and Model
from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel

# Fitted preprocessing pipeline: storage location, then load
pipeline_path = "wasbs://team12blobcontainer@team12storage.blob.core.windows.net/Model/preprocess_pipeline"
preprocessing_pipeline = PipelineModel.load(pipeline_path)

# Trained random forest regression model: storage location, then load
model_path = "wasbs://team12blobcontainer@team12storage.blob.core.windows.net/Model/stock_volume_rf_model"
trained_model = RandomForestRegressionModel.load(model_path)

StatementMeta(stockPredict, 39, 4, Finished, Available, Finished)

In [4]:
# Step 4: Transform and Predict
from pyspark.sql.functions import col, avg, current_timestamp
# Each boolean flag gets an integer copy named "<flag>_int"; the original
# boolean columns are left in place.
boolean_flags = ["is_viral", "is_new_account", "is_blue_verified", "is_influencer"]
for flag in boolean_flags:
    streaming_df = streaming_df.withColumn(f"{flag}_int", col(flag).cast("int"))

# Run the loaded preprocessing pipeline, then score with the loaded model
transformed_df = preprocessing_pipeline.transform(streaming_df)
predicted_df = trained_model.transform(transformed_df)

# Mean of the model's "prediction" column for each tweet_created_at_date
daily_mean_prediction = avg("prediction").alias("predicted_next_day_volume")
final_prediction_df = predicted_df.groupBy("tweet_created_at_date").agg(daily_mean_prediction)

# Preview at most 10 aggregated rows
display(final_prediction_df.limit(10))

StatementMeta(stockPredict, 39, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, db8d025b-44c4-4bd5-8649-14232a82a748)

In [5]:
# Add date partition column
final_prediction_df = final_prediction_df.withColumn("prediction_date", col("tweet_created_at_date"))

# Collect the distinct prediction dates in a deterministic order.
# final_prediction_df is already aggregated to one row per date, so this
# collect is small.
prediction_date_rows = (final_prediction_df
    .select("prediction_date")
    .where(col("prediction_date").isNotNull())
    .distinct()
    .orderBy("prediction_date")
    .collect())
prediction_dates = [row[0] for row in prediction_date_rows]

# Fail with a clear message instead of the opaque TypeError that
# `first()[0]` raises when the DataFrame is empty.
if not prediction_dates:
    raise ValueError("No predictions to save: final_prediction_df has no rows with a prediction_date")

# Write each date's prediction to the folder named after that date, so a file
# containing several dates does not end up entirely under one arbitrary date.
for prediction_date in prediction_dates:
    output_path = f"wasbs://team12blobcontainer@team12storage.blob.core.windows.net/Prediction/Streaming/{prediction_date}"

    # use delta format to save
    (final_prediction_df
     .where(col("prediction_date") == prediction_date)
     .write
     .format("delta")
     .mode("overwrite")
     .save(output_path))

StatementMeta(stockPredict, 39, 6, Finished, Available, Finished)