In [1]:
# Set up the configuration for accessing the storage account.
#
# SECURITY: the account key must never be stored in the notebook itself. It is
# read from the AZURE_STORAGE_ACCOUNT_KEY environment variable instead. The key
# that used to be hardcoded here should be treated as leaked and rotated.
# NOTE(review): on Synapse/Fabric a Key Vault lookup (e.g. via mssparkutils
# credentials) is an alternative source for this value - confirm what the
# workspace provides.
import os

storage_account_name = "team12storage"
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not storage_account_key:
    # Fail early with a clear message instead of a late authentication error.
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; provide the storage account key "
        "through the environment (or a secret store) before running this notebook."
    )

# Register the key with the ABFS driver for this storage account.
spark.conf.set(
    "fs.azure.account.key." + storage_account_name + ".dfs.core.windows.net",
    storage_account_key
)

# Load the dataset from the container and show its schema.
container = "team12blobcontainer"
abfss_base_path = f"abfss://{container}@{storage_account_name}.dfs.core.windows.net/"
df = spark.read.parquet(f"{abfss_base_path}Medallion/Gold/dataset_updated/*")
df.printSchema()

StatementMeta(stockPredict, 23, 2, Finished, Available, Finished)

root
 |-- user_id: string (nullable = true)
 |-- tweet_created_at_date: date (nullable = true)
 |-- is_blue_verified: boolean (nullable = true)
 |-- account_created_at: date (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- account_favourites_count: integer (nullable = true)
 |-- listed_count: integer (nullable = true)
 |-- media_count: integer (nullable = true)
 |-- account_possibly_sensitive: boolean (nullable = true)
 |-- rest_id: string (nullable = true)
 |-- tweet_created_at_time: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- tweet_possibly_sensitive: boolean (nullable = true)
 |-- full_text: string (nullable = true)
 |-- sentiment_score: double (nullable = true)
 |-- interaction_score: double (nullable = true)


In [2]:
from pyspark.sql.functions import col

# Keep only the rows that carry a value for current_volume, then display how
# many rows remain.
has_current_volume = col("current_volume").isNotNull()
df = df.where(has_current_volume)
df.count()

StatementMeta(stockPredict, 23, 3, Finished, Available, Finished)

41864

In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Step 1: Cast the boolean flag columns to integers ("<flag>_int").
flag_cols = ["is_viral", "is_new_account", "is_blue_verified", "is_influencer"]
for flag in flag_cols:
    df = df.withColumn(f"{flag}_int", col(flag).cast("int"))

# Step 2: Column groups. The binary columns are the integer casts created above.
categorical_cols = ["day_of_week"]
binary_cols = [f"{flag}_int" for flag in flag_cols]

# Numeric columns that go into the feature vector as-is.
numeric_cols = [
    "sentiment_score", "interaction_score", "favorite_ratio", "reply_ratio",
    "followers_count", "friends_count", "account_favourites_count",
    "listed_count", "media_count", "account_age_days",
    "credibility_score", "follower_activity_score",
    "current_open", "current_close", "current_high", "current_low", "current_volume",
]

# Step 3: Drop rows with a null in any feature column or in the label.
# required_cols = every feature input plus the label "next_available_volume".
required_cols = numeric_cols + ["next_available_volume"] + categorical_cols + binary_cols
df = df.dropna(subset=required_cols)

# Step 4: Index + one-hot encode the categorical columns, and one-hot encode
# the integer flag columns directly.
indexers = []
encoders = []
for cat_col in categorical_cols:
    indexers.append(
        StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_index", handleInvalid="keep")
    )
    encoders.append(
        OneHotEncoder(inputCol=f"{cat_col}_index", outputCol=f"{cat_col}_vec")
    )

binary_encoders = []
for bin_col in binary_cols:
    binary_encoders.append(OneHotEncoder(inputCol=bin_col, outputCol=f"{bin_col}_vec"))

# Step 5: Assemble numeric columns and encoded vectors into "features", then
# standardise them into "scaled_features".
encoded_cols = [f"{name}_vec" for name in categorical_cols + binary_cols]
feature_cols = numeric_cols + encoded_cols
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Step 6: Build the pipeline, fit it on df and transform df with it.
# NOTE(review): the pipeline (scaler included) is fitted on the whole DataFrame
# here, i.e. before the train/test split done in the training cell - confirm
# that this is intended.
stages = indexers + encoders + binary_encoders + [assembler, scaler]
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)
processed_model_df = pipeline_model.transform(df)

# Preview the scaled feature vectors next to the label.
preview_df = processed_model_df.select("scaled_features", "next_available_volume")
preview_df.show(truncate=False)

StatementMeta(stockPredict, 23, 4, Finished, Available, Finished)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|scaled_features                                                                                                                                                                                                                                                                                                                                                                                                                                  |next_available_volume|
+-----------------------------------------------------------------------------------

In [4]:
# Persist the fitted preprocessing pipeline under the container's Model/ folder.
abfss_base_path = f"abfss://{container}@{storage_account_name}.dfs.core.windows.net/Model"
pipeline_path = abfss_base_path + "/preprocess_pipeline"

# overwrite() replaces whatever was saved at this path before.
pipeline_writer = pipeline_model.write().overwrite()
pipeline_writer.save(pipeline_path)

StatementMeta(stockPredict, 23, 5, Finished, Available, Finished)

In [5]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Train a random-forest regressor for "next_available_volume" with a
# cross-validated grid search, then report RMSE and R² on the held-out split.

# Step 0: Split the data (seeded so the 80/20 split is repeatable)
train_df, test_df = processed_model_df.randomSplit([0.8, 0.2], seed=42)

# Step 1: Define the model
rf = RandomForestRegressor(
    featuresCol="scaled_features",
    labelCol="next_available_volume",
    predictionCol="prediction",
    seed=42
)

# Step 2: Build the parameter grid (3 x 3 x 2 = 18 candidate settings)
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [20, 50, 100]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.minInstancesPerNode, [1, 5]) \
    .build()

# Step 3: Define evaluator (RMSE is the metric used for model selection)
evaluator = RegressionEvaluator(
    labelCol="next_available_volume",
    predictionCol="prediction",
    metricName="rmse"
)

# Step 4: CrossValidator setup
# seed is set explicitly: without it the fold assignment is random on every
# run, so the selected hyper-parameters and metrics would not be reproducible
# even though the split and the forest themselves are seeded.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=2,
    seed=42
)

# Step 5: Fit the model on training data
cv_model = cv.fit(train_df)

# Step 6: Evaluate on test data
best_model = cv_model.bestModel
predictions = best_model.transform(test_df)

rmse = evaluator.evaluate(predictions)
# Reuse the same evaluator for R² by overriding only the metric name, so the
# label/prediction columns cannot drift between the two metrics.
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print("Best model params:")
# getNumTrees is accessed without parentheses, as in the original cell.
print(f" - numTrees: {best_model.getNumTrees}")
print(f" - maxDepth: {best_model.getOrDefault('maxDepth')}")
print(f" - minInstancesPerNode: {best_model.getOrDefault('minInstancesPerNode')}")
print(f"New RMSE: {rmse}")
print(f"New R²: {r2}")

StatementMeta(stockPredict, 22, 6, Finished, Available, Finished)

Best model params:
 - numTrees: 100
 - maxDepth: 15
 - minInstancesPerNode: 1
New RMSE: 6367598.103419246
New R²: 0.9982680518670121


In [6]:
# Persist the best cross-validated model under the container's Model/ folder
# and report where it was written.
abfss_base_path = f"abfss://{container}@{storage_account_name}.dfs.core.windows.net/Model"
model_path = f"{abfss_base_path}/stock_volume_rf_model"

# overwrite() replaces any model previously saved at this path.
model_writer = best_model.write().overwrite()
model_writer.save(model_path)
print(f"Model saved to {model_path}")

StatementMeta(stockPredict, 22, 7, Finished, Available, Finished)

Model saved to abfss://team12blobcontainer@team12storage.dfs.core.windows.net/Model/stock_volume_rf_model
