In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, max as spark_max

import pyspark.sql.functions as F
from pyspark.sql import Window
import pandas as pd
from bsf_env import init_spark, init_mariadb_engine,set_spark_verbosity
from pyspark.sql.functions import lit, current_timestamp
import pandas as pd
import numpy as np
from pyspark.sql.types import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from IPython.display import display, HTML
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import joblib
import tempfile
import os

spark = init_spark("bsf_candidates_analysis", log_level="WARN", show_progress=False, enable_ui=True, priority=False)
engine = init_mariadb_engine()

ingest_ts = spark.sql("SELECT current_timestamp()").collect()[0][0]

pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 200)         # Adjust width for readability
pd.set_option("display.max_rows", 20)       # Show only top 20 rows by default

# -----------------------------
# STAGE 0: Select top 20 candidates per timeframe
# -----------------------------
top_n = 20
df_last = spark.table("bsf.daily_signals_last_allcol ")
df_all = spark.table("bsf.daily_signals")

df_all.groupBy("Action").count().orderBy(F.desc("count")).show(truncate=False)

print(df_all.groupBy("TimeFrame", "Action") \
  .count() \
  .orderBy("TimeFrame", F.desc("count")) \
  .show(truncate=False))

df = df_last.cache()

# df_last = spark.table("bsf.daily_signals_last_allcol")
# df_all = spark.table("bsf.daily_signals")
df = df_last.cache()

# Aggregate Buy/Sell/Hold counts per company per timeframe
df_counts = df.groupBy("CompanyId", "TimeFrame").agg(
    F.sum(F.when(F.col("Action") == "Buy", 1).otherwise(0)).alias("BuyCount"),
    F.sum(F.when(F.col("Action") == "Sell", 1).otherwise(0)).alias("SellCount"),
    F.sum(F.when(F.col("Action") == "Hold", 1).otherwise(0)).alias("HoldCount"),
    F.sum("Return").alias("Return")
)

# Define window partitioned by timeframe for ranking
w_buy = Window.partitionBy("TimeFrame").orderBy(F.desc("BuyCount"))
w_sell = Window.partitionBy("TimeFrame").orderBy(F.desc("SellCount"))
w_hold = Window.partitionBy("TimeFrame").orderBy(F.desc("HoldCount"))

# Add separate rank columns
df_ranked = (
    df_counts
    .withColumn("BuyRank", F.row_number().over(w_buy))
    .withColumn("SellRank", F.row_number().over(w_sell))
    .withColumn("HoldRank", F.row_number().over(w_hold))
)

# Select top 20 per timeframe using BuyRank (or adjust metric)
top_candidates_ranked = df_ranked.filter(F.col("BuyRank") <= top_n)

# Join back to original df to get full rows for these top candidates
top_candidates_df = df.join(top_candidates_ranked, on=["CompanyId","TimeFrame"], how="inner")

# Convert to Pandas for Stage 1 regression
timeframes = ["Short", "Swing", "Long", "Daily"]
timeframe_dfs = {tf: top_candidates_df.filter(F.col("TimeFrame") == tf) for tf in timeframes}
timeframe_dfs_all = {tf: df_all.filter(F.col("TimeFrame") == tf) for tf in timeframes}

# Optional: convert to Pandas for regression
pdf_timeframe_dfs = {tf: timeframe_dfs[tf].toPandas() for tf in timeframes}
pdf_timeframe_dfs_all = {tf: timeframe_dfs_all[tf].toPandas() for tf in timeframes}

print("✅ Stage 0 completed: Top 20 candidates selected per timeframe")



:: loading settings :: url = jar:file:/home/jupyter/.venv/python3.9_bsf/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jupyter/.ivy2/cache
The jars for the packages stored in: /home/jupyter/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3a82f7e8-b807-45f1-98ad-9e6e877aade2;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0rc1 in spark-list
	found io.delta#delta-storage;3.0.0rc1 in spark-list
	found org.antlr#antlr4-runtime;4.9.3 in spark-list
:: resolution report :: resolve 572ms :: artifacts dl 20ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.0.0rc1 from spark-list in [default]
	io.delta#delta-storage;3.0.0rc1 from spark-list in [default]
	org.antlr#antlr4-runtime;4.9.3 from spark-list in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	

[Spark] Started 'bsf_candidates_analysis' log_level=WARN (effective=WARN), progress=False


25/09/14 19:40:30 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/14 19:40:30 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/14 19:40:30 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/09/14 19:40:31 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/14 19:40:31 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/14 19:40:31 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/09/14 19:40:34 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/14 19:40:34 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/14 19:40:34 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/09/14 19:40:38 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+------+-------+
|Action|count  |
+------+-------+
|Hold  |1304346|
|Buy   |295321 |
|Sell  |237073 |
+------+-------+

+---------+------+------+
|TimeFrame|Action|count |
+---------+------+------+
|Daily    |Hold  |322321|
|Daily    |Buy   |77109 |
|Daily    |Sell  |59755 |
|Long     |Hold  |336220|
|Long     |Buy   |68103 |
|Long     |Sell  |54862 |
|Short    |Hold  |323472|
|Short    |Buy   |74652 |
|Short    |Sell  |61061 |
|Swing    |Hold  |322333|
|Swing    |Buy   |75457 |
|Swing    |Sell  |61395 |
+---------+------+------+

None


25/09/14 19:44:00 WARN CacheManager: Asked to cache already cached data.


✅ Stage 0 completed: Top 20 candidates selected per timeframe


In [None]:
# -------------------------
# CONFIG
# -------------------------
top_n = 5
forecast_steps_map = {
    "Daily": 1,
    "Short": 3,
    "Swing": 5,
    "Long": 10
}
epsilon = 1e-6  # for log transform if needed

# -------------------------
# INIT SPARK
# -------------------------
spark = SparkSession.builder.appName("Stage1_Ensemble_Stage2_SARIMAX").getOrCreate()

# Assume timeframe_dfs_all is a dict of Spark DataFrames keyed by timeframe
# e.g., timeframe_dfs_all = {"Short": sdf_short, "Swing": sdf_swing, ...}

# -------------------------
# STAGE 1: Fast Ensemble Regression (Linear + Lasso + Ridge)
# -------------------------
all_stage1_predictions = []

for tf, sdf_tf in timeframe_dfs_all.items():
    pdf_tf = sdf_tf.toPandas()
    print(f"\n=== Stage 1: Processing timeframe: {tf} ===")

    for cid in pdf_tf['CompanyId'].unique():
        df_c = pdf_tf[pdf_tf['CompanyId'] == cid].copy()

        # Training rows
        train_df = df_c[df_c['TomorrowClose'].notna()].copy()
        if train_df.empty:
            continue

        # Numeric features excluding target
        numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
        if 'TomorrowClose' in numeric_cols:
            numeric_cols.remove('TomorrowClose')

        # Minimal correlation filter
        corr = train_df[numeric_cols + ['TomorrowClose']].corr()['TomorrowClose'].abs()
        good_features = corr[corr >= 0.03].index.tolist()
        good_features = [f for f in good_features if f != 'TomorrowClose']

        if not good_features:
            continue

        X_train = train_df[good_features].fillna(0)
        y_train = train_df['TomorrowClose']

        # Ensemble regressions
        lr_model = LinearRegression().fit(X_train, y_train)
        lasso_model = Lasso(alpha=0.01).fit(X_train, y_train)
        ridge_model = Ridge(alpha=1.0, solver="svd").fit(X_train, y_train)

        # Predict future rows
        future_df = df_c[df_c['TomorrowClose'].isna()].copy()
        if not future_df.empty:
            X_future = future_df[good_features].fillna(0)
            future_df['Pred_Linear'] = lr_model.predict(X_future)
            future_df['Pred_Lasso'] = lasso_model.predict(X_future)
            future_df['Pred_Ridge'] = ridge_model.predict(X_future)

            # Ensemble average
            future_df['Pred_Ensemble'] = future_df[['Pred_Linear', 'Pred_Lasso', 'Pred_Ridge']].mean(axis=1)

            future_df['TimeFrame'] = tf
            future_df['CompanyId'] = cid
            all_stage1_predictions.append(future_df)

# Combine predictions
if all_stage1_predictions:
    stage1_df = pd.concat(all_stage1_predictions, ignore_index=True)
else:
    stage1_df = pd.DataFrame()
    print("⚠️ No Stage 1 predictions generated.")

# -------------------------
# Select top-N candidates per timeframe (ensemble)
# -------------------------
def select_top_n(df, pred_col='Pred_Ensemble', n=top_n):
    top_list = []
    for tf in df['TimeFrame'].unique():
        tf_df = df[df['TimeFrame'] == tf].copy()
        tf_df = tf_df.sort_values(pred_col, ascending=False)
        top_list.append(tf_df.head(n))
    return pd.concat(top_list, ignore_index=True) if top_list else pd.DataFrame()

stage1_top_df = select_top_n(stage1_df, pred_col='Pred_Ensemble', n=top_n)
print("\n=== Stage 1 Top Candidates per Timeframe (Ensemble) ===")
print(stage1_top_df[['TimeFrame', 'CompanyId', 'Pred_Ensemble']])

25/09/14 20:48:40 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/09/14 20:48:40 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [3]:
# -------------------------
# STAGE 2: SARIMAX on top candidates
# -------------------------
sarimax_results = []

# Define simple SARIMAX orders per timeframe
sarimax_orders = {
    "Daily": (1,1,1,1),
    "Short": (1,1,1,3),
    "Swing": (1,1,1,5),
    "Long": (1,1,1,7)
}

for tf, steps in forecast_steps_map.items():
    df_tf = timeframe_dfs_all[tf].toPandas().copy()
    top_companies = stage1_top_df.loc[stage1_top_df['TimeFrame'] == tf, 'CompanyId'].unique()

    for cid in top_companies:
        df_c = df_tf[df_tf['CompanyId'] == cid].copy().dropna(subset=['TomorrowReturn'])
        if len(df_c) < 30:
            print(f"⏭️ Skipping {cid}-{tf} (not enough data)")
            continue

        ts = df_c['TomorrowReturn']
        p,d,q,s = sarimax_orders[tf]

        try:
            model = SARIMAX(
                ts,
                order=(p,d,q),
                seasonal_order=(0,1,1,s),
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            fit = model.fit(disp=False)
            forecast = fit.get_forecast(steps=steps)
            mean_pred = forecast.predicted_mean.mean()

            sarimax_results.append({
                'CompanyId': cid,
                'TimeFrame': tf,
                'Pred_SARIMAX': float(mean_pred),
                'order': str((p,d,q)),
                'seasonal_order': str((0,1,1,s))
            })
        except Exception as e:
            print(f"❌ SARIMAX failed for {cid}-{tf}: {e}")

# Convert to DataFrame
sarimax_df = pd.DataFrame(sarimax_results)
final_df = stage1_top_df.merge(sarimax_df, on=['CompanyId','TimeFrame'], how='left')

❌ SARIMAX failed for 82853-Daily: Seasonal periodicity must be greater than 1.
❌ SARIMAX failed for 34193-Daily: Seasonal periodicity must be greater than 1.
❌ SARIMAX failed for 292453-Daily: Seasonal periodicity must be greater than 1.
❌ SARIMAX failed for 240168-Daily: Seasonal periodicity must be greater than 1.
❌ SARIMAX failed for 53081-Daily: Seasonal periodicity must be greater than 1.


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction

In [4]:
# -------------------------
# Write top candidates to Delta
# -------------------------
top_out = final_df[['CompanyId', 'TimeFrame', 'Pred_Ensemble', 'Pred_SARIMAX']].copy()

table_name = "bsf.final_top_candidates"
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    CompanyId STRING,
    TimeFrame STRING,
    Pred_Ensemble DOUBLE,
    Pred_SARIMAX DOUBLE,
    run_id INT
)
USING DELTA
""")

# Determine next run_id
if spark._jsparkSession.catalog().tableExists(table_name):
    latest_run_id = spark.read.table(table_name).agg(spark_max("run_id")).collect()[0][0]
    run_id = (latest_run_id or 0) + 1
else:
    run_id = 1

# Add run_id and write
top_out_df = spark.createDataFrame(top_out).withColumn("run_id", lit(run_id))
top_out_df.write.format("delta").mode("append").saveAsTable(table_name)

print(f"✅ Stage 2 run {run_id} written to {table_name}")
top_out_df.show()

spark.stop()

25/09/14 20:04:41 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`bsf`.`final_top_candidates` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
25/09/14 20:04:42 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/09/14 20:04:44 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
25/09/14 20:04:44 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/14 20:04:44 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/14 20:04:44 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


AnalysisException: Failed to merge fields 'CompanyId' and 'CompanyId'. Failed to merge incompatible data types StringType and LongType