In [3]:
from bsf_env import init_spark, init_mariadb_engine,set_spark_verbosity
from pyspark.sql.functions import lit, current_timestamp
import pandas as pd
import numpy as np
from pyspark.sql.types import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from IPython.display import display, HTML
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import joblib
import tempfile
import os

spark = init_spark("bsf_candidates_analysis", log_level="WARN", show_progress=False, enable_ui=True, priority=False)
engine = init_mariadb_engine()

ingest_ts = spark.sql("SELECT current_timestamp()").collect()[0][0]

pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 200)         # Adjust width for readability
pd.set_option("display.max_rows", 20)       # Show only top 20 rows by default

# Show tables
tables_df = spark.sql("SHOW TABLES IN bsf")
tables_df.show(truncate=False)

# Add row count for each table
for row in tables_df.collect():
    table_name = row['tableName']
    full_name = f"bsf.{table_name}"
    
    try:
        count = spark.table(full_name).count()
    except Exception as e:
        count = f"Error: {e}"
    
    print(f"Table: {full_name} | Rows: {count}")


df_last = spark.table("bsf.daily_signals_last_allcol ")
df_all = spark.table("bsf.daily_signals")

df_all.groupBy("Action").count().orderBy(F.desc("count")).show(truncate=False)

print(df_all.groupBy("TimeFrame", "Action") \
  .count() \
  .orderBy("TimeFrame", F.desc("count")) \
  .show(truncate=False))

df = df_last.cache()
# -----------------------------
# Aggregate Buy/Sell/Hold counts per company per timeframe
# -----------------------------
df_counts = df.groupBy("CompanyId", "TimeFrame").agg(
    F.sum(F.when(F.col("Action") == "Buy", 1).otherwise(0)).alias("BuyCount"),
    F.sum(F.when(F.col("Action") == "Sell", 1).otherwise(0)).alias("SellCount"),
    F.sum(F.when(F.col("Action") == "Hold", 1).otherwise(0)).alias("HoldCount"),
    F.sum("Return").alias("Return")
)

# -----------------------------
# Define window partitioned by timeframe
# -----------------------------
w_buy = Window.partitionBy("TimeFrame").orderBy(F.desc("BuyCount"))
w_sell = Window.partitionBy("TimeFrame").orderBy(F.desc("SellCount"))
w_hold = Window.partitionBy("TimeFrame").orderBy(F.desc("HoldCount"))

# -----------------------------
# Add separate rank columns
# -----------------------------
df_ranked = (
    df_counts
    .withColumn("BuyRank", F.row_number().over(w_buy))
    .withColumn("SellRank", F.row_number().over(w_sell))
    .withColumn("HoldRank", F.row_number().over(w_hold))
)

# -----------------------------
# Select what you want
# -----------------------------
ranked_companies = df_ranked.select(
    "CompanyId", "TimeFrame", "BuyCount", "SellCount", "HoldCount", 
    "Return", "BuyRank", "SellRank", "HoldRank"
)

# -----------------------------
# Join back to the original df to get full rows with rank by last return
# -----------------------------
ranked_rows = df.join(ranked_companies, on=["CompanyId", "TimeFrame"], how="inner")

display(ranked_rows.filter(F.col("BuyRank") <= 1).orderBy("TimeFrame", "BuyRank").toPandas())


# -----------------------------

# Convert to Pandas and save as csv
# -----------------------------
#List of timeframes
timeframes = ["Short", "Swing", "Long", "Daily"]

for tf in timeframes:
    df_name = f"pdf_{tf.lower()}"  # e.g., "pdf_short"
    df_name_all = f"pdf_{tf.lower()}_all"  # e.g., "pdf_short_all"
    globals()[df_name] = ranked_rows.filter(F.col("TimeFrame") == tf).toPandas()
    globals()[df_name_all] = df_all.filter(F.col("TimeFrame") == tf).toPandas()
    #ranked_rows.filter(F.col("TimeFrame") == tf).toPandas().to_csv(f"cvs/{tf.lower()}_output.csv", index=False)
    
# pdf_short = ranked_rows.filter(F.col("TimeFrame") == "Short").toPandas()
# pdf_short.to_csv(f"cvs/short_output.csv", index=False)

timeframe_dfs = {
    "Short": pdf_short,
    "Swing": pdf_swing,
    "Long": pdf_long,
    "Daily": pdf_daily
}
timeframe_dfs_all = {
    "Short": pdf_short_all,
    "Swing": pdf_swing_all,
    "Long": pdf_long_all,
    "Daily": pdf_daily_all
}

timeframes = ["Short", "Swing", "Long", "Daily"]

# Store Spark DFs only (no toPandas here)
timeframe_dfs = {tf: ranked_rows.filter(F.col("TimeFrame") == tf) for tf in timeframes}
timeframe_dfs_all = {tf: df_all.filter(F.col("TimeFrame") == tf) for tf in timeframes}


#full
# -------------------------
# Full Stage 1 → Stage 2 Pipeline
# -------------------------
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pycaret.regression import setup, compare_models, predict_model, finalize_model

# -------------------------
# Stage 1: Predict TomorrowClose
# -------------------------
target_stage1 = "TomorrowClose"
epsilon = 1e-6
all_stage1_predictions = []
top_n=5
# forecast steps per timeframe for Stage 2
forecast_steps_map = {
    "Daily": 1,
    "Short": 3,
    "Swing": 5,
    "Long": 10
}

# Loop over timeframes
for tf, sdf_tf in timeframe_dfs_all.items():
    pdf_tf = sdf_tf.toPandas()
    companies = pdf_tf['CompanyId'].unique()
    print(f"\n=== Phase 1 - Processing timeframe: {tf} ===")
    # Loop over companies
    for cid in companies:
        df_c = pdf_tf[pdf_tf['CompanyId'] == cid].copy()

        # -------------------------------
        # Log-transform OHLC to normalize scale
        # -------------------------------
        for col in ["Open","High","Low","Close"]:
            df_c[f"log_{col}"] = np.log(df_c[col].replace(0, epsilon))

        # -------------------------------
        # Training data: rows where target is known
        # -------------------------------
        train_df = df_c[df_c[target_stage1].notna()].copy()
        if train_df.empty:
            continue

        # -------------------------------
        # Feature selection: numeric columns correlated with target
        # -------------------------------
        numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
        if target_stage1 in numeric_cols:
            numeric_cols.remove(target_stage1)

        corr = train_df[numeric_cols + [target_stage1]].corr()[target_stage1].abs()
        threshold = 0.03  # minimal correlation
        good_features = corr[corr >= threshold].index.tolist()

        X_train = train_df[good_features].fillna(0)
        y_train = train_df[target_stage1]

        # -------------------------------
        # Stage 1 models
        # -------------------------------
        lr_model = LinearRegression().fit(X_train, y_train)
        lasso_model = Lasso(alpha=0.01).fit(X_train, y_train)
        ridge_model = Ridge(alpha=1.0, solver="svd").fit(X_train, y_train)

        # -------------------------------
        # Predict future rows (target is NaN)
        # -------------------------------
        future_df = df_c[df_c[target_stage1].isna()].copy()
        if not future_df.empty:
            X_future = future_df[good_features].fillna(0)
            future_df["Pred_Linear"] = lr_model.predict(X_future)
            future_df["Pred_Lasso"] = lasso_model.predict(X_future)
            future_df["Pred_Ridge"] = ridge_model.predict(X_future)
            future_df["TimeFrame"] = tf
            future_df["CompanyId"] = cid
            all_stage1_predictions.append(future_df)

# -------------------------------
# Combine Stage 1 predictions
# -------------------------------
if all_stage1_predictions:
    stage1_df = pd.concat(all_stage1_predictions, ignore_index=True)
else:
    stage1_df = pd.DataFrame()
    print("No predictions generated.")

# -------------------------------
# Top-N selection per timeframe (using average of Linear/Lasso/Ridge)
# -------------------------------
if not stage1_df.empty:
    stage1_df["PredictedTomorrowClose"] = stage1_df[["Pred_Linear","Pred_Lasso","Pred_Ridge"]].mean(axis=1)

    top_list = []
    for tf in stage1_df["TimeFrame"].unique():
        tf_df = stage1_df[stage1_df["TimeFrame"] == tf].copy()
        tf_df = tf_df.sort_values("PredictedTomorrowClose", ascending=False)
        top_list.append(tf_df.head(top_n))

    stage1_top_df = pd.concat(top_list, ignore_index=True)

    print("\n=== Stage 1 Top Predictions per Timeframe ===")
    print(stage1_top_df[["TimeFrame", "CompanyId", "PredictedTomorrowClose"]])
else:
    stage1_top_df = pd.DataFrame()

# -------------------------
# Phase 2: SARIMAX + PyCaret (optimized)
# -------------------------
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pycaret.regression import setup, create_model, tune_model, finalize_model, predict_model

# Stage 2 targets
target_stage1 = "PredictedTomorrowClose"
target_stage2 = "TomorrowReturn"

# Forecast steps per timeframe
forecast_steps_map = {
    "Daily": 1,
    "Short": 3,
    "Swing": 5,
    "Long": 10
}

# Option 1: Use all Stage 1 predictions
combined_top_df_clean = stage1_df.fillna(0)

# Option 2: Use only the top-N per timeframe
combined_top_df_clean = stage1_top_df.fillna(0)

# Numeric features (exclude targets)
stage2_features = [c for c in combined_top_df_clean.select_dtypes(include=[np.number]).columns 
                   if c not in [target_stage1, target_stage2]]

# -------------------------
# Phase 2: SARIMAX + PyCaret (optimized)
# -------------------------
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import mlflow
import mlflow.sklearn
from pycaret.regression import setup, create_model, tune_model, finalize_model, predict_model

mlflow.set_tracking_uri("http://localhost:8001")
mlflow.set_experiment("Stage2_SARIMAX_PyCaret")

sarimax_results = []
pycaret_results = []

# with memory issues may want to limit
pycaret_models = ["lr", "lasso", "ridge", "en"]
pycaret_models = ["lasso"]


"""
for tf, steps in forecast_steps_map.items():
    df_tf = timeframe_dfs_all[tf].toPandas().copy()

    top_companies = combined_top_df_clean.loc[
        combined_top_df_clean["TimeFrame"] == tf, "CompanyId"
    ].unique()

    for cid in top_companies:
        df_c = df_tf[df_tf["CompanyId"] == cid].copy().dropna(subset=[target_stage2])

        if df_c.empty or len(df_c) < 120:
            print(f"⏭️ Skipping {cid}-{tf} (not enough data)")
            continue

        ts = df_c[target_stage2]

        # -------------------------
        # SARIMAX
        # -------------------------
        try:
            sarimax_entry = {"CompanyId": cid, "TimeFrame": tf}

            with mlflow.start_run(run_name=f"SARIMAX_{cid}_{tf}"):
                # Example fixed order (replace with auto_arima search results if you have them)
                order = (1, 1, 1)
                seasonal_order = (0, 1, 1, 7)

                sarimax_model = SARIMAX(
                    ts, order=order, seasonal_order=seasonal_order,
                    enforce_stationarity=False, enforce_invertibility=False
                )
                sarimax_fit = sarimax_model.fit(disp=False)

                forecast = sarimax_fit.get_forecast(steps=steps)
                mean_pred = forecast.predicted_mean.mean()
                '''
                sarimax_entry["Pred_SARIMAX"] = mean_pred
                mlflow.log_metric("MeanPred_SARIMAX", mean_pred)

                # Log params
                mlflow.log_params({"order": order, "seasonal_order": seasonal_order})

            sarimax_results.append(sarimax_entry)
            '''
                sarimax_entry = {
                    "CompanyId": cid,
                    "TimeFrame": tf,
                    "Pred_SARIMAX": float(mean_pred),  # ensure scalar
                    "order": str(order),
                    "seasonal_order": str(seasonal_order)
                }
                mlflow.log_metric("MeanPred_SARIMAX", mean_pred)
                mlflow.log_params({"order": order, "seasonal_order": seasonal_order})
            
                # save the model object as an artifact
                with tempfile.TemporaryDirectory() as tmpdir:
                    model_path = os.path.join(tmpdir, "sarimax_model.pkl")
                    joblib.dump(sarimax_fit, model_path)
                    mlflow.log_artifact(model_path, name="SARIMAX_model")


        except Exception as e:
            print(f"❌ SARIMAX failed for {cid}-{tf}: {e}")
"""
for tf, steps in forecast_steps_map.items():
    df_tf = timeframe_dfs_all[tf].toPandas().copy()

    top_companies = combined_top_df_clean.loc[
        combined_top_df_clean["TimeFrame"] == tf, "CompanyId"
    ].unique()

    for cid in top_companies:
        df_c = df_tf[df_tf["CompanyId"] == cid].copy().dropna(subset=[target_stage2])

        if df_c.empty or len(df_c) < 120:
            print(f"⏭️ Skipping {cid}-{tf} (not enough data)")
            continue

        ts = df_c[target_stage2]

        # -------------------------
        # SARIMAX
        # -------------------------
        try:
            sarimax_entry = {"CompanyId": cid, "TimeFrame": tf}

            with mlflow.start_run(run_name=f"SARIMAX_{cid}_{tf}"):
                # Example fixed order (replace with auto_arima search results if you have them)
                order = (1, 1, 1)
                seasonal_order = (0, 1, 1, 7)

                sarimax_model = SARIMAX(
                    ts, order=order, seasonal_order=seasonal_order,
                    enforce_stationarity=False, enforce_invertibility=False
                )
                sarimax_fit = sarimax_model.fit(disp=False)

                forecast = sarimax_fit.get_forecast(steps=steps)
                mean_pred = forecast.predicted_mean.mean()

                # Store results
                sarimax_entry.update({
                    "Pred_SARIMAX": float(mean_pred),
                    "order": str(order),
                    "seasonal_order": str(seasonal_order)
                })

                # Log metric and params
                mlflow.log_metric("MeanPred_SARIMAX", mean_pred)
                mlflow.log_params({"order": order, "seasonal_order": seasonal_order})

                # save model to temp file
                model_path = f"sarimax_{cid}_{tf}.pkl"
                with open(model_path, "wb") as f:
                    pickle.dump(sarimax_fit, f)
                mlflow.log_artifact(model_path)
                os.remove(model_path)
                
                sarimax_results.append(sarimax_entry)

        except Exception as e:
            print(f"❌ SARIMAX failed for {cid}-{tf}: {e}")

        # -------------------------
        # PyCaret
        # -------------------------
        try:
            stage2_features_c = [
                c for c in df_c.select_dtypes(include=[np.number]).columns
                if c not in [target_stage1, target_stage2]
            ]

            pycaret_entry = {"CompanyId": cid, "TimeFrame": tf}

            s = setup(
                data=df_c,
                target=target_stage2,
                numeric_features=stage2_features_c,
                session_id=42,
                log_experiment=False,   # ✅ manual MLflow control
                html=False
            )

            for model_name in pycaret_models:
                with mlflow.start_run(run_name=f"PyCaret_{model_name}_{cid}_{tf}"):
                    '''
                    Memory issues: 
                    Model list is above
                    options
                    ------------
                    df_c_small = df_c.sample(frac=0.3, random_state=42)
                    tuned = tune_model(model, fold=2, optimize="MAE", n_iter=10)
                    
                    model = create_model(model_name, fold=2)  # or even 1
                    tuned = tune_model(model, fold=2, optimize="MAE")

                    suggested:
                    model = create_modelmodel_name, fold=2)
                    final = finalize_model(model)
                    preds = predict_model(final, data=df_c)
                    ------------
                    '''
                    try:
                        model = create_model(model_name, fold=3)
                        tuned = tune_model(model, fold=3, optimize="MAE")
                        final = finalize_model(tuned)
                    except Exception as e:
                        print(f"⚠️ Tuning failed for {model_name} {cid}-{tf}: {e}")
                        # fallback to untuned
                        final = finalize_model(model)

                    preds = predict_model(final, data=df_c)
                    pred_col = next(
                        (c for c in ["Label", "prediction_label", "prediction"] if c in preds.columns),
                        None
                    )
                    mean_pred = preds[pred_col].mean()
                    pycaret_entry[f"Pred_{model_name}_PyCaret"] = mean_pred

                    mlflow.log_metric(f"MeanPred_{model_name}", mean_pred)

                    # Log sklearn model directly
                    mlflow.pycaret.log_model(
                        model=final_model,
                        name=f"{model_name}_model",
                        input_example=df_c.head(1)  # just one row is enough
                    )

            pycaret_results.append(pycaret_entry)

        except Exception as e:
            print(f"❌ PyCaret failed for {cid}-{tf}: {e}")


for entry in sarimax_results:
    print(entry.keys())


# Convert results
sarimax_df = pd.DataFrame(sarimax_results)
pycaret_df = pd.DataFrame(pycaret_results)

# Merge SARIMAX + PyCaret outputs
final_df = combined_top_df_clean.merge(sarimax_df, on=['CompanyId','TimeFrame'], how='left')
final_df = final_df.merge(pycaret_df, on=['CompanyId','TimeFrame'], how='left')

# -------------------------
# Select Top N per timeframe
# -------------------------
top_n = 5

def select_top_n(df, pred_col, n=5):
    top_list = []
    for tf in df['TimeFrame'].unique():
        tf_df = df[df['TimeFrame'] == tf].copy()
        tf_df = tf_df.sort_values(pred_col, ascending=False)
        top_list.append(tf_df.head(n))
    return pd.concat(top_list, ignore_index=True)

# Example: select top 5 by SARIMAX
top_sarimax_df = select_top_n(final_df, 'Pred_SARIMAX', top_n)

# Example: select top 5 by Ridge
def select_top_n(df, pred_col, n=5):
    if pred_col not in df.columns:
        print(f"⚠️ Column {pred_col} not found in DataFrame")
        return pd.DataFrame()  # return empty
    top_list = []
    for tf in df['TimeFrame'].unique():
        tf_df = df[df['TimeFrame'] == tf].copy()
        tf_df = tf_df.sort_values(pred_col, ascending=False)
        top_list.append(tf_df.head(n))
    return pd.concat(top_list, ignore_index=True) if top_list else pd.DataFrame()


# Merge for comparison
top_combined_df = top_sarimax_df.merge(
    pycaret_df,
    on=['CompanyId','TimeFrame'],
    how='outer',
    suffixes=('_SARIMAX','_PyCaret')
)

#print("Top N companies per timeframe (combined SARIMAX + PyCaret):")
#print(top_combined_df)

# pdf_short = ranked_rows.filter(F.col("TimeFrame") == "Short").toPandas()
#top_combined_df.to_csv(f"cvs/final_top_combined_df.csv", index=False)




def select_top_n_final(df, pred_col="PredictedTomorrowClose", n=5):
    """
    Select top-N rows per TimeFrame by prediction column.
    """
    top_list = []
    for tf in df['TimeFrame'].unique():
        tf_df = df[df['TimeFrame'] == tf].copy()
        tf_df = tf_df.sort_values(pred_col, ascending=False)
        top_list.append(tf_df.head(n))
    return pd.concat(top_list, ignore_index=True)

# Pick top-N by PredictedTomorrowClose
top_candidates = select_top_n_final(final_df, pred_col="PredictedTomorrowClose", n=5)

# Reduce to just what you need for DB write
top_out = top_candidates[[
    "CompanyId",
    "TimeFrame",
    "PredictedTomorrowClose"
]]

print(top_out)
from pyspark.sql.functions import lit, max as spark_max

# --- Step 1: Reduce to needed columns ---
top_out = top_candidates[["CompanyId", "TimeFrame", "PredictedTomorrowClose"]]

# --- Step 2: Create managed Delta table if not exists ---
table_name = "bsf.final_top_candidates"

spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    CompanyId STRING,
    TimeFrame STRING,
    PredictedTomorrowClose DOUBLE,
    run_id INT
)
USING DELTA
""")

# --- Step 3: Determine next run_id ---
if spark._jsparkSession.catalog().tableExists(table_name):
    latest_run_id = (
        spark.read.table(table_name)
        .agg(spark_max("run_id"))
        .collect()[0][0]
    )
    run_id = (latest_run_id or 0) + 1
else:
    run_id = 1

# --- Step 4: Add run_id and write ---
top_out_df = top_out.withColumn("run_id", lit(run_id))
top_out_df.write.format("delta").mode("append").saveAsTable(table_name)

print(f"✅ Run {run_id} written to {table_name}")
print(top_out_df.show())
# pdf_short = ranked_rows.filter(F.col("TimeFrame") == "Short").toPandas()
#top_combined_df.to_csv(f"cvs/final_top_combined_1_df.csv", index=False)
from pyspark.sql.functions import lit, col, max as spark_max

table_name = "bsf.final_top_combined"

# --- Step 1: Create managed Delta table if not exists ---
# (I’ll include all your columns explicitly)
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    CompanyId STRING,
    StockDate DATE,
    Open DOUBLE,
    High DOUBLE,
    Low DOUBLE,
    Close DOUBLE,
    TomorrowClose DOUBLE,
    Return DOUBLE,
    TomorrowReturn DOUBLE,
    Doji BOOLEAN,
    Hammer BOOLEAN,
    InvertedHammer BOOLEAN,
    ShootingStar BOOLEAN,
    BullishEngulfing BOOLEAN,
    BearishEngulfing BOOLEAN,
    PiercingLine BOOLEAN,
    DarkCloudCover BOOLEAN,
    MorningStar BOOLEAN,
    EveningStar BOOLEAN,
    ThreeWhiteSoldiers BOOLEAN,
    ThreeBlackCrows BOOLEAN,
    TweezerTop BOOLEAN,
    TweezerBottom BOOLEAN,
    InsideBar BOOLEAN,
    OutsideBar BOOLEAN,
    MA DOUBLE,
    MA_slope DOUBLE,
    UpTrend_MA BOOLEAN,
    DownTrend_MA BOOLEAN,
    MomentumUp BOOLEAN,
    MomentumDown BOOLEAN,
    ConfirmedUpTrend BOOLEAN,
    ConfirmedDownTrend BOOLEAN,
    RecentReturn DOUBLE,
    UpTrend_Return BOOLEAN,
    DownTrend_Return BOOLEAN,
    Volatility DOUBLE,
    LowVolatility BOOLEAN,
    HighVolatility BOOLEAN,
    ROC DOUBLE,
    MomentumZ DOUBLE,
    SignalStrength INT,
    SignalStrengthHybrid DOUBLE,
    ActionConfidence DOUBLE,
    BullishStrengthHybrid DOUBLE,
    BearishStrengthHybrid DOUBLE,
    SignalDuration DOUBLE,
    ValidAction BOOLEAN,
    HasValidSignal BOOLEAN,
    MomentumAction STRING,
    PatternAction STRING,
    CandleAction STRING,
    CandidateAction STRING,
    Action STRING,
    TomorrowAction STRING,
    TomorrowActionSource STRING,
    BatchId STRING,
    IngestedAt STRING,
    TimeFrame STRING,
    log_Open DOUBLE,
    log_High DOUBLE,
    log_Low DOUBLE,
    log_Close DOUBLE,
    Pred_Linear DOUBLE,
    Pred_Lasso DOUBLE,
    Pred_Ridge DOUBLE,
    PredictedTomorrowClose DOUBLE,
    Pred_SARIMAX DOUBLE,
    order STRING,
    seasonal_order STRING,
    Pred_lr_PyCaret_SARIMAX DOUBLE,
    Pred_lasso_PyCaret_SARIMAX DOUBLE,
    Pred_ridge_PyCaret_SARIMAX DOUBLE,
    Pred_en_PyCaret_SARIMAX DOUBLE,
    Pred_lr_PyCaret_PyCaret DOUBLE,
    Pred_lasso_PyCaret_PyCaret DOUBLE,
    Pred_ridge_PyCaret_PyCaret DOUBLE,
    Pred_en_PyCaret_PyCaret DOUBLE,
    run_id INT
)
USING DELTA
""")

# --- Step 2: Get next run_id ---
if spark._jsparkSession.catalog().tableExists(table_name):
    latest_run_id = (
        spark.read.table(table_name)
        .agg(spark_max("run_id"))
        .collect()[0][0]
    )
    run_id = (latest_run_id or 0) + 1
else:
    run_id = 1

# --- Step 3: Add run_id to DataFrame and save ---
df_with_id = top_combined_df.withColumn("run_id", lit(run_id))

df_with_id.write.format("delta").mode("append").saveAsTable(table_name)

print(f"✅ Run {run_id} written to {table_name}")


spark.stop()

[Spark] Started 'bsf_candidates_analysis' log_level=WARN (effective=WARN), progress=False
+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|bsf      |company                  |false      |
|bsf      |companystockhistory      |false      |
|bsf      |daily_signals            |false      |
|bsf      |daily_signals_allcol     |false      |
|bsf      |daily_signals_last       |false      |
|bsf      |daily_signals_last_allcol|false      |
+---------+-------------------------+-----------+

Table: bsf.company | Rows: 30949
Table: bsf.companystockhistory | Rows: 458254
Table: bsf.daily_signals | Rows: 3713208
Table: bsf.daily_signals_allcol | Rows: 3713208
Table: bsf.daily_signals_last | Rows: 14784
Table: bsf.daily_signals_last_allcol | Rows: 14784
+------+-------+
|Action|count  |
+------+-------+
|Hold  |2638632|
|Buy   |596065 |
|Sell  |478511 |
+------+-------+

+---------+------+------+


25/09/13 09:12:01 WARN CacheManager: Asked to cache already cached data.


Unnamed: 0,CompanyId,TimeFrame,StockDate,Open,High,Low,Close,Doji,Hammer,HangingMan,InvertedHammer,ShootingStar,BullishMarubozu,BearishMarubozu,SuspiciousCandle,BullishEngulfing,BearishEngulfing,BullishHarami,BearishHarami,HaramiCross,PiercingLine,DarkCloudCover,MorningStar,EveningStar,ThreeWhiteSoldiers,ThreeBlackCrows,TweezerTop,TweezerBottom,InsideBar,OutsideBar,NearHigh,NearLow,PatternCount,PatternType,MA,MA_slope,UpTrend_MA,DownTrend_MA,RecentReturn,UpTrend_Return,DownTrend_Return,Volatility,LowVolatility,HighVolatility,ROC,MomentumUp,MomentumDown,ConfirmedUpTrend,ConfirmedDownTrend,ValidHammer,ValidBullishEngulfing,ValidPiercingLine,ValidMorningStar,ValidThreeWhiteSoldiers,ValidBullishMarubozu,ValidTweezerBottom,ValidShootingStar,ValidBearishEngulfing,ValidDarkCloud,ValidEveningStar,ValidThreeBlackCrows,ValidBearishMarubozu,ValidTweezerTop,ValidHaramiCross,ValidBullishHarami,ValidBearishHarami,ValidInsideBar,ValidOutsideBar,TomorrowClose,TomorrowReturn,Return,AvgReturn,MomentumZ,BuyThresh,SellThresh,MomentumAction,BullScore,BearScore,PatternScore,PatternScoreNorm,PatternAction,CandleAction,CandidateAction,Action,TomorrowAction,TomorrowActionSource,SignalStrengthHybrid,ActionConfidence,BullishStrengthHybrid,BearishStrengthHybrid,SignalDuration,ValidAction,HasValidSignal,SignalStrength,BatchId,IngestedAt,BuyCount,SellCount,HoldCount,Return.1,BuyRank,SellRank,HoldRank
0,100325,Daily,2025-09-10,0.0126,0.0126,0.0126,0.0126,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,5,,0.012573,-0.036035,False,True,0.0,False,False,0.185284,True,False,-0.020218,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.0,-0.005814,0.03138,0.022422,-0.033779,Buy,0.0,2.0,-2.0,-2.0,Sell,Hold,Buy,Buy,Hold,LastRowHold,0.348937,0.348937,0.348937,0.348937,162.0,True,True,1,100325_20250912_202542,20250912_202542,2,0,0,0.0,1,462,1853
1,100325,Daily,2025-09-10,0.0126,0.0126,0.0126,0.0126,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,5,,0.012573,-0.036035,False,True,0.0,False,False,0.185284,True,False,-0.020218,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.0,-0.005814,0.03138,0.019497,-0.036794,Buy,0.0,2.0,-2.0,-2.0,Sell,Hold,Buy,Buy,Hold,LastRowHold,0.348937,0.348937,0.348937,0.348937,162.0,True,True,1,100325_20250911_183349,20250911_183349,2,0,0,0.0,1,462,1853
2,34813,Long,2025-08-29,0.008,0.008,0.008,0.008,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,3,BullishMarubozu,0.0068,0.307692,True,False,0.6,True,False,0.189737,False,True,0.6,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.0,0.06,-0.316228,-0.001078,-0.070165,Sell,14.0,8.0,6.0,0.6,Buy,Hold,Buy,Buy,Hold,LastRowHold,0.199524,0.199524,0.199524,0.199524,65.0,True,True,1,34813_20250911_174811,20250911_174811,2,0,0,0.0,1,403,1845
3,34813,Long,2025-08-29,0.008,0.008,0.008,0.008,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,3,BullishMarubozu,0.0068,0.307692,True,False,0.6,True,False,0.189737,False,True,0.6,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.0,0.06,-0.316228,-0.00116,-0.070403,Sell,14.0,8.0,6.0,0.6,Buy,Hold,Buy,Buy,Hold,LastRowHold,0.199524,0.199524,0.199524,0.199524,65.0,True,True,1,34813_20250912_182720,20250912_182720,2,0,0,0.0,1,403,1845
4,67893,Short,2025-08-29,0.075,0.075,0.075,0.075,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True,5,BullishEngulfing,0.07125,0.054774,True,False,0.111111,True,False,0.035101,False,True,0.111111,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.111111,0.01126,2.84467,0.010723,-0.004445,Buy,3.0,2.0,1.0,0.333333,Buy,Hold,Buy,Buy,Hold,LastRowHold,0.470283,0.470283,0.470283,0.470283,33.0,True,True,1,67893_20250911_164057,20250911_164057,2,0,0,0.222222,1,461,1855
5,67893,Short,2025-08-29,0.075,0.075,0.075,0.075,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True,5,BullishEngulfing,0.07125,0.054774,True,False,0.111111,True,False,0.035101,False,True,0.111111,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.111111,0.01126,2.84467,0.01077,-0.00445,Buy,3.0,2.0,1.0,0.333333,Buy,Hold,Buy,Buy,Hold,LastRowHold,0.470283,0.470283,0.470283,0.470283,33.0,True,True,1,67893_20250912_164049,20250912_164049,2,0,0,0.222222,1,461,1855
6,52938,Swing,2025-08-29,0.02175,0.02175,0.02175,0.02175,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,3,BullishMarubozu,0.020642,0.064679,True,False,0.145943,True,False,0.050551,False,True,0.145943,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.0,0.009494,-0.187815,0.066529,0.03492,Sell,8.0,5.0,3.0,0.6,Buy,Hold,Buy,Buy,Hold,LastRowHold,0.180148,0.180148,0.180148,0.180148,76.0,True,True,1,52938_20250911_171534,20250911_171534,2,0,0,0.0,1,470,1851
7,52938,Swing,2025-08-29,0.02175,0.02175,0.02175,0.02175,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,3,BullishMarubozu,0.020642,0.064679,True,False,0.145943,True,False,0.050551,False,True,0.145943,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,,0.0,0.009494,-0.187815,0.069717,0.038114,Sell,8.0,5.0,3.0,0.6,Buy,Hold,Buy,Buy,Hold,LastRowHold,0.180148,0.180148,0.180148,0.180148,74.0,True,True,1,52938_20250912_172154,20250912_172154,2,0,0,0.0,1,470,1851


25/09/13 09:35:06 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/09/13 09:35:14 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

Py4JJavaError: An error occurred while calling o2670.collectToPython.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:840)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:120)
	at org.apache.spark.SparkContext.defaultParallelism(SparkContext.scala:2559)
	at org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.$anonfun$apply$1(CoalesceShufflePartitions.scala:60)
	at scala.runtime.java8.JFunction0$mcI$sp.apply(JFunction0$mcI$sp.java:23)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.apply(CoalesceShufflePartitions.scala:57)
	at org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.apply(CoalesceShufflePartitions.scala:33)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$optimizeQueryStage$1(AdaptiveSparkPlanExec.scala:157)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.optimizeQueryStage(AdaptiveSparkPlanExec.scala:156)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.newQueryStage(AdaptiveSparkPlanExec.scala:539)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:500)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:496)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:496)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:530)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:241)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:236)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:381)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:354)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3997)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3994)
	at jdk.internal.reflect.GeneratedMethodAccessor63.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
