In [None]:
from bsf_env import init_spark, init_mariadb_engine,set_spark_verbosity
from pyspark.sql.functions import lit, current_timestamp
import pandas as pd
import numpy as np
from pyspark.sql.types import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from IPython.display import display, HTML
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import joblib
import tempfile
import os

# -----------------------------
# Spark session, MariaDB engine and display options
# -----------------------------
spark = init_spark("bsf_candidates_analysis", log_level="WARN", show_progress=False, enable_ui=True, priority=False)
engine = init_mariadb_engine()

# Timestamp of this run, taken from the Spark session.
ingest_ts = spark.sql("SELECT current_timestamp()").collect()[0][0]

pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 200)         # Adjust width for readability
pd.set_option("display.max_rows", 20)       # Show only top 20 rows by default

# -----------------------------
# Source tables
# -----------------------------
# The table name previously carried a stray trailing space.
df_last = spark.table("bsf.daily_signals_last_allcol")
df_all = spark.table("bsf.daily_signals")

# -----------------------------
# Aggregate Buy/Sell/Hold counts per company per timeframe
# -----------------------------
df_counts = df_last.groupBy("CompanyId", "TimeFrame").agg(
    F.sum(F.when(F.col("Action") == "Buy", 1).otherwise(0)).alias("BuyCount"),
    F.sum(F.when(F.col("Action") == "Sell", 1).otherwise(0)).alias("SellCount"),
    F.sum(F.when(F.col("Action") == "Hold", 1).otherwise(0)).alias("HoldCount"),
    F.sum("Return").alias("Return")
)


def _rank_window(count_col):
    """Window over one timeframe, highest `count_col` first.

    CompanyId is a secondary sort key so that row_number() breaks ties the
    same way on every run; ordering by the count alone leaves tied rows in
    arbitrary order and makes the top-N set non-reproducible.
    """
    return Window.partitionBy("TimeFrame").orderBy(F.desc(count_col), F.asc("CompanyId"))


w_buy = _rank_window("BuyCount")
w_sell = _rank_window("SellCount")
w_hold = _rank_window("HoldCount")

# -----------------------------
# Add separate rank columns
# -----------------------------
df_ranked = (
    df_counts
    .withColumn("BuyRank", F.row_number().over(w_buy))
    .withColumn("SellRank", F.row_number().over(w_sell))
    .withColumn("HoldRank", F.row_number().over(w_hold))
)

ranked_companies = df_ranked.select(
    "CompanyId", "TimeFrame", "BuyCount", "SellCount", "HoldCount",
    "Return", "BuyRank", "SellRank", "HoldRank"
)

# -----------------------------
# Join the counts/ranks back onto the source rows
# -----------------------------
# NOTE(review): df_last already has a "Return" column (it is summed above), so
# df_ranked_last ends up with two columns named "Return". Referencing it by
# name on the joined frame is ambiguous; consider renaming the aggregate.
df_ranked_last = df_last.join(ranked_companies, on=["CompanyId", "TimeFrame"], how="inner")

# History rows restricted to (CompanyId, TimeFrame) pairs present in df_last.
# Only the keys are joined in, so df_ranked_all has NO rank/count columns.
df_ranked_all = df_all.join(
    ranked_companies.select("CompanyId", "TimeFrame"),
    on=["CompanyId", "TimeFrame"],
    how="inner"
)

# -----------------------------
# Top-N by BuyRank per timeframe
# -----------------------------
top_n = 20

df_ranked_last_top = df_ranked_last.filter(F.col("BuyRank") <= top_n)

# Keys of the top-N companies per timeframe.
top_companies = df_ranked_last_top.select("CompanyId", "TimeFrame").distinct()

# Latest rows and full history limited to those companies.
df_ranked_last_topN = df_ranked_last.join(top_companies, on=["CompanyId", "TimeFrame"], how="inner")
df_ranked_all_topN = df_ranked_all.join(top_companies, on=["CompanyId", "TimeFrame"], how="inner")

from pyspark.sql import functions as F

In [None]:
# Collect the Stage 1 rows with BuyRank <= top_n to pandas and render them.
display(df_ranked_last_top.toPandas())

In [None]:
# Manual stop switch. Set to True to halt "Run All" after Stage 1.
# This replaces `y=1/0`, which aborted execution through an unexplained
# ZeroDivisionError and made the notebook fail on every full run.
STOP_AFTER_STAGE1 = False
if STOP_AFTER_STAGE1:
    raise RuntimeError("Stopped after Stage 1 (STOP_AFTER_STAGE1 is True)")

In [None]:
# Timeframes analysed downstream and the Stage 1 cut-off.
timeframes = ["Short", "Swing", "Long", "Daily"]
top_n = 20  # number of top-ranked companies per timeframe

# Per-timeframe Spark DataFrames (lazy; no toPandas here).
#   timeframe_dfs[tf]     -> latest-signal rows with counts and ranks
#   timeframe_dfs_all[tf] -> history rows
#
# df_ranked_all only had the CompanyId/TimeFrame keys joined in, so it has no
# BuyRank column; the earlier `BuyRank <= top_n` filter on it could not be
# resolved by Spark and failed the cell. The dicts were also built three times,
# each overwriting the last; this is the final (unfiltered) definition.
# NOTE(review): Stage 2 announces "top 20" but iterates timeframe_dfs_all, which
# holds every company. If top-N history is intended, build timeframe_dfs_all
# from df_ranked_all_topN instead.
timeframe_dfs = {tf: df_ranked_last.filter(F.col("TimeFrame") == tf) for tf in timeframes}
timeframe_dfs_all = {tf: df_ranked_all.filter(F.col("TimeFrame") == tf) for tf in timeframes}

# Legacy module-level aliases (pdf_short, pdf_short_all, ...). Despite the
# "pdf_" prefix these are Spark DataFrames, exactly as before.
for tf in timeframes:
    globals()[f"pdf_{tf.lower()}"] = timeframe_dfs[tf]
    globals()[f"pdf_{tf.lower()}_all"] = timeframe_dfs_all[tf]

In [None]:
# Stage 1 result: rows whose BuyRank is within the first 20 of their
# timeframe, listed timeframe by timeframe in rank order.
display(
    df_ranked_last
    .where(F.col("BuyRank") <= 20)
    .sort("TimeFrame", "BuyRank")
    .toPandas()
)
print("✅ Stage 1 completed: Top 20 candidates selected per timeframe")


In [None]:
# Debug aid: prints the repr of the timeframe -> Spark DataFrame mapping.
print(timeframe_dfs_all)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pycaret.regression import setup, compare_models, predict_model, finalize_model
from pycaret.regression import setup, create_model, tune_model, finalize_model, predict_model
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
#full
# -------------------------
# Full Stage 2 → Stage 3 Pipeline
# -------------------------
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pycaret.regression import setup, compare_models, predict_model, finalize_model
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
# -------------------------
# Stage 2: Predict TomorrowClose Regression
# -------------------------
# Stage 2 settings: regression target and the substitute used for zero prices
# before taking logs.
target_stage2, epsilon = "TomorrowClose", 1e-6

# Accumulates one prediction frame per (timeframe, company).
all_stage2_predictions = list()

# Number of candidates kept per timeframe after Stage 2.
top_n = 5

# Forecast steps per timeframe for Stage 2.
forecast_steps_map = dict(Daily=1, Short=3, Swing=5, Long=10)

# PyCaret estimators to fit. The full candidate list is
# ["lr", "lasso", "ridge", "en"]; with memory issues it is limited to lasso.
pycaret_models = ["lasso"]
# Columns that must never be used as features.
# - The target itself: it was previously picked up by the correlation filter
#   (its correlation with itself is 1.0), so the models learned the identity
#   and then predicted from a zero-filled target on the future rows.
# - NOTE(review): "TomorrowReturn" is assumed to be next-day information as
#   well (a later cell uses it as a target) — confirm.
LEAKY_COLUMNS = {target_stage2, "TomorrowReturn"}

# Minimal absolute correlation with the target for a feature to be kept.
CORR_THRESHOLD = 0.03


def _rmse(y_true, y_pred):
    """Root-mean-squared error.

    Computed via sqrt(MSE) so it does not rely on the `squared=False` flag of
    `mean_squared_error`, which newer scikit-learn releases no longer accept.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


for tf, sdf_tf in timeframe_dfs_all.items():
    pdf_tf = sdf_tf.toPandas()
    print(f"\n=== Phase 2 - Processing top 20 for timeframe: {tf} ===")

    # One pass over the frame instead of re-filtering it for every company.
    for cid, df_c in pdf_tf.groupby("CompanyId", sort=False):
        df_c = df_c.copy()

        # Log-transform OHLC to normalize scale (zeros are replaced first so
        # the log is defined).
        for price_col in ["Open", "High", "Low", "Close"]:
            df_c[f"log_{price_col}"] = np.log(df_c[price_col].replace(0, epsilon))

        # Training rows have a known target; rows to predict have a NaN target.
        train_df = df_c[df_c[target_stage2].notna()]
        future_df = df_c[df_c[target_stage2].isna()].copy()
        if train_df.empty:
            print(f"skipped company {cid} no train_df")
            continue
        if future_df.empty:
            # Nothing to predict, so do not spend time fitting models.
            continue

        # -------------------------------
        # Feature selection: numeric/boolean columns correlated with target
        # -------------------------------
        numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
        bool_cols = train_df.select_dtypes(include=["bool"]).columns.tolist()
        all_features = [c for c in numeric_cols + bool_cols if c not in LEAKY_COLUMNS]

        corr = train_df[all_features + [target_stage2]].corr()[target_stage2].abs()
        corr = corr.drop(labels=[target_stage2])  # the target is not a feature
        good_features = corr[corr >= CORR_THRESHOLD].index.tolist()
        if not good_features:
            print(f"skipped company {cid} no usable features")
            continue

        X_train = train_df[good_features].fillna(0)
        y_train = train_df[target_stage2]

        # -------------------------------
        # scikit-learn models, weighted by inverse training RMSE
        # -------------------------------
        sk_models = {
            "Linear": LinearRegression(),
            "Lasso": Lasso(alpha=0.01),
            "Ridge": Ridge(alpha=1.0, solver="svd"),
        }
        inv_rmse = {}
        for sk_name, estimator in sk_models.items():
            estimator.fit(X_train, y_train)
            # Floor the RMSE so a perfect fit cannot cause a division by zero.
            inv_rmse[sk_name] = 1.0 / max(_rmse(y_train, estimator.predict(X_train)), epsilon)
        total_inv = sum(inv_rmse.values())
        weights = {sk_name: value / total_inv for sk_name, value in inv_rmse.items()}

        # -------------------------------
        # Predict future rows (target is NaN)
        # -------------------------------
        X_future = future_df[good_features].fillna(0)
        for sk_name, estimator in sk_models.items():
            future_df[f"Pred_{sk_name}"] = estimator.predict(X_future)

        future_df["Pred_Sklearn"] = sum(
            future_df[f"Pred_{sk_name}"] * weight for sk_name, weight in weights.items()
        )
        future_df["PredictedReturn_Sklearn"] = (
            (future_df["Pred_Sklearn"] - future_df["Close"]) / future_df["Close"]
        )

        # -------------------------------
        # PyCaret regression
        # -------------------------------
        # Create the columns up front so downstream code can rely on them even
        # when PyCaret fails or no model is configured.
        future_df["Pred_PyCaret"] = np.nan
        future_df["PredictedReturn_PyCaret"] = np.nan
        try:
            setup(
                data=train_df[all_features + [target_stage2]],
                target=target_stage2,
                session_id=42,
                log_experiment=False,   # ✅ manual MLflow control
                html=False,
                verbose=False
            )
        except Exception as e:
            print(f"PyCaret failed for {cid} {tf}: {e}")
        else:
            # With several models configured, the last successful one wins.
            for model_name in pycaret_models:
                try:
                    model = create_model(model_name, fold=2)
                except Exception as e:
                    # Without a base model there is nothing to fall back to.
                    print(f"⚠️ Tuning failed for {model_name} {cid}-{tf}: {e}")
                    continue
                try:
                    tuned = tune_model(model, fold=2, optimize="MAE", n_iter=3)
                    final = finalize_model(tuned)
                except Exception as e:
                    print(f"⚠️ Tuning failed for {model_name} {cid}-{tf}: {e}")
                    # fallback to untuned
                    final = finalize_model(model)

                pycaret_preds = predict_model(final, data=future_df[all_features])
                # Assign positionally so the values land on the right rows
                # whatever index predict_model returns.
                future_df["Pred_PyCaret"] = pycaret_preds["prediction_label"].to_numpy()
                future_df["PredictedReturn_PyCaret"] = (
                    (future_df["Pred_PyCaret"] - future_df["Close"]) / future_df["Close"]
                )

        future_df["TimeFrame"] = tf
        future_df["CompanyId"] = cid
        all_stage2_predictions.append(future_df)

# -------------------------------
# Combine Stage 1 predictions
# -------------------------------
# Combine the per-company Stage 2 prediction frames.
if all_stage2_predictions:
    stage2_df = pd.concat(all_stage2_predictions, ignore_index=True)
else:
    stage2_df = pd.DataFrame()
    print("No predictions generated.")

# Columns reported for the selected candidates.
cols_to_keep = ["TimeFrame", "CompanyId", "Close",
                "Pred_Sklearn", "PredictedReturn_Sklearn",
                "Pred_PyCaret", "PredictedReturn_PyCaret", "MaxPredictedReturn"]
return_cols = ["PredictedReturn_Sklearn", "PredictedReturn_PyCaret"]

# Top-N per timeframe by the higher of the two predicted returns.
# The empty case is handled explicitly: indexing "TimeFrame" on an empty frame
# raises KeyError, and pd.concat([]) raises ValueError.
top_list = []
if not stage2_df.empty:
    scored = stage2_df.assign(
        # reindex tolerates a missing PyCaret column (it becomes all-NaN).
        MaxPredictedReturn=stage2_df.reindex(columns=return_cols).max(axis=1)
    )
    top_list = [
        tf_df.sort_values("MaxPredictedReturn", ascending=False).head(top_n)
        for _, tf_df in scored.groupby("TimeFrame", sort=False)
    ]

if top_list:
    stage2_top_df = pd.concat(top_list, ignore_index=True).reindex(columns=cols_to_keep)
else:
    stage2_top_df = pd.DataFrame(columns=cols_to_keep)

print("\n=== Stage 2 Top Predictions per Timeframe ===")
print(stage2_top_df[["TimeFrame", "CompanyId", "Close", "Pred_Sklearn", "Pred_PyCaret", "MaxPredictedReturn"]])

In [None]:
'''
# -------------------------------
# Top-N selection per timeframe (using average of Linear/Lasso/Ridge)
# -------------------------------
if not stage2_df.empty:
    top_list = []
    for tf in stage2_df["TimeFrame"].unique():
        tf_df = stage2_df[stage2_df["TimeFrame"] == tf].copy()  
        tf_df = tf_df.sort_values("PredictedTomorrowClose", ascending=False)
        top_list.append(tf_df.head(top_n))
    
    stage2_top_df = pd.concat(top_list, ignore_index=True)
    
    print("\n=== Stage 2 Top Predictions per Timeframe ===")
    print(stage2_top_df[["TimeFrame", "CompanyId", "Close", "PredictedTomorrowClose", "PredictedReturn"]])

else:
    stage2_top_df = pd.DataFrame()
'''

In [None]:
# -------------------------
# Phase 2: SARIMAX + PyCaret (optimized)
# -------------------------
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pycaret.regression import setup, create_model, tune_model, finalize_model, predict_model

# Stage 2 targets
# Targets for this phase. Both names are excluded from the feature lists below.
target_stage1 = "PredictedTomorrowClose"
target_stage2 = "TomorrowReturn"

# Forecast steps per timeframe
forecast_steps_map = {
    "Daily": 1,
    "Short": 3,
    "Swing": 5,
    "Long": 10
}

# Candidate set for this phase: the top-N rows per timeframe from the previous
# stage, with missing values replaced by 0.
# This used to read `stage1_df` / `stage1_top_df`, names that are not defined
# anywhere in the visible notebook (NameError on a fresh kernel). The previous
# cell produces `stage2_df` (all predictions) and `stage2_top_df` (top-N); use
# `stage2_df.fillna(0)` instead to work on all predictions.
combined_top_df_clean = stage2_top_df.fillna(0)

# Numeric features (exclude targets)
stage2_features = [c for c in combined_top_df_clean.select_dtypes(include=[np.number]).columns
                   if c not in [target_stage1, target_stage2]]

# -------------------------
# Phase 2: SARIMAX + PyCaret (optimized)
# -------------------------
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import mlflow
import mlflow.sklearn
from pycaret.regression import setup, create_model, tune_model, finalize_model, predict_model

# MLflow destination for the runs logged by this phase.
mlflow.set_tracking_uri("http://localhost:8001")
mlflow.set_experiment("Stage2_SARIMAX_PyCaret")

# One record per (company, timeframe), filled in by the modelling loop.
sarimax_results, pycaret_results = [], []

# PyCaret estimators to run. The full candidate list is
# ["lr", "lasso", "ridge", "en"]; with memory issues it is limited to lasso.
pycaret_models = ["lasso"]

# Companies with fewer usable rows than this are skipped.
MIN_HISTORY_ROWS = 120


def _log_sarimax_artifact(fitted, cid, tf):
    """Dump a fitted SARIMAX result with joblib and attach it to the active MLflow run.

    The file lives in a temporary directory, so nothing is left behind in the
    working directory even if logging fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, f"sarimax_{cid}_{tf}.pkl")
        joblib.dump(fitted, model_path)
        mlflow.log_artifact(model_path)


for tf, steps in forecast_steps_map.items():
    df_tf = timeframe_dfs_all[tf].toPandas()

    # Local name on purpose: `top_companies` is a Spark DataFrame defined in
    # Stage 1 and must not be overwritten with a numpy array here.
    candidate_ids = combined_top_df_clean.loc[
        combined_top_df_clean["TimeFrame"] == tf, "CompanyId"
    ].unique()

    for cid in candidate_ids:
        df_c = df_tf[df_tf["CompanyId"] == cid].dropna(subset=[target_stage2])

        if len(df_c) < MIN_HISTORY_ROWS:
            print(f"⏭️ Skipping {cid}-{tf} (not enough data)")
            continue

        # Spark does not guarantee row order; a time-series model needs the
        # observations in chronological order.
        # NOTE(review): assumes "StockDate" is the observation date — confirm.
        if "StockDate" in df_c.columns:
            df_c = df_c.sort_values("StockDate")

        ts = df_c[target_stage2].reset_index(drop=True)

        # -------------------------
        # SARIMAX
        # -------------------------
        try:
            with mlflow.start_run(run_name=f"SARIMAX_{cid}_{tf}"):
                # Example fixed order (replace with auto_arima search results if you have them)
                order = (1, 1, 1)
                seasonal_order = (0, 1, 1, 7)

                sarimax_fit = SARIMAX(
                    ts, order=order, seasonal_order=seasonal_order,
                    enforce_stationarity=False, enforce_invertibility=False
                ).fit(disp=False)

                # Mean of the forecast over the timeframe's horizon.
                mean_pred = float(sarimax_fit.get_forecast(steps=steps).predicted_mean.mean())

                mlflow.log_metric("MeanPred_SARIMAX", mean_pred)
                mlflow.log_params({"order": order, "seasonal_order": seasonal_order})

                # Previously this used `pickle`, which is never imported, so
                # every run failed here before its result was recorded.
                _log_sarimax_artifact(sarimax_fit, cid, tf)

            sarimax_results.append({
                "CompanyId": cid,
                "TimeFrame": tf,
                "Pred_SARIMAX": mean_pred,
                "order": str(order),
                "seasonal_order": str(seasonal_order)
            })

        except Exception as e:
            print(f"❌ SARIMAX failed for {cid}-{tf}: {e}")

        # -------------------------
        # PyCaret
        # -------------------------
        try:
            stage2_features_c = [
                c for c in df_c.select_dtypes(include=[np.number]).columns
                if c not in [target_stage1, target_stage2]
            ]

            pycaret_entry = {"CompanyId": cid, "TimeFrame": tf}

            setup(
                data=df_c,
                target=target_stage2,
                numeric_features=stage2_features_c,
                session_id=42,
                log_experiment=False,   # ✅ manual MLflow control
                html=False
            )

            for model_name in pycaret_models:
                with mlflow.start_run(run_name=f"PyCaret_{model_name}_{cid}_{tf}"):
                    try:
                        model = create_model(model_name, fold=3)
                    except Exception as e:
                        # No base model means there is nothing to fall back to.
                        print(f"⚠️ Tuning failed for {model_name} {cid}-{tf}: {e}")
                        continue
                    try:
                        tuned = tune_model(model, fold=3, optimize="MAE")
                        final = finalize_model(tuned)
                    except Exception as e:
                        print(f"⚠️ Tuning failed for {model_name} {cid}-{tf}: {e}")
                        # fallback to untuned
                        final = finalize_model(model)

                    preds = predict_model(final, data=df_c)
                    # The prediction column name differs between PyCaret versions.
                    pred_col = next(
                        (c for c in ["Label", "prediction_label", "prediction"] if c in preds.columns),
                        None
                    )
                    if pred_col is None:
                        print(f"⚠️ No prediction column returned for {model_name} {cid}-{tf}")
                        continue

                    mean_pred = float(preds[pred_col].mean())
                    pycaret_entry[f"Pred_{model_name}_PyCaret"] = mean_pred
                    mlflow.log_metric(f"MeanPred_{model_name}", mean_pred)

                    # Model logging is best-effort: a logging problem must not
                    # discard the prediction. The old call used an undefined
                    # name (`final_model`) and `mlflow.pycaret`, which is not
                    # an MLflow flavor; the finalized PyCaret model is logged
                    # through the sklearn flavor instead.
                    try:
                        mlflow.sklearn.log_model(
                            final,
                            f"{model_name}_model",
                            input_example=df_c.drop(columns=[target_stage2]).head(1)
                        )
                    except Exception as e:
                        print(f"⚠️ Model logging failed for {model_name} {cid}-{tf}: {e}")

            pycaret_results.append(pycaret_entry)

        except Exception as e:
            print(f"❌ PyCaret failed for {cid}-{tf}: {e}")


KEY_COLS = ["CompanyId", "TimeFrame"]


def _results_frame(records, like, extra_cols=()):
    """Turn a list of result dicts into a DataFrame that is always mergeable.

    `pd.DataFrame([])` has no columns, so merging it on the key columns raises
    KeyError. When `records` is empty, an empty frame with the key columns
    (typed like those of `like`) plus `extra_cols` is returned instead.
    """
    frame = pd.DataFrame(records)
    if frame.empty:
        frame = pd.DataFrame(columns=[*KEY_COLS, *extra_cols])
        frame = frame.astype({c: like[c].dtype for c in KEY_COLS})
    return frame


def select_top_n(df, pred_col, n=5):
    """Return the `n` rows with the highest `pred_col` within each TimeFrame.

    Timeframes appear in order of first occurrence and the result index is
    reset. If `pred_col` is missing a warning is printed and an empty frame is
    returned; empty results keep the columns of `df` so they can still be
    merged downstream.
    """
    if pred_col not in df.columns:
        print(f"⚠️ Column {pred_col} not found in DataFrame")
        return df.head(0)
    top_list = [
        tf_df.sort_values(pred_col, ascending=False).head(n)
        for _, tf_df in df.groupby("TimeFrame", sort=False)
    ]
    return pd.concat(top_list, ignore_index=True) if top_list else df.head(0)


# Convert results
sarimax_df = _results_frame(
    sarimax_results, combined_top_df_clean,
    extra_cols=("Pred_SARIMAX", "order", "seasonal_order"),
)
pycaret_df = _results_frame(pycaret_results, combined_top_df_clean)

# Merge SARIMAX + PyCaret outputs onto the candidate rows
final_df = combined_top_df_clean.merge(sarimax_df, on=KEY_COLS, how='left')
final_df = final_df.merge(pycaret_df, on=KEY_COLS, how='left')

# -------------------------
# Select Top N per timeframe
# -------------------------
top_n = 5

# Top N by the SARIMAX forecast
top_sarimax_df = select_top_n(final_df, 'Pred_SARIMAX', top_n)

# Merge for comparison. final_df already carries the PyCaret columns, so this
# second merge yields each of them twice, told apart by the suffixes.
top_combined_df = top_sarimax_df.merge(
    pycaret_df,
    on=KEY_COLS,
    how='outer',
    suffixes=('_SARIMAX','_PyCaret')
)

#print("Top N companies per timeframe (combined SARIMAX + PyCaret):")
#print(top_combined_df)

# pdf_short = ranked_rows.filter(F.col("TimeFrame") == "Short").toPandas()
#top_combined_df.to_csv(f"cvs/final_top_combined_df.csv", index=False)




def select_top_n_final(df, pred_col="PredictedTomorrowClose", n=5):
    """
    Select top-N rows per TimeFrame by prediction column.

    Parameters:
        df: pandas DataFrame with a "TimeFrame" column and `pred_col`.
        pred_col: column to rank by, highest first.
        n: number of rows kept per timeframe.

    Returns:
        A new DataFrame with a fresh 0..k-1 index. Timeframes appear in order
        of first occurrence, each sorted by `pred_col` descending. An empty
        input yields an empty frame with the same columns (it used to raise
        ValueError from `pd.concat` on an empty list).

    Raises:
        KeyError: if "TimeFrame" is missing, or `pred_col` is missing and
        there are rows to sort.
    """
    top_list = [
        tf_df.sort_values(pred_col, ascending=False).head(n)
        for _, tf_df in df.groupby("TimeFrame", sort=False)
    ]
    if not top_list:
        return df.head(0).reset_index(drop=True)
    return pd.concat(top_list, ignore_index=True)

# Pick top-N by PredictedTomorrowClose
from pyspark.sql.functions import lit, col, max as spark_max

PRED_COL = "PredictedTomorrowClose"


def _to_spark_ready(pdf):
    """Return a copy of `pdf` that spark.createDataFrame can infer a schema for.

    Columns that are entirely null are dropped (Spark cannot infer their type;
    the Delta table stores null for columns absent from the write), and NaN in
    object columns becomes None so strings are not mixed with float NaN.
    """
    clean = pdf.dropna(axis=1, how="all").copy()
    for name in clean.select_dtypes(include=["object"]).columns:
        clean[name] = clean[name].where(clean[name].notna(), None)
    return clean


def _append_run(pdf, table_name):
    """Append pandas frame `pdf` to Delta table `table_name` under a new run_id.

    The table must already exist. Columns that exist in the table are cast to
    the table's types; columns the table does not have yet are added through
    Delta schema evolution (mergeSchema).

    Returns:
        (run_id, spark_df) for the written data, or (None, None) if `pdf` is
        empty, in which case nothing is written.
    """
    if pdf.empty:
        print(f"⚠️ Nothing to write to {table_name}")
        return None, None

    table = spark.read.table(table_name)
    latest_run_id = table.agg(spark_max("run_id")).collect()[0][0]
    next_run_id = (latest_run_id or 0) + 1  # max() is NULL on an empty table

    table_types = {field.name: field.dataType for field in table.schema.fields}
    sdf = spark.createDataFrame(_to_spark_ready(pdf))
    sdf = sdf.select([
        col(f"`{name}`").cast(table_types[name]).alias(name) if name in table_types
        else col(f"`{name}`")
        for name in sdf.columns
    ]).withColumn("run_id", lit(next_run_id).cast("int"))

    (sdf.write.format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .saveAsTable(table_name))

    print(f"✅ Run {next_run_id} written to {table_name}")
    return next_run_id, sdf


# -------------------------
# Final candidates: top 5 per timeframe by predicted close
# -------------------------
# final_df carries the ensemble prediction as "Pred_Sklearn"; fall back to it
# when the legacy "PredictedTomorrowClose" column is absent (previously a
# KeyError). final_df itself is left untouched.
# NOTE(review): ranking by an absolute predicted price favours high-priced
# stocks; a predicted return may be the better key — confirm intent.
if PRED_COL in final_df.columns:
    ranking_source = final_df
else:
    ranking_source = final_df.assign(**{PRED_COL: final_df["Pred_Sklearn"]})

if ranking_source.empty:
    top_candidates = ranking_source.head(0)
else:
    top_candidates = select_top_n_final(ranking_source, pred_col=PRED_COL, n=5)

# Reduce to just what is needed for the DB write
top_out = top_candidates[["CompanyId", "TimeFrame", PRED_COL]]
print(top_out)

# -------------------------
# Table 1: final top candidates
# -------------------------
table_name = "bsf.final_top_candidates"

spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    CompanyId STRING,
    TimeFrame STRING,
    PredictedTomorrowClose DOUBLE,
    run_id INT
)
USING DELTA
""")

# top_out is a pandas DataFrame; it has to be converted to Spark before
# withColumn / write can be used (the old code called them on pandas).
run_id, top_out_df = _append_run(top_out, table_name)
if top_out_df is not None:
    top_out_df.show()  # show() prints by itself and returns None

# -------------------------
# Table 2: combined SARIMAX + PyCaret comparison
# -------------------------
table_name = "bsf.final_top_combined"

# `order` is back-quoted because it is also an SQL keyword.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    CompanyId STRING,
    StockDate DATE,
    Open DOUBLE,
    High DOUBLE,
    Low DOUBLE,
    Close DOUBLE,
    TomorrowClose DOUBLE,
    Return DOUBLE,
    TomorrowReturn DOUBLE,
    Doji BOOLEAN,
    Hammer BOOLEAN,
    InvertedHammer BOOLEAN,
    ShootingStar BOOLEAN,
    BullishEngulfing BOOLEAN,
    BearishEngulfing BOOLEAN,
    PiercingLine BOOLEAN,
    DarkCloudCover BOOLEAN,
    MorningStar BOOLEAN,
    EveningStar BOOLEAN,
    ThreeWhiteSoldiers BOOLEAN,
    ThreeBlackCrows BOOLEAN,
    TweezerTop BOOLEAN,
    TweezerBottom BOOLEAN,
    InsideBar BOOLEAN,
    OutsideBar BOOLEAN,
    MA DOUBLE,
    MA_slope DOUBLE,
    UpTrend_MA BOOLEAN,
    DownTrend_MA BOOLEAN,
    MomentumUp BOOLEAN,
    MomentumDown BOOLEAN,
    ConfirmedUpTrend BOOLEAN,
    ConfirmedDownTrend BOOLEAN,
    RecentReturn DOUBLE,
    UpTrend_Return BOOLEAN,
    DownTrend_Return BOOLEAN,
    Volatility DOUBLE,
    LowVolatility BOOLEAN,
    HighVolatility BOOLEAN,
    ROC DOUBLE,
    MomentumZ DOUBLE,
    SignalStrength INT,
    SignalStrengthHybrid DOUBLE,
    ActionConfidence DOUBLE,
    BullishStrengthHybrid DOUBLE,
    BearishStrengthHybrid DOUBLE,
    SignalDuration DOUBLE,
    ValidAction BOOLEAN,
    HasValidSignal BOOLEAN,
    MomentumAction STRING,
    PatternAction STRING,
    CandleAction STRING,
    CandidateAction STRING,
    Action STRING,
    TomorrowAction STRING,
    TomorrowActionSource STRING,
    BatchId STRING,
    IngestedAt STRING,
    TimeFrame STRING,
    log_Open DOUBLE,
    log_High DOUBLE,
    log_Low DOUBLE,
    log_Close DOUBLE,
    Pred_Linear DOUBLE,
    Pred_Lasso DOUBLE,
    Pred_Ridge DOUBLE,
    PredictedTomorrowClose DOUBLE,
    Pred_SARIMAX DOUBLE,
    `order` STRING,
    seasonal_order STRING,
    Pred_lr_PyCaret_SARIMAX DOUBLE,
    Pred_lasso_PyCaret_SARIMAX DOUBLE,
    Pred_ridge_PyCaret_SARIMAX DOUBLE,
    Pred_en_PyCaret_SARIMAX DOUBLE,
    Pred_lr_PyCaret_PyCaret DOUBLE,
    Pred_lasso_PyCaret_PyCaret DOUBLE,
    Pred_ridge_PyCaret_PyCaret DOUBLE,
    Pred_en_PyCaret_PyCaret DOUBLE,
    run_id INT
)
USING DELTA
""")

run_id, df_with_id = _append_run(top_combined_df, table_name)

# Release the Spark session; Spark-backed objects are unusable after this
# until the session is created again.
spark.stop()