In [1]:
from bsf_env import init_spark, init_mariadb_engine,set_spark_verbosity
from pyspark.sql.functions import lit, current_timestamp
import pandas as pd
import numpy as np
from pyspark.sql.types import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from IPython.display import display, HTML
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import joblib
import tempfile
import os

spark = init_spark("bsf_candidates_analysis", log_level="WARN", show_progress=False, enable_ui=True, priority=True)
engine = init_mariadb_engine()

ingest_ts = spark.sql("SELECT current_timestamp()").collect()[0][0]

pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", 200)         # Adjust width for readability
pd.set_option("display.max_rows", 20)       # Show only top 20 rows by default

# Show tables
tables_df = spark.sql("SHOW TABLES IN bsf")
tables_df.show(truncate=False)

# Add row count for each table
for row in tables_df.collect():
    table_name = row['tableName']
    full_name = f"bsf.{table_name}"
    
    try:
        count = spark.table(full_name).count()
    except Exception as e:
        count = f"Error: {e}"
    
    print(f"Table: {full_name} | Rows: {count}")


:: loading settings :: url = jar:file:/home/jupyter/.venv/python3.9_bsf/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jupyter/.ivy2/cache
The jars for the packages stored in: /home/jupyter/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9658b9c4-0eed-46e7-8d14-765de0dfa0da;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0rc1 in spark-list
	found io.delta#delta-storage;3.0.0rc1 in spark-list
	found org.antlr#antlr4-runtime;4.9.3 in spark-list
:: resolution report :: resolve 605ms :: artifacts dl 32ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.0.0rc1 from spark-list in [default]
	io.delta#delta-storage;3.0.0rc1 from spark-list in [default]
	org.antlr#antlr4-runtime;4.9.3 from spark-list in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	

[Spark] Started 'bsf_candidates_analysis' log_level=WARN (effective=WARN), progress=False


25/09/13 20:57:11 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/13 20:57:11 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/13 20:57:11 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/09/13 20:57:12 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/13 20:57:12 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/13 20:57:12 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/09/13 20:57:15 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/09/13 20:57:15 WARN HiveConf: HiveConf of name hive.metastore.client.connect.timeout does not exist
25/09/13 20:57:15 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/09/13 20:57:19 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|bsf      |company                  |false      |
|bsf      |companystockhistory      |false      |
|bsf      |daily_signals            |false      |
|bsf      |daily_signals_allcol     |false      |
|bsf      |daily_signals_last       |false      |
|bsf      |daily_signals_last_allcol|false      |
+---------+-------------------------+-----------+

Table: bsf.company | Rows: 30949
Table: bsf.companystockhistory | Rows: 459399
Table: bsf.daily_signals | Rows: 1377555
Table: bsf.daily_signals_allcol | Rows: 1377555
Table: bsf.daily_signals_last | Rows: 5478
Table: bsf.daily_signals_last_allcol | Rows: 5478


In [10]:
# Get and display default locations with description
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir", "Not set")
delta_base_path = spark.conf.get("spark.delta.basePath", "Not set")
filesource_path = spark.conf.get("spark.sql.filesource.path", "Not set")
nond2rd_path = spark.conf.get("spark.nond2rd.defaultpath", "Not set")

# Print configuration values
print(f"⚡️ spark.sql.warehouse.dir     : {warehouse_dir}")
print(f"⚡️ spark.delta.basePath        : {delta_base_path}")
print(f"⚡️ spark.sql.filesource.path   : {filesource_path}")
print(f"⚡️ spark.nond2rd.defaultpath   : {nond2rd_path}")

# Show all databases in Hive - Using Spark SQL
spark.sql("SHOW DATABASES").show(truncate=False)

# List all databases using Spark catalog - Using Catalog API
db_list = spark.catalog.listDatabases()
         
# Display databases

for db in db_list:
    print(f"ℹ️ Database Name: {db.name}, Location: {db.locationUri}")


⚡️ spark.sql.warehouse.dir     : file:/srv/lakehouse/tables
⚡️ spark.delta.basePath        : /srv/lakehouse/delta
⚡️ spark.sql.filesource.path   : /srv/lakehouse/files
⚡️ spark.nond2rd.defaultpath   : /srv/lakehouse/files
+---------+
|namespace|
+---------+
|bsf      |
|default  |
+---------+

ℹ️ Database Name: bsf, Location: file:/srv/lakehouse/tables/bsf.db
ℹ️ Database Name: default, Location: file:/srv/lakehouse/tables/default.db


In [5]:
# 3️⃣ Approximate size on disk
def get_size(path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

In [12]:
spark.sql("SHOW TABLES IN bsf").show(truncate=False)
spark.sql("SHOW TABLES IN default").show(truncate=False)


+---------+-------------------------+-----------+
|namespace|tableName                |isTemporary|
+---------+-------------------------+-----------+
|bsf      |company                  |false      |
|bsf      |companystockhistory      |false      |
|bsf      |daily_signals            |false      |
|bsf      |daily_signals_allcol     |false      |
|bsf      |daily_signals_last       |false      |
|bsf      |daily_signals_last_allcol|false      |
+---------+-------------------------+-----------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|default  |stores   |false      |
+---------+---------+-----------+



25/09/13 21:10:16 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/09/13 21:10:17 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:978)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [11]:
table_name="bsf.company"

num_rows = spark.sql(f"SELECT COUNT(*) AS cnt FROM {table_name}").collect()[0]['cnt']

# 2️⃣ Table location
desc = spark.sql(f"DESCRIBE FORMATTED {table_name}").collect()
location_row = next(row for row in desc if row['col_name'].strip() == 'Location')
table_location = location_row['data_type'].strip()



size_bytes = get_size(table_location)
size_mb = size_bytes / (1024 * 1024)

print (f"rows: {num_rows}, size_mb: {size_mb}, location: {table_location}")

rows: 30949, size_mb: 0.0, location: file:/srv/lakehouse/tables/bsf.db/company


In [None]:
spark.sql(f"DROP TABLE IF EXISTS {table_name}")