In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, FloatType,
    LongType, TimestampType, IntegerType
)
from pyspark.sql import functions as F, Window
from delta import configure_spark_with_delta_pip
from pathlib import Path
from helpers import build_schema, load_schema_json, paths

# Local Spark session with Delta Lake support.
# The settings below control how Parquet timestamps/dates are written and
# register the Delta SQL extension and catalog on the session.
spark_settings = {
    "spark.sql.parquet.outputTimestampType": "TIMESTAMP_MICROS",
    "spark.sql.parquet.datetimeRebaseModeInWrite": "CORRECTED",
    "spark.sql.parquet.int96RebaseModeInWrite": "CORRECTED",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("TesteLocal").master("local[*]")
for conf_key, conf_value in spark_settings.items():
    builder = builder.config(conf_key, conf_value)

# configure_spark_with_delta_pip receives the builder and returns one from
# which the session is created (or an already running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Quick sanity output: version, application id and the Spark UI address.
print("Spark version:", spark.version)
print(spark.sparkContext.applicationId)
print(spark.sparkContext.uiWebUrl)

In [None]:
# Stopping the session right after creating it breaks every later cell on
# "Restart Kernel -> Run All", because they all use `spark`.
# The stop is therefore opt-in: set the flag to True only when you really
# want to shut the session down (e.g. before rebuilding it with new configs).
STOP_SPARK_SESSION = False
if STOP_SPARK_SESSION:
    spark.stop()

In [None]:

# The notebook is expected to be started from the repository root, so the
# current working directory is used as the anchor for every data path.
repo_root = Path.cwd()
schema_base_dir = repo_root.joinpath("data", "retail_db").as_posix()
print(schema_base_dir)  # e.g. /workspaces/ApacheSpark-CD/data/retail_db

In [None]:
# Lookup from a type name (as a plain string) to the matching Spark SQL type instance.
type_mapping = dict(
    integer=IntegerType(),
    string=StringType(),
    timestamp=TimestampType(),
    float=FloatType(),
)

In [None]:
# Convert every raw retail_db dataset (headerless, comma-separated part files)
# into a Parquet copy and a Delta copy.
#
# `paths`, `load_schema_json` and `build_schema` come from the local `helpers`
# module; presumably `paths(base, "file", pattern)` lists matching files and
# `paths(base, "folder")` lists sub-folders -- confirm against helpers.py.
schema_base_dir = (repo_root/"data"/"retail_db").as_posix()
schema_paths = paths(schema_base_dir, "file", "schemas*")
schema_json = load_schema_json(schema_paths)
ds_list = paths(schema_base_dir, "folder")

for ds_path in ds_list:
    # Only the folder name identifies the dataset; keep the loop variable
    # and the derived name separate instead of overwriting one with the other.
    ds = Path(ds_path).name
    print(f"Processing {ds} data")

    # Skip folders without raw part files before doing any other work.
    files = paths(f"{schema_base_dir}/{ds}", "file", "part-*")
    if not files:
        continue
    print(files)

    # Outputs go to sibling directories of the raw data. Writing Delta into
    # the raw folder itself (data/retail_db/<ds>) would mix Delta's own
    # part-* data files and _delta_log with the raw inputs, so a second run
    # would pick those files up as CSV input.
    output_parq = (Path(f"{schema_base_dir}_parquet")/ds).as_posix()
    output_delta = (Path(f"{schema_base_dir}_delta")/ds).as_posix()

    schema_table = build_schema(ds, schema_json)

    df = (
        spark.read
        .schema(schema_table)
        .option("header", "false")
        .option("sep", ",")
        .option("mode", "PERMISSIVE")
        .csv(files)
    )

    # The same DataFrame is written twice, so cache it to avoid re-reading
    # the CSV files for the second write.
    # TODO: study df.persist(StorageLevel.MEMORY_AND_DISK) as an alternative.
    df.cache()
    try:
        (
            df.write
            .mode("overwrite")      # or "append"
            .format("parquet")
            .save(output_parq)
        )
        print(f"{output_parq} written successfully.")

        (
            df.write
            .mode("overwrite")      # or "append"
            .format("delta")
            .save(output_delta)
        )
        print(f"{output_delta} written successfully.")
    finally:
        # Release the cached data even when one of the writes fails.
        df.unpersist()


In [None]:
from pathlib import Path
import shutil

# Location of the demo Delta table, derived from the repository root instead
# of a machine-specific absolute path.
path = repo_root / "minha_delta_table"
tabela_deltasql = path.as_posix()

# 1) Drop the table from the catalog (if it exists).
spark.sql("DROP TABLE IF EXISTS minha_tabela_delta")

# 2) Delete the physical directory. The table is created with an explicit
#    LOCATION, so dropping it from the catalog does not remove its files.
shutil.rmtree(path, ignore_errors=True)

# 3) Recreate the Delta table from a one-row SELECT.
spark.sql(f"""
CREATE TABLE minha_tabela_delta
USING DELTA
LOCATION '{tabela_deltasql}'
AS
SELECT 'b' as letra, 2 as numero
""")

In [None]:
# Create the demo Delta table only when it is not registered yet.
# A plain CREATE TABLE here fails with "table already exists" whenever the
# previous cell has run, which breaks a top-to-bottom execution.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS minha_tabela_delta
USING DELTA
LOCATION '{tabela_deltasql}'
AS
SELECT 'b' as letra, 2 as numero
""")

# First queries

In [None]:
# Column layout of the NYSE files: (name, Spark type), all nullable.
nyse_fields = [
    ("stock_id",    StringType()),
    ("trans_date",  StringType()),
    ("open_price",  FloatType()),
    ("low_price",   FloatType()),
    ("high_price",  FloatType()),
    ("close_price", FloatType()),
    ("volume",      LongType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in nyse_fields]
)

# Glob over every gzipped text file of the NYSE dataset.
dir_data = repo_root.joinpath("data", "nyse_all/nyse_data/*.txt.gz").as_posix()

# Read the files as comma-separated CSV with a header row and the explicit schema.
df = (
    spark.read
    .schema(schema)
    .options(header=True, sep=",")
    .csv(dir_data)
)
df.show(5)

In [None]:
# Persist the NYSE DataFrame as Parquet, replacing any previous output.
dir_data = repo_root.joinpath("data/nyse_data_parquet").as_posix()
df.write.parquet(dir_data, mode="overwrite")

In [None]:
# Inspect the DataFrame: column -> type mapping, schema tree and row count.
# Only the last expression of a cell is displayed automatically, so the
# dtype mapping has to be printed explicitly or it is silently discarded.
print(dict(df.dtypes))

df.printSchema()

# Last expression of the cell: shown as the cell output.
df.count()

In [None]:
# Number of rows per stock_id.
# To restrict to a single stock, filter first, e.g.
# df.filter(F.col("stock_id") == "ABRN").
records_per_stock = F.count("*").alias("num_records")
count_filter = df.groupBy("stock_id").agg(records_per_stock)

In [None]:
# Per-stock window, rows ordered by trans_date descending.
w = Window.partitionBy("stock_id").orderBy(F.col("trans_date").desc())

# Number the rows of every stock inside its window (1 = first row in that order).
# An optional filter such as F.col("stock_id") == "ABRN" can be applied before the select.
selected_cols = ["stock_id", "trans_date", "close_price"]
row_position = F.row_number().over(w)
count_filter = df.select(*selected_cols).withColumn("num_records", row_position)

In [None]:
# Per-stock window, rows ordered by trans_date descending.
w = Window.partitionBy("stock_id").orderBy(F.col("trans_date").desc())

# Same row numbering as before, restricted to the ABRN stock.
abrn_rows = df.filter(F.col("stock_id") == "ABRN")
abrn_prices = abrn_rows.select("stock_id", "trans_date", "close_price")
count_filter = abrn_prices.withColumn("num_records", F.row_number().over(w))

In [None]:
# Print the extended query plans (logical and physical) for count_filter.
count_filter.explain(extended=True)
