In [None]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType,TimestampType,IntegerType
from pyspark.sql import functions as F,Window
from pathlib import Path
import os
import sys

spark = (
    SparkSession.builder
    .appName("TesteLocal")
    .master("local[*]")
    .config("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
    .getOrCreate()
)




print("Spark version:", spark.version)

print(spark.sparkContext.applicationId)
print(spark.sparkContext.uiWebUrl)

26/01/03 02:08:39 WARN Utils: Your hostname, codespaces-ef10a1 resolves to a loopback address: 127.0.0.1; using 10.0.0.152 instead (on interface eth0)
26/01/03 02:08:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/03 02:08:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 3.5.2
local-1767406121254
http://ee435b6a-2673-4663-964f-1c8e2569230d.internal.cloudapp.net:4040


In [2]:
from pathlib import Path

repo_root = Path.cwd()               
schema_base_dir = (repo_root/"data"/"retail_db").as_posix()
print(schema_base_dir)


/workspaces/ApacheSpark-CD/data/retail_db


In [None]:
import warnings


type_mapping = {
    "integer": IntegerType(),
    "string": StringType(),
    "timestamp": TimestampType(),
    "float": FloatType()
}

def _paths(folder_path,kind: str,pattern:str = "*"):
    
    kind = kind.lower()
    
    if kind not in {"file", "folder"}:
        raise ValueError("Nenhum file/folder path atribuido")
    
    base_path = Path(folder_path)
    
    if kind == "file":
        path = [path.as_posix() for path in base_path.glob(pattern) if path.is_file()]

    else:
        path = [path.as_posix() for path in base_path.glob(pattern) if path.is_dir()]
    
    if not path:
        warnings.warn(f"[WARN] Nenhum {kind} encontrado em {base_path} com pattern='{pattern}'")
    return path

def _load_schema_json(schema_paths) -> dict:

    schema_path = schema_paths[0] if isinstance(schema_paths, (list, tuple)) else schema_paths
    
    with open(schema_path, "r", encoding="utf-8") as f:
        return json.load(f)

def _build_schema(table_name: str, schema_json: dict) -> StructType:
    if table_name not in schema_json:
        raise KeyError(f"Tabela {table_name} n√£o encontrada no JSON de schema")
    fields = sorted(schema_json[table_name], key= lambda col: col["column_position"], reverse = False)
    
    return StructType(
       [ StructField(
            field["column_name"]
            ,type_mapping.get(field["data_type"].lower(),StringType())
            ,True
        )
        for field in fields]
    )

schema_base_dir = (repo_root/"data"/"retail_db").as_posix()
output_dir_parq = f"{schema_base_dir}_parquet"
schema_paths = _paths(schema_base_dir,"file", "schemas*")
schema_json = _load_schema_json(schema_paths)
ds_list = _paths(schema_base_dir,"folder")

for ds in ds_list:
    ds = Path(ds).name
    print(f"Processing {ds} data")
    
    schema_table = _build_schema(ds,schema_json)
    files=_paths(f"{schema_base_dir}/{ds}","file", "part-*")
    if not files:
        continue
    print(files)

    df = (
        spark.read
        .schema(schema_table)
        .option("header", "false")
        .option("sep", ",")
        .option("mode", "PERMISSIVE")
        .csv(files)
    )
    
    output_dir = (Path(output_dir_parq)/ds).as_posix()
    # print(output_dir)
    # df.show(5)
    (
        df.write
        .mode("overwrite")      # ou "append"
        .parquet(output_dir)
    )
    print(f"{output_dir} written successfully.")



Processing departments data
['/workspaces/ApacheSpark-CD/data/retail_db/departments/part-00000']


                                                                                

/workspaces/ApacheSpark-CD/data/retail_db_parquet/departments written successfully.
Processing categories data
['/workspaces/ApacheSpark-CD/data/retail_db/categories/part-00000']
/workspaces/ApacheSpark-CD/data/retail_db_parquet/categories written successfully.
Processing orders data
['/workspaces/ApacheSpark-CD/data/retail_db/orders/part-00000']


                                                                                

/workspaces/ApacheSpark-CD/data/retail_db_parquet/orders written successfully.
Processing customers data
['/workspaces/ApacheSpark-CD/data/retail_db/customers/part-00000']
/workspaces/ApacheSpark-CD/data/retail_db_parquet/customers written successfully.
Processing products data
['/workspaces/ApacheSpark-CD/data/retail_db/products/part-00000']
/workspaces/ApacheSpark-CD/data/retail_db_parquet/products written successfully.
Processing order_items data
['/workspaces/ApacheSpark-CD/data/retail_db/order_items/part-00000']




/workspaces/ApacheSpark-CD/data/retail_db_parquet/order_items written successfully.


                                                                                

26/01/03 02:08:57 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


# primeiras consultas

In [9]:
schema = StructType([
    StructField("stock_id",    StringType(), True),
    StructField("trans_date",  StringType(), True),
    StructField("open_price",  FloatType(),  True),
    StructField("low_price",   FloatType(),  True),
    StructField("high_price",  FloatType(),  True),
    StructField("close_price", FloatType(),  True),
    StructField("volume",      LongType(),   True)
])

dir_data = (repo_root/"data"/"nyse_all/nyse_data/*.txt.gz").as_posix()
df = spark.read.csv(
    dir_data,
    schema=schema,
    header=True,
    sep=","
)
df.show(5)

26/01/03 02:16:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20160101, 41.81, 41.81, 41.81, 41.81, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/NYSE_2016.txt.gz


+--------+----------+----------+---------+----------+-----------+------+
|stock_id|trans_date|open_price|low_price|high_price|close_price|volume|
+--------+----------+----------+---------+----------+-----------+------+
|      AA|  20160101|     29.61|    29.61|     29.61|      29.61|     0|
|     AAC|  20160101|     19.06|    19.06|     19.06|      19.06|     0|
|     AAN|  20160101|     22.39|    22.39|     22.39|      22.39|     0|
|     AAP|  20160101|    150.51|   150.51|    150.51|     150.51|     0|
|     AAT|  20160101|     38.35|    38.35|     38.35|      38.35|     0|
+--------+----------+----------+---------+----------+-----------+------+
only showing top 5 rows



In [10]:
dir_data = (repo_root/"data/nyse_data_parquet").as_posix()
df.write.mode("overwrite").parquet(dir_data)

26/01/03 02:17:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20080101, 36.74, 36.74, 36.74, 36.74, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/NYSE_2008.txt.gz
26/01/03 02:17:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20160101, 41.81, 41.81, 41.81, 41.81, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/NYSE_2016.txt.gz
26/01/03 02:17:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20070101, 34.85, 34.85, 34.85, 34.85, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/

In [None]:
dict(df.dtypes)

df.printSchema()

df.count()

In [None]:
count_filter=(
df
#  df.filter(F.col("stock_id" ) == "ABRN")
 .groupBy("stock_id")
 .agg(F.count("*").alias("num_records"))
)

In [None]:
w = Window.partitionBy("stock_id").orderBy(F.desc("trans_date"))

count_filter = (
    df
    # .filter(F.col("stock_id") == "ABRN")
    .select(
        "stock_id",
        "trans_date",
        "close_price"
    )
    .withColumn("num_records",F.row_number().over(w))
)

In [None]:
w = Window.partitionBy("stock_id").orderBy(F.desc("trans_date"))

count_filter = (
    df
    .filter(F.col("stock_id") == "ABRN")
    .select(
        "stock_id",
        "trans_date",
        "close_price"
    )
    .withColumn("num_records", F.row_number().over(w))
)

In [None]:
count_filter.explain(True)
