In [1]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, FloatType,
    LongType, TimestampType, IntegerType
)
from pyspark.sql import functions as F, Window
from pathlib import Path
from delta import configure_spark_with_delta_pip
import os
import sys

builder = (
    SparkSession.builder
    .appName("TesteLocal")
    .master("local[*]")
    .config("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark version:", spark.version)
print(spark.sparkContext.applicationId)
print(spark.sparkContext.uiWebUrl)

ModuleNotFoundError: No module named 'pyspark'

In [7]:
spark.stop()

In [2]:
from pathlib import Path

repo_root = Path.cwd()               
schema_base_dir = (repo_root/"data"/"retail_db").as_posix()
print(schema_base_dir) ## /workspaces/ApacheSpark-CD/data/retail_db

/workspaces/ApacheSpark-CD/data/retail_db


In [None]:
import warnings


type_mapping = {
    "integer": IntegerType(),
    "string": StringType(),
    "timestamp": TimestampType(),
    "float": FloatType()
}

def _paths(folder_path,kind: str,pattern:str = "*"):
    """
    Retorna uma lista de caminhos (paths) para arquivos ou pastas dentro de um diretório,
    opcionalmente filtrados por um padrão (glob).

    Parâmetros
    ----------
    folder_path : str ou pathlib.Path
        Caminho da pasta base onde a busca será realizada.
    kind : {"file", "folder"}
        Tipo de caminho a retornar:
        - "file"  : retorna apenas arquivos.
        - "folder": retorna apenas diretórios.
    pattern : str, opcional
        Padrão de busca no estilo glob (por padrão "*").
        Exemplos:
        - "*.csv"   : todos os arquivos CSV
        - "sub_*"   : arquivos ou pastas cujo nome começa com "sub_"
        - "**/*.py" : todos os arquivos .py recursivamente (se usado com rglob)

    Retorna
    -------
    list[str]
        Lista de caminhos em formato POSIX (strings) correspondentes
        ao tipo (`kind`) e padrão informado dentro de `folder_path`.

    Exemplos
    --------
    >>> _paths("dados", kind="file", pattern="*.csv")
    ['dados/tabela1.csv', 'dados/tabela2.csv']

    >>> _paths("projetos", kind="folder", pattern="exp_*")
    ['projetos/exp_01', 'projetos/exp_02']
    """
    kind = kind.lower()
    
    if kind not in {"file", "folder"}:
        raise ValueError("Nenhum file/folder path atribuido")
    
    base_path = Path(folder_path)
    
    if kind == "file":
        path = [path.as_posix() for path in base_path.glob(pattern) if path.is_file()]

    else:
        path = [path.as_posix() for path in base_path.glob(pattern) if path.is_dir()]
    
    if not path:
        warnings.warn(f"[WARN] Nenhum {kind} encontrado em {base_path} com pattern='{pattern}'")
    return path

def _load_schema_json(schema_paths) -> dict:

    schema_path = schema_paths[0] if isinstance(schema_paths, (list)) else schema_paths
    
    with open(schema_path, "r", encoding="utf-8") as f:
        return json.load(f)

def _build_schema(table_name: str, schema_json: dict) -> StructType:
    if table_name not in schema_json:
        raise KeyError(f"Tabela {table_name} não encontrada no JSON de schema")
    fields = sorted(schema_json[table_name], key= lambda col: col["column_position"], reverse = False)
    
    return StructType(
       [ StructField(
            field["column_name"]
            ,type_mapping.get(field["data_type"].lower(),StringType())
            ,True
        )
        for field in fields]
    )


26/01/06 15:57:35 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
schema_base_dir = (repo_root/"data"/"retail_db").as_posix()
output_dir_parq = f"{schema_base_dir}_parquet"
schema_paths = _paths(schema_base_dir,"file", "schemas*")
schema_json = _load_schema_json(schema_paths)
ds_list = _paths(schema_base_dir,"folder")

for ds in ds_list:
    # print(f"Processing {ds}")
    ds = Path(ds).name
    print(f"Processing {ds.capitalize()} data")
    print(f"Processing {ds} data")
    
    schema_table = _build_schema(ds,schema_json)
    files=_paths(f"{schema_base_dir}/{ds}","file", "part-*")
    if not files:
        continue
    print(files)

    df = (
        spark.read
        .schema(schema_table)
        .option("header", "false")
        .option("sep", ",")
        .option("mode", "PERMISSIVE")
        .csv(files)
    )
    
    output_dir = (Path(output_dir_parq)/ds).as_posix()
    # print(output_dir)
    # df.show(5)
    (
        df.write
        .mode("overwrite")      # ou "append"
        .format("delta")
        # .parquet(output_dir)
        .save(output_dir)
    )
    print(f"{output_dir} written successfully.")
    
    



Processing Departments data
Processing departments data
['/workspaces/ApacheSpark-CD/data/retail_db/departments/part-00000']


                                                                                

/workspaces/ApacheSpark-CD/data/retail_db_parquet/departments written successfully.
Processing Categories data
Processing categories data
['/workspaces/ApacheSpark-CD/data/retail_db/categories/part-00000']
/workspaces/ApacheSpark-CD/data/retail_db_parquet/categories written successfully.
Processing Orders data
Processing orders data
['/workspaces/ApacheSpark-CD/data/retail_db/orders/part-00000']


                                                                                

/workspaces/ApacheSpark-CD/data/retail_db_parquet/orders written successfully.
Processing Customers data
Processing customers data
['/workspaces/ApacheSpark-CD/data/retail_db/customers/part-00000']


                                                                                

/workspaces/ApacheSpark-CD/data/retail_db_parquet/customers written successfully.
Processing Products data
Processing products data
['/workspaces/ApacheSpark-CD/data/retail_db/products/part-00000']
/workspaces/ApacheSpark-CD/data/retail_db_parquet/products written successfully.
Processing Order_items data
Processing order_items data
['/workspaces/ApacheSpark-CD/data/retail_db/order_items/part-00000']


                                                                                

/workspaces/ApacheSpark-CD/data/retail_db_parquet/order_items written successfully.


In [29]:
from pathlib import Path
import shutil

tabela_deltasql = "/workspaces/ApacheSpark-CD/minha_delta_table"

# 1) Dropa a tabela do catálogo (se existir)
spark.sql("DROP TABLE IF EXISTS minha_tabela_delta")

# 2) Apaga o diretório físico
path = Path(tabela_deltasql)
shutil.rmtree(path, ignore_errors=True)

# 3) Recria a tabela Delta
spark.sql(f"""
CREATE TABLE minha_tabela_delta
USING DELTA
LOCATION '{tabela_deltasql}'
AS
SELECT 'b' as letra, 2 as numero
""")

Py4JJavaError: An error occurred while calling o40.sql.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Error accessing configuration file
	at java.base/java.util.ServiceLoader.fail(ServiceLoader.java:586)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.parse(ServiceLoader.java:1180)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.nextProviderClass(ServiceLoader.java:1213)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNextService(ServiceLoader.java:1228)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNext(ServiceLoader.java:1273)
	at java.base/java.util.ServiceLoader$2.hasNext(ServiceLoader.java:1309)
	at java.base/java.util.ServiceLoader$3.hasNext(ServiceLoader.java:1393)
	at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:303)
	at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:297)
	at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
	at scala.collection.TraversableLike.filter(TraversableLike.scala:395)
	at scala.collection.TraversableLike.filter$(TraversableLike.scala:395)
	at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:629)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.org$apache$spark$sql$catalyst$analysis$ResolveSessionCatalog$$isV2Provider(ResolveSessionCatalog.scala:605)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog$$anonfun$apply$1.applyOrElse(ResolveSessionCatalog.scala:181)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog$$anonfun$apply$1.applyOrElse(ResolveSessionCatalog.scala:52)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$3(AnalysisHelper.scala:138)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:138)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:134)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:130)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp(AnalysisHelper.scala:111)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp$(AnalysisHelper.scala:110)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.apply(ResolveSessionCatalog.scala:52)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.apply(ResolveSessionCatalog.scala:46)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:226)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:173)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:188)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:209)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:208)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:77)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:77)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at jdk.internal.reflect.GeneratedMethodAccessor111.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.nio.file.NoSuchFileException: /workspaces/ApacheSpark-CD/venv/lib/python3.11/site-packages/pyspark/jars/spark-mllib_2.12-3.5.0.jar
	at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:106)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
	at java.base/sun.nio.fs.UnixFileAttributeViews$Basic.readAttributes(UnixFileAttributeViews.java:55)
	at java.base/sun.nio.fs.UnixFileSystemProvider.readAttributes(UnixFileSystemProvider.java:171)
	at java.base/sun.nio.fs.LinuxFileSystemProvider.readAttributes(LinuxFileSystemProvider.java:99)
	at java.base/java.nio.file.Files.readAttributes(Files.java:1854)
	at java.base/java.util.zip.ZipFile$Source.get(ZipFile.java:1445)
	at java.base/java.util.zip.ZipFile$CleanableResource.<init>(ZipFile.java:724)
	at java.base/java.util.zip.ZipFile.<init>(ZipFile.java:251)
	at java.base/java.util.zip.ZipFile.<init>(ZipFile.java:180)
	at java.base/java.util.jar.JarFile.<init>(JarFile.java:345)
	at java.base/sun.net.www.protocol.jar.URLJarFile.<init>(URLJarFile.java:100)
	at java.base/sun.net.www.protocol.jar.URLJarFile.getJarFile(URLJarFile.java:69)
	at java.base/sun.net.www.protocol.jar.JarFileFactory.get(JarFileFactory.java:168)
	at java.base/sun.net.www.protocol.jar.JarFileFactory.getOrCreate(JarFileFactory.java:91)
	at java.base/sun.net.www.protocol.jar.JarURLConnection.connect(JarURLConnection.java:117)
	at java.base/sun.net.www.protocol.jar.JarURLConnection.getInputStream(JarURLConnection.java:160)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.parse(ServiceLoader.java:1172)
	... 82 more


In [26]:
spark.sql(f"""
CREATE TABLE minha_tabela_delta
USING DELTA
LOCATION '{tabela_deltasql}'
AS
SELECT 'b' as letra, 2 as numero
""")

Py4JJavaError: An error occurred while calling o40.sql.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Error accessing configuration file
	at java.base/java.util.ServiceLoader.fail(ServiceLoader.java:586)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.parse(ServiceLoader.java:1180)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.nextProviderClass(ServiceLoader.java:1213)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNextService(ServiceLoader.java:1228)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNext(ServiceLoader.java:1273)
	at java.base/java.util.ServiceLoader$2.hasNext(ServiceLoader.java:1309)
	at java.base/java.util.ServiceLoader$3.hasNext(ServiceLoader.java:1393)
	at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:303)
	at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:297)
	at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
	at scala.collection.TraversableLike.filter(TraversableLike.scala:395)
	at scala.collection.TraversableLike.filter$(TraversableLike.scala:395)
	at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:629)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.org$apache$spark$sql$catalyst$analysis$ResolveSessionCatalog$$isV2Provider(ResolveSessionCatalog.scala:605)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog$$anonfun$apply$1.applyOrElse(ResolveSessionCatalog.scala:181)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog$$anonfun$apply$1.applyOrElse(ResolveSessionCatalog.scala:52)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$3(AnalysisHelper.scala:138)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:138)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:134)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:130)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp(AnalysisHelper.scala:111)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUp$(AnalysisHelper.scala:110)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.apply(ResolveSessionCatalog.scala:52)
	at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.apply(ResolveSessionCatalog.scala:46)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:226)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:173)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:188)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:209)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:208)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:77)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:77)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at jdk.internal.reflect.GeneratedMethodAccessor111.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.nio.file.NoSuchFileException: /workspaces/ApacheSpark-CD/venv/lib/python3.11/site-packages/pyspark/jars/spark-mllib_2.12-3.5.0.jar
	at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:106)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
	at java.base/sun.nio.fs.UnixFileAttributeViews$Basic.readAttributes(UnixFileAttributeViews.java:55)
	at java.base/sun.nio.fs.UnixFileSystemProvider.readAttributes(UnixFileSystemProvider.java:171)
	at java.base/sun.nio.fs.LinuxFileSystemProvider.readAttributes(LinuxFileSystemProvider.java:99)
	at java.base/java.nio.file.Files.readAttributes(Files.java:1854)
	at java.base/java.util.zip.ZipFile$Source.get(ZipFile.java:1445)
	at java.base/java.util.zip.ZipFile$CleanableResource.<init>(ZipFile.java:724)
	at java.base/java.util.zip.ZipFile.<init>(ZipFile.java:251)
	at java.base/java.util.zip.ZipFile.<init>(ZipFile.java:180)
	at java.base/java.util.jar.JarFile.<init>(JarFile.java:345)
	at java.base/sun.net.www.protocol.jar.URLJarFile.<init>(URLJarFile.java:100)
	at java.base/sun.net.www.protocol.jar.URLJarFile.getJarFile(URLJarFile.java:69)
	at java.base/sun.net.www.protocol.jar.JarFileFactory.get(JarFileFactory.java:168)
	at java.base/sun.net.www.protocol.jar.JarFileFactory.getOrCreate(JarFileFactory.java:91)
	at java.base/sun.net.www.protocol.jar.JarURLConnection.connect(JarURLConnection.java:117)
	at java.base/sun.net.www.protocol.jar.JarURLConnection.getInputStream(JarURLConnection.java:160)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.parse(ServiceLoader.java:1172)
	... 82 more


# primeiras consultas

In [None]:
schema = StructType([
    StructField("stock_id",    StringType(), True),
    StructField("trans_date",  StringType(), True),
    StructField("open_price",  FloatType(),  True),
    StructField("low_price",   FloatType(),  True),
    StructField("high_price",  FloatType(),  True),
    StructField("close_price", FloatType(),  True),
    StructField("volume",      LongType(),   True)
])

dir_data = (repo_root/"data"/"nyse_all/nyse_data/*.txt.gz").as_posix()
df = spark.read.csv(
    dir_data,
    schema=schema,
    header=True,
    sep=","
)
df.show(5)

26/01/06 12:37:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20160101, 41.81, 41.81, 41.81, 41.81, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/NYSE_2016.txt.gz


+--------+----------+----------+---------+----------+-----------+------+
|stock_id|trans_date|open_price|low_price|high_price|close_price|volume|
+--------+----------+----------+---------+----------+-----------+------+
|      AA|  20160101|     29.61|    29.61|     29.61|      29.61|     0|
|     AAC|  20160101|     19.06|    19.06|     19.06|      19.06|     0|
|     AAN|  20160101|     22.39|    22.39|     22.39|      22.39|     0|
|     AAP|  20160101|    150.51|   150.51|    150.51|     150.51|     0|
|     AAT|  20160101|     38.35|    38.35|     38.35|      38.35|     0|
+--------+----------+----------+---------+----------+-----------+------+
only showing top 5 rows



In [6]:
dir_data = (repo_root/"data/nyse_data_parquet").as_posix()
df.write.mode("overwrite").parquet(dir_data)

26/01/06 12:37:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20080101, 36.74, 36.74, 36.74, 36.74, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/NYSE_2008.txt.gz
26/01/06 12:37:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20160101, 41.81, 41.81, 41.81, 41.81, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/NYSE_2016.txt.gz
26/01/06 12:37:56 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: A, 20070101, 34.85, 34.85, 34.85, 34.85, 0
 Schema: stock_id, trans_date, open_price, low_price, high_price, close_price, volume
Expected: stock_id but found: A
CSV file: file:///workspaces/ApacheSpark-CD/data/nyse_all/nyse_data/

In [8]:
dict(df.dtypes)

df.printSchema()

df.count()

root
 |-- stock_id: string (nullable = true)
 |-- trans_date: string (nullable = true)
 |-- open_price: float (nullable = true)
 |-- low_price: float (nullable = true)
 |-- high_price: float (nullable = true)
 |-- close_price: float (nullable = true)
 |-- volume: long (nullable = true)



                                                                                

9384718

In [None]:
count_filter=(
df
#  df.filter(F.col("stock_id" ) == "ABRN")
 .groupBy("stock_id")
 .agg(F.count("*").alias("num_records"))
)

In [None]:
w = Window.partitionBy("stock_id").orderBy(F.desc("trans_date"))

count_filter = (
    df
    # .filter(F.col("stock_id") == "ABRN")
    .select(
        "stock_id",
        "trans_date",
        "close_price"
    )
    .withColumn("num_records",F.row_number().over(w))
)

In [None]:
w = Window.partitionBy("stock_id").orderBy(F.desc("trans_date"))

count_filter = (
    df
    .filter(F.col("stock_id") == "ABRN")
    .select(
        "stock_id",
        "trans_date",
        "close_price"
    )
    .withColumn("num_records", F.row_number().over(w))
)

In [None]:
count_filter.explain(True)


== Parsed Logical Plan ==
'Project [stock_id#354, trans_date#355, close_price#359, row_number() windowspecdefinition('stock_id, 'trans_date DESC NULLS LAST, unspecifiedframe$()) AS num_records#455]
+- Project [stock_id#354, trans_date#355, close_price#359]
   +- Filter (stock_id#354 = ABRN)
      +- Relation [stock_id#354,trans_date#355,open_price#356,low_price#357,high_price#358,close_price#359,volume#360L] csv

== Analyzed Logical Plan ==
stock_id: string, trans_date: string, close_price: float, num_records: int
Project [stock_id#354, trans_date#355, close_price#359, num_records#455]
+- Project [stock_id#354, trans_date#355, close_price#359, num_records#455, num_records#455]
   +- Window [row_number() windowspecdefinition(stock_id#354, trans_date#355 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS num_records#455], [stock_id#354], [trans_date#355 DESC NULLS LAST]
      +- Project [stock_id#354, trans_date#355, close_price#359]
         +- Pro