In [1]:
import pandas as pd

In [2]:
!pip install pyspark




[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




## Customers

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("teste").getOrCreate()

spark.sparkContext.setLogLevel("WARN")

df = (
    spark
    .read
    .format("csv")
    .options(header='true', inferSchema='true', delimiter=",")
    .load("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\olist_customers_dataset.csv")
)

df.printSchema()

(
    df
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\")
)

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [4]:
# Import data:
df = spark.read.parquet('C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\')

# Clean data:
df = df.na.drop()

# Select important features from data
columns = ["customer_id","customer_unique_id","customer_zip_code_prefix","customer_state"]
customers = df.select(*columns)

customers.show()

+--------------------+--------------------+------------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|customer_state|
+--------------------+--------------------+------------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            SP|
|879864dab9bc30475...|4c93744516667ad3b...|                   89254|            SC|
|fd826e7cf63160e53...|addec96d2e059c80c...|                    4534|            SP|
|5e274e7a0c3809e14...|57b2a98a409812fe9...|                   35182|            MG|
|5adf08e34b2e99398...|1175e95fb47ddff9d...|                   81560|        

## Order Payments

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("teste").getOrCreate()

spark.sparkContext.setLogLevel("WARN")

df = (
    spark
    .read
    .format("csv")
    .options(header='true', inferSchema='true', delimiter=",")
    .load("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\olist_order_payments_dataset.csv")
)

df.printSchema()

(
    df
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\")
)

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [6]:
# Import data:
df = spark.read.parquet('C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\')

# Clean data:
df = df.na.drop()

# Select important features from data
columns = [
    "order_id",
    "payment_type",
    "payment_value"
]
payments = df.select(*columns)

(
    payments
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\payments\\")
)

payments.show()

+--------------------+------------+-------------+
|            order_id|payment_type|payment_value|
+--------------------+------------+-------------+
|b81ef226f3fe1789b...| credit_card|        99.33|
|a9810da82917af2d9...| credit_card|        24.39|
|25e8ea4e93396b6fa...| credit_card|        65.71|
|ba78997921bbcdc13...| credit_card|       107.78|
|42fdf880ba16b47b5...| credit_card|       128.45|
|298fcdf1f73eb413e...| credit_card|        96.12|
|771ee386b001f0620...| credit_card|        81.16|
|3d7239c394a212faa...| credit_card|        51.84|
|1f78449c87a54faf9...| credit_card|       341.09|
|0573b5e23cbd79800...|      boleto|        51.95|
|d88e0d5fa41661ce0...| credit_card|       188.73|
|2480f727e869fdeb3...| credit_card|        141.9|
|616105c9352a9668c...| credit_card|        75.78|
|cf95215a722f3ebf2...| credit_card|       102.66|
|769214176682788a9...| credit_card|       105.28|
|12e5cfe0e4716b59a...| credit_card|       157.45|
|61059985a6fc0ad64...| credit_card|       132.04|


## Order Items

In [15]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("teste").getOrCreate()

spark.sparkContext.setLogLevel("WARN")

df = (
    spark
    .read
    .format("csv")
    .options(header='true', inferSchema='true', delimiter=",")
    .load("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\olist_order_items_dataset.csv")
)

df.printSchema()

(
    df
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\")
)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [17]:
# Import data:
df = spark.read.parquet('C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\')

# Clean data:
df = df.na.drop()

# Select important features from data
columns = [
    "order_id",
    "product_id",
    "price",
    "freight_value"
]
df = df.select(*columns)

# Total Price:
items = df.select(
    col("order_id"),
    col("product_id"),
    col("price"),
    col("freight_value"),
    ((col("price") + col("freight_value"))).alias("price_total")
)

(
    items
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\items\\")
)

items.show()

+--------------------+--------------------+------+-------------+------------------+
|            order_id|          product_id| price|freight_value|       price_total|
+--------------------+--------------------+------+-------------+------------------+
|00010242fe8c5a6d1...|4244733e06e7ecb49...|  58.9|        13.29|             72.19|
|00018f77f2f0320c5...|e5f2d52b802189ee6...| 239.9|        19.93|            259.83|
|000229ec398224ef6...|c777355d18b72b67a...| 199.0|        17.87|            216.87|
|00024acbcdf0a6daa...|7634da152a4610f15...| 12.99|        12.79|             25.78|
|00042b26cf59d7ce6...|ac6c3623068f30de0...| 199.9|        18.14|218.04000000000002|
|00048cc3ae777c65d...|ef92defde845ab845...|  21.9|        12.69|34.589999999999996|
|00054e8431b9d7675...|8d4f2bb7e93e6710a...|  19.9|        11.85|             31.75|
|000576fe39319847c...|557d850972a7d6f79...| 810.0|        70.75|            880.75|
|0005a1a1728c9d785...|310ae3c140ff94b03...|145.95|        11.65|            

## Order Reviews

In [9]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("teste").getOrCreate()

spark.sparkContext.setLogLevel("WARN")

df = (
    spark
    .read
    .format("csv")
    .options(header='true', inferSchema='true', delimiter=",")
    .load("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\olist_order_reviews_dataset.csv")
)

df.printSchema()

(
    df
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\")
)

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: string (nullable = true)
 |-- review_answer_timestamp: string (nullable = true)



In [10]:
# Import data:
df = spark.read.parquet('C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\')

# Clean data:
df = df.na.drop()

# Select important features from data
columns = [
    "review_id",
    "order_id",
    "review_score"
]

reviews = df.select(*columns)

reviews.show()

+--------------------+--------------------+------------+
|           review_id|            order_id|review_score|
+--------------------+--------------------+------------+
|8670d52e15e00043a...|b9bf720beb4ab3728...|           4|
|3948b09f7c818e2d8...|e51478e7e277a8374...|           5|
|373cbeecea8286a2b...|583174fbe37d3d5f0...|           1|
|d21bbc789670eab77...|4fc44d78867142c62...|           5|
|c92cdd7dd544a01aa...|37e7875cdce5a9e5b...|           4|
|08c9d79ec0eba1d25...|e029f708df3cc108b...|           5|
|b193ff3c9f32a01f3...|e2e6ee1ed2d7f2f36...|           5|
|86c5cfa7fcbde303f...|a6456e781cb962cc3...|           5|
|500c05500aa275953...|8a9424899aac432d8...|           5|
|109b5ce2dd11bb846...|25362fbf6aac4b01a...|           5|
|c45811d9f90e22a81...|491f193fc52075598...|           5|
|50a1eaa2f96d6f3e0...|4a7cf245701068d38...|           5|
|1692078634b63c7f2...|5bc4e94aef2841f39...|           5|
|46d8249ea59101c72...|f25ddb6cd62d720a5...|           3|
|79927442ebcbf70b2...|1c8898140

## Order Dataset

In [11]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("teste").getOrCreate()

spark.sparkContext.setLogLevel("WARN")

df = (
    spark
    .read
    .format("csv")
    .options(header='true', inferSchema='true', delimiter=",")
    .load("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\olist_orders_dataset.csv")
)

df.printSchema()

(
    df
    .write
    .mode("overwrite")
    .format("parquet")
    .save("C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\")
)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [12]:
# Import data:
df = spark.read.parquet('C:\\Users\\Gabriel\\Desktop\\backup\\Repositorios\\MBA_project\\data\\olist\\archive\\parquet\\')

# Clean data:
df = df.na.drop()

# Select important features from data
columns = [
    "order_id",
    "customer_id",
    "order_status"
]

dataset = df.select(*columns)

dataset.show()

+--------------------+--------------------+------------+
|            order_id|         customer_id|order_status|
+--------------------+--------------------+------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|   delivered|
|949d5b44dbf5de918...|f88197465ea7920ad...|   delivered|
|ad21c59c0840e6cb8...|8ab97904e6daea886...|   delivered|
|a4591c265e18cb1dc...|503740e9ca751ccdd...|   delivered|
|6514b8ad8028c9f2c...|9bdf08b4b3b52b552...|   delivered|
|76c6e866289321a7c...|f54a9f0e6b351c431...|   delivered|
|e69bfb5eb88e0ed6a...|31ad1d1b63eb99624...|   delivered|
|e6ce16cb79ec1d90b...|494dded5b201313c6...|   delivered|
|34513ce0c4fab462a...|7711cf624183d843a...|   delivered|
|82566a660a982b15f...|d3e3b74c766bc6214...|   delivered|
|5ff96c15d0b717ac6...|19402a48fe860416a...|   delivered|
|432aaf21d85167c2c...|3df704f53d3f1d481...|   delivered|
|dcb36b511fcac050b...|3b6828a50

## Agregação

In [18]:
agg1 = items.join(payments , on=['order_id'] , how = 'left').show()

Py4JJavaError: An error occurred while calling o248.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 33.0 failed 1 times, most recent failure: Lost task 0.0 in stage 33.0 (TID 33) (host.docker.internal executor driver): java.io.FileNotFoundException: 
File file:/C:/Users/Gabriel/Desktop/backup/Repositorios/MBA_project/data/olist/archive/parquet/part-00000-b9808a43-ee95-47e0-99f4-d9608ab6f61b-c000.snappy.parquet does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:648)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:431)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:137)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:191)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.FileNotFoundException: 
File file:/C:/Users/Gabriel/Desktop/backup/Repositorios/MBA_project/data/olist/archive/parquet/part-00000-b9808a43-ee95-47e0-99f4-d9608ab6f61b-c000.snappy.parquet does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:648)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	... 3 more


In [14]:
items.show()

Py4JJavaError: An error occurred while calling o117.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 26.0 failed 1 times, most recent failure: Lost task 0.0 in stage 26.0 (TID 26) (host.docker.internal executor driver): java.io.FileNotFoundException: 
File file:/C:/Users/Gabriel/Desktop/backup/Repositorios/MBA_project/data/olist/archive/parquet/part-00000-c1ac2688-871c-451e-817b-3b0beb5c663e-c000.snappy.parquet does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:648)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
	at java.base/java.lang.reflect.Method.invoke(Method.java:577)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.FileNotFoundException: 
File file:/C:/Users/Gabriel/Desktop/backup/Repositorios/MBA_project/data/olist/archive/parquet/part-00000-c1ac2688-871c-451e-817b-3b0beb5c663e-c000.snappy.parquet does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:648)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
