In [522]:
import pyspark

In [523]:
from pyspark.sql import SparkSession

In [2]:
sc = pyspark.SparkContext("local[*]")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/04 19:56:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [524]:
spark = SparkSession.builder\
      .master("local[1]")\
      .appName("exam_solutions")\
      .getOrCreate() 

In [180]:
rdd = spark.sparkContext.textFile("data.txt")

In [183]:
rdd.first()

data.txt MapPartitionsRDD[414] at textFile at NativeMethodAccessorImpl.java:0

In [525]:
# TransactionID:Int | Timestamp:Datetime | UserID:String | CompanySymbol:String | Volume:Int| Action:String
transactions = [(0, '1991-04-01','S0','Smith',20,'Buy'),
  (1, '2000-05-19','F1','Ford',100,'Sell'),
  (2, '2020-09-05', 'HAL9000', 'Hal', 98, 'Sell'),
  (3, '1978-09-05','W2','Williams',12,'Buy'),
  (4, '1967-12-01','J3','Jones',27,'Buy'),
  (5, '2020-09-05','B4','Brown',160,'Sell'),
  (6, '2020-03-05', 'HAL9000', 'Hal', 134, 'Buy'),
  (7, '2020-09-05', 'J7', 'John', 27, 'Sell'),
  (8, '2020-09-05', 'J8', 'John', 16, 'Sell'),
  (9, '2020-09-05','B5','Brown',16,'Sell'),
  (10, '2020-09-05','B6','Brown',198,'Sell'),
  (11, '2020-09-05', 'HAL9090', 'Hal', 1234, 'Buy'),
]

transactions_columns = ["TransactionID", "Timestamp","UserID","CompanySymbol","Volume","Action"]
df_trans = spark.createDataFrame(data=transactions, schema = transactions_columns)

In [526]:
df_trans.show()

+-------------+----------+-------+-------------+------+------+
|TransactionID| Timestamp| UserID|CompanySymbol|Volume|Action|
+-------------+----------+-------+-------------+------+------+
|            0|1991-04-01|     S0|        Smith|    20|   Buy|
|            1|2000-05-19|     F1|         Ford|   100|  Sell|
|            2|2020-09-05|HAL9000|          Hal|    98|  Sell|
|            3|1978-09-05|     W2|     Williams|    12|   Buy|
|            4|1967-12-01|     J3|        Jones|    27|   Buy|
|            5|2020-09-05|     B4|        Brown|   160|  Sell|
|            6|2020-03-05|HAL9000|          Hal|   134|   Buy|
|            7|2020-09-05|     J7|         John|    27|  Sell|
|            8|2020-09-05|     J8|         John|    16|  Sell|
|            9|2020-09-05|     B5|        Brown|    16|  Sell|
|           10|2020-09-05|     B6|        Brown|   198|  Sell|
|           11|2020-09-05|HAL9090|          Hal|  1234|   Buy|
+-------------+----------+-------+-------------+------+

In [527]:
# CompanySymbol:String | Timestamp:Datetime | ValuePerUnit_EURO: Float
prices = [('Smith', '2016-02-01', 12.12),
         ('Ford', '2022-02-02', 22.22),
         ('Williams', '2022-02-01', 43.43),
         ('Jones', '2022-02-04', 35.35),
         ('Brown', '2009-02-03', 56.56)
]

prices_columns = ["CompanySymbol", "Timestamp", "ValuePerUnit_EURO"]
df_prices = spark.createDataFrame(data = prices, schema = prices_columns)

In [528]:
df_prices.show()

+-------------+----------+-----------------+
|CompanySymbol| Timestamp|ValuePerUnit_EURO|
+-------------+----------+-----------------+
|        Smith|2016-02-01|            12.12|
|         Ford|2022-02-02|            22.22|
|     Williams|2022-02-01|            43.43|
|        Jones|2022-02-04|            35.35|
|        Brown|2009-02-03|            56.56|
+-------------+----------+-----------------+



In [None]:
# 1. Total Number of transactions

In [529]:
df_trans.count()

12

In [None]:
# 2. Number of Transactions done by the user “HAL9000”

In [530]:
import pyspark.sql.functions as f

In [531]:
df_trans.filter(f.col('UserID') == 'HAL9000').count()

2

In [None]:
# 3. Number of transactions per day

In [532]:
rdd_trans = df_trans.rdd

In [533]:
rdd_trans.map(lambda x: (x.Timestamp, 1)).reduceByKey(lambda x, y: x + y).collect()

[('1991-04-01', 1),
 ('2000-05-19', 1),
 ('2020-09-05', 7),
 ('1978-09-05', 1),
 ('1967-12-01', 1),
 ('2020-03-05', 1)]

In [46]:
# 4. Average Daily Transactions per company
# (i..e, On average, how much transaction each company does every day)
# during the week 42 of 2021

In [48]:
# 1. filter week 42
# 2. map to pairs (company, coeff_per_day)
# 3. count and divide by num_days_per_working_week <=> 5

In [534]:
#total_trans_num_week42 = df_trans.filter(f.col('Timestamp') == '2020-09-05')\
df_week42 = df_trans.filter(f.col('Timestamp') == '2020-09-05')
rdd_week42 = df_week42.rdd
total_trans_num_week42 = rdd_week42.map(lambda x: (x.CompanySymbol, 1))\
  .reduceByKey(lambda x, y: (x + y) / 5)

In [535]:
total_trans_num_week42.collect()

[('Hal', 0.4), ('Brown', 0.27999999999999997), ('John', 0.4)]

In [536]:
total_trans_num_week42.count()

3

In [119]:
# 5. Total Amount of Euro spent by each user

In [None]:
# 1. join two tables
# 2. calculate total sum with price and value
# 2. filter current prices
# 3. calculate sum for each user taking into account Action

In [537]:
rdd_prices = df_prices.rdd

In [538]:
transactions_with_prices_rdd = rdd_trans.map(lambda x: (x.CompanySymbol, x))\
    .join(rdd_prices.map(lambda x: (x.CompanySymbol, x)))

In [539]:
transactions_with_prices_rdd.first()

('Smith',
 (Row(TransactionID=0, Timestamp='1991-04-01', UserID='S0', CompanySymbol='Smith', Volume=20, Action='Buy'),
  Row(CompanySymbol='Smith', Timestamp='2016-02-01', ValuePerUnit_EURO=12.12)))

In [540]:
def createNewTable(x):
    #print('x[0]:', x[0])
    #print('----------')
    #print('x[1]:', x[1])
    #print('----------')
    #print('x[1][0]: ', x[1][0])
    #print('x[1][1]: ', x[1][1])
    ts_0 = int(x[1][0].Timestamp[:4])
    ts_1 = int(x[1][1].Timestamp[:4])
    return {"CompanySymbol":x[0],
            "UserID":x[1][0].UserID,
            "ID":x[1][0].TransactionID,
            "Action":x[1][0].Action,
            "Volumes":x[1][0].Volume,
            "pricePerUnit":x[1][1].ValuePerUnit_EURO,
            "moneyValue":x[1][0].Volume * x[1][1].ValuePerUnit_EURO,
            "ts":ts_0,
            "deltaTs":ts_1 - ts_0
           }

In [541]:
# PickUpErrorRDD.map(lambda row: (row.get("McID"),row.get("TimeStamp"))).reduceByKey(lambda valLeft,valRight: max(valLeft, valRight)).map(lambda x: {"lastTS":x[1],"McID":x[0]}).collect()
# ["TransactionID", "Timestamp","UserID","CompanySymbol","Volume","Action"]
# ["CompanySymbol", "Timestamp", "ValuePerUnit_EURO"]
transactions_with_prices_rdd = transactions_with_prices_rdd.map(lambda x: createNewTable(x))

In [542]:
transactions_with_prices_rdd.collect()

[{'CompanySymbol': 'Smith',
  'UserID': 'S0',
  'ID': 0,
  'Action': 'Buy',
  'Volumes': 20,
  'pricePerUnit': 12.12,
  'moneyValue': 242.39999999999998,
  'ts': 1991,
  'deltaTs': 25},
 {'CompanySymbol': 'Ford',
  'UserID': 'F1',
  'ID': 1,
  'Action': 'Sell',
  'Volumes': 100,
  'pricePerUnit': 22.22,
  'moneyValue': 2222.0,
  'ts': 2000,
  'deltaTs': 22},
 {'CompanySymbol': 'Williams',
  'UserID': 'W2',
  'ID': 3,
  'Action': 'Buy',
  'Volumes': 12,
  'pricePerUnit': 43.43,
  'moneyValue': 521.16,
  'ts': 1978,
  'deltaTs': 44},
 {'CompanySymbol': 'Jones',
  'UserID': 'J3',
  'ID': 4,
  'Action': 'Buy',
  'Volumes': 27,
  'pricePerUnit': 35.35,
  'moneyValue': 954.45,
  'ts': 1967,
  'deltaTs': 55},
 {'CompanySymbol': 'Brown',
  'UserID': 'B4',
  'ID': 5,
  'Action': 'Sell',
  'Volumes': 160,
  'pricePerUnit': 56.56,
  'moneyValue': 9049.6,
  'ts': 2020,
  'deltaTs': -11},
 {'CompanySymbol': 'Brown',
  'UserID': 'B5',
  'ID': 9,
  'Action': 'Sell',
  'Volumes': 16,
  'pricePerUnit':

In [383]:
# x[0] == schema
# x[1] == table1 || table2
# x[1][0] == table1
# x[1][1] == table2

In [543]:
transactions_with_prices_rdd_filtered = transactions_with_prices_rdd.filter(lambda x: x.get("deltaTs") > 0)\
    .map(lambda x: (x.get("ID"),(x)))\
    .reduceByKey(lambda x, y: x if x.get("deltaTs") < y.get("deltaTs") else y)\
    .map(lambda x: x[1])

In [544]:
transactions_with_prices_rdd_filtered.collect()

[{'CompanySymbol': 'Smith',
  'UserID': 'S0',
  'ID': 0,
  'Action': 'Buy',
  'Volumes': 20,
  'pricePerUnit': 12.12,
  'moneyValue': 242.39999999999998,
  'ts': 1991,
  'deltaTs': 25},
 {'CompanySymbol': 'Jones',
  'UserID': 'J3',
  'ID': 4,
  'Action': 'Buy',
  'Volumes': 27,
  'pricePerUnit': 35.35,
  'moneyValue': 954.45,
  'ts': 1967,
  'deltaTs': 55},
 {'CompanySymbol': 'Ford',
  'UserID': 'F1',
  'ID': 1,
  'Action': 'Sell',
  'Volumes': 100,
  'pricePerUnit': 22.22,
  'moneyValue': 2222.0,
  'ts': 2000,
  'deltaTs': 22},
 {'CompanySymbol': 'Williams',
  'UserID': 'W2',
  'ID': 3,
  'Action': 'Buy',
  'Volumes': 12,
  'pricePerUnit': 43.43,
  'moneyValue': 521.16,
  'ts': 1978,
  'deltaTs': 44}]

In [545]:
spent_by_user = transactions_with_prices_rdd_filtered.filter(lambda x: x.get("Action") == 'Buy')\
    .map(lambda x: (x.get("UserID"), x.get("moneyValue")))\
    .reduceByKey(lambda x, y: x + y)

In [546]:
spent_by_user.collect()

[('S0', 242.39999999999998), ('W2', 521.16), ('J3', 954.45)]

In [None]:
# 6. Value of the portfolio (the value of stocks currently hold) of each user

In [575]:
amount_of_stocks_rdd = rdd_trans.map(lambda x: ((x.UserID, x.CompanySymbol), x.Volume if x.Action == 'Buy' else -x.Volume))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda x: (x[0][1], x[0][0], x[1]))

In [599]:
amount_of_stocks_rdd.collect()

[('Smith', 'S0', 20),
 ('Ford', 'F1', -100),
 ('Hal', 'HAL9000', 36),
 ('Williams', 'W2', 12),
 ('Jones', 'J3', 27),
 ('Brown', 'B4', -160),
 ('John', 'J7', -27),
 ('John', 'J8', -16),
 ('Brown', 'B5', -16),
 ('Brown', 'B6', -198),
 ('Hal', 'HAL9090', 1234)]

In [600]:
last_value_rdd = rdd_prices.map(lambda x: (x.CompanySymbol, x))\
    .reduceByKey(lambda x, y: x if x.get("Timestamp") > y.get("Timestamp") else y)

In [601]:
last_value_rdd.collect()

[('Smith',
  Row(CompanySymbol='Smith', Timestamp='2016-02-01', ValuePerUnit_EURO=12.12)),
 ('Ford',
  Row(CompanySymbol='Ford', Timestamp='2022-02-02', ValuePerUnit_EURO=22.22)),
 ('Williams',
  Row(CompanySymbol='Williams', Timestamp='2022-02-01', ValuePerUnit_EURO=43.43)),
 ('Jones',
  Row(CompanySymbol='Jones', Timestamp='2022-02-04', ValuePerUnit_EURO=35.35)),
 ('Brown',
  Row(CompanySymbol='Brown', Timestamp='2009-02-03', ValuePerUnit_EURO=56.56))]

In [619]:
current_portfolio_rdd = amount_of_stocks_rdd.join(last_value_rdd.map(lambda x: {"ValuePerUnit_EURO": x[1][0].ValuePerUnit_EURO,
"Value":x[1][0][1]*x[1][1].get("Price")})\
.map(lambda x: x.get("UserId",x.get("Value")))\
.reduceByKey(lambda x,y:x+y))

In [620]:
current_portfolio_rdd.collect()

22/02/05 20:54:36 ERROR Executor: Exception in task 0.0 in stage 505.0 (TID 457)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2146, in combineLocally
    merger.mergeValues(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
    for k, v in iterator:
  File "/usr/local/spark/pytho

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 505.0 failed 1 times, most recent failure: Lost task 0.0 in stage 505.0 (TID 457) (5b061ce0cf4c executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2146, in combineLocally
    merger.mergeValues(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
    for k, v in iterator:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_30/1718959182.py", line 1, in <lambda>
AttributeError: 'str' object has no attribute 'ValuePerUnit_EURO'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor92.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2146, in combineLocally
    merger.mergeValues(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
    for k, v in iterator:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_30/1718959182.py", line 1, in <lambda>
AttributeError: 'str' object has no attribute 'ValuePerUnit_EURO'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [621]:
current_portfolio_rdd.collect()

22/02/05 20:55:00 ERROR Executor: Exception in task 0.0 in stage 510.0 (TID 458)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2146, in combineLocally
    merger.mergeValues(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
    for k, v in iterator:
  File "/usr/local/spark/pytho

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 510.0 failed 1 times, most recent failure: Lost task 0.0 in stage 510.0 (TID 458) (5b061ce0cf4c executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2146, in combineLocally
    merger.mergeValues(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
    for k, v in iterator:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_30/1718959182.py", line 1, in <lambda>
AttributeError: 'str' object has no attribute 'ValuePerUnit_EURO'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor92.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 609, in process
    out_iter = func(split_index, iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 417, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2146, in combineLocally
    merger.mergeValues(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
    for k, v in iterator:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 74, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_30/1718959182.py", line 1, in <lambda>
AttributeError: 'str' object has no attribute 'ValuePerUnit_EURO'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:545)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:703)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:685)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:498)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
	at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
