In [177]:
import pyspark

In [178]:
from pyspark.sql import SparkSession

In [2]:
sc = pyspark.SparkContext("local[*]")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/04 19:56:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [184]:
spark = SparkSession.builder\
      .master("local[1]")\
      .appName("exam_solutions")\
      .getOrCreate() 

In [180]:
rdd = spark.sparkContext.textFile("data.txt")

In [183]:
rdd.first()

data.txt MapPartitionsRDD[414] at textFile at NativeMethodAccessorImpl.java:0

In [185]:
# TransactionID:Int | Timestamp:Datetime | UserID:String | CompanySymbol:String | Volume:Int| Action:String
transactions = [(0, '1991-04-01','S0','Smith',20,'Buy'),
  (1, '2000-05-19','F1','Ford',100,'Sell'),
  (2, '2020-09-05', 'HAL9000', 'Hal', 98, 'Sell'),
  (3, '1978-09-05','W2','Williams',12,'Buy'),
  (4, '1967-12-01','J3','Jones',27,'Buy'),
  (5, '2020-09-05','B4','Brown',160,'Sell'),
  (6, '2020-03-05', 'HAL9000', 'Hal', 134, 'Buy'),
  (7, '2020-09-05', 'J7', 'John', 27, 'Sell'),
  (8, '2020-09-05', 'J8', 'John', 16, 'Sell'),
  (9, '2020-09-05','B5','Brown',16,'Sell'),
  (10, '2020-09-05','B6','Brown',198,'Sell'),
  (11, '2020-09-05', 'HAL9090', 'Hal', 1234, 'Buy'),
]

transactions_columns = ["TransactionID", "Timestamp","UserID","CompanySymbol","Volume","Action"]
df_trans = spark.createDataFrame(data=transactions, schema = transactions_columns)

In [186]:
df_trans.show()

+-------------+----------+-------+-------------+------+------+
|TransactionID| Timestamp| UserID|CompanySymbol|Volume|Action|
+-------------+----------+-------+-------------+------+------+
|            0|1991-04-01|     S0|        Smith|    20|   Buy|
|            1|2000-05-19|     F1|         Ford|   100|  Sell|
|            2|2020-09-05|HAL9000|          Hal|    98|  Sell|
|            3|1978-09-05|     W2|     Williams|    12|   Buy|
|            4|1967-12-01|     J3|        Jones|    27|   Buy|
|            5|2020-09-05|     B4|        Brown|   160|  Sell|
|            6|2020-03-05|HAL9000|          Hal|   134|   Buy|
|            7|2020-09-05|     J7|         John|    27|  Sell|
|            8|2020-09-05|     J8|         John|    16|  Sell|
|            9|2020-09-05|     B5|        Brown|    16|  Sell|
|           10|2020-09-05|     B6|        Brown|   198|  Sell|
|           11|2020-09-05|HAL9090|          Hal|  1234|   Buy|
+-------------+----------+-------+-------------+------+

In [187]:
# CompanySymbol:String | Timestamp:Datetime | ValuePerUnit_EURO: Float
prices = [('Smith', '2022-02-01', 12.12),
         ('Ford', '2022-02-02', 22.22),
         ('Williams', '2022-02-01', 43.43),
         ('Jones', '2022-02-04', 35.35),
         ('Brown', '2022-02-03', 56.56)
]

prices_columns = ["CompanySymbol", "Timestamp", "ValuePerUnit_EURO"]
df_prices = spark.createDataFrame(data = prices, schema = prices_columns)

In [188]:
df_prices.show()

+-------------+----------+-----------------+
|CompanySymbol| Timestamp|ValuePerUnit_EURO|
+-------------+----------+-----------------+
|        Smith|2022-02-01|            12.12|
|         Ford|2022-02-02|            22.22|
|     Williams|2022-02-01|            43.43|
|        Jones|2022-02-04|            35.35|
|        Brown|2022-02-03|            56.56|
+-------------+----------+-----------------+



In [None]:
# 1. Total Number of transactions

In [189]:
df_trans.count()

12

In [None]:
# 2. Number of Transactions done by the user “HAL9000”

In [190]:
import pyspark.sql.functions as f

In [212]:
df_trans.filter(f.col('UserID') == 'HAL9000').count()

2

In [None]:
# 3. Number of transactions per day

In [213]:
rdd_trans = df_trans.rdd

In [214]:
rdd_trans.map(lambda x: (x.Timestamp, 1)).reduceByKey(lambda x, y: x + y).collect()

[('1991-04-01', 1),
 ('2000-05-19', 1),
 ('2020-09-05', 7),
 ('1978-09-05', 1),
 ('1967-12-01', 1),
 ('2020-03-05', 1)]

In [46]:
# 4. Average Daily Transactions per company
# (i..e, On average, how much transaction each company does every day)
# during the week 42 of 2021

In [48]:
# 1. filter week 42
# 2. map to pairs (company, coeff_per_day)
# 3. count and divide by num_days_per_working_week <=> 5

In [248]:
#total_trans_num_week42 = df_trans.filter(f.col('Timestamp') == '2020-09-05')\
df_week42 = df_trans.filter(f.col('Timestamp') == '2020-09-05')
rdd_week42 = df_week42.rdd
total_trans_num_week42 = rdd_week42.map(lambda x: (x.CompanySymbol, 1))\
  .reduceByKey(lambda x, y: (x + y) / 5)

In [249]:
total_trans_num_week42.collect()

[('Hal', 0.4), ('Brown', 0.27999999999999997), ('John', 0.4)]

In [250]:
total_trans_num_week42.count()

3

In [119]:
# 5. Total Amount of Euro spent by each user

In [None]:
# 1. join two tables
# 2. calculate total sum with price and value
# 2. filter current prices
# 3. calculate sum for each user taking into account Action

In [274]:
rdd_prices = df_prices.rdd

In [280]:
transactions_with_prices_rdd = rdd_trans.map(lambda x: (x.CompanySymbol, x))\
    .join(rdd_prices.map(lambda x: (x.CompanySymbol, x)))

In [281]:
transactions_with_prices_rdd.first()

('Smith',
 (Row(TransactionID=0, Timestamp='1991-04-01', UserID='S0', CompanySymbol='Smith', Volume=20, Action='Buy'),
  Row(CompanySymbol='Smith', Timestamp='2022-02-01', ValuePerUnit_EURO=12.12)))

In [282]:
def createNewTable(x):
    print('x[0]:', x[0])
    print('----------')
    print('x[1]:', x[1])
    print('----------')
    print('x[1][0]: ', x[1][0])
    print('x[1][1]: ', x[1][1])
    return {"CompanySymbol":x[0], "UserID":x[1][0].UserID}

In [283]:
# PickUpErrorRDD.map(lambda row: (row.get("McID"),row.get("TimeStamp"))).reduceByKey(lambda valLeft,valRight: max(valLeft, valRight)).map(lambda x: {"lastTS":x[1],"McID":x[0]}).collect()
# ["TransactionID", "Timestamp","UserID","CompanySymbol","Volume","Action"]
# ["CompanySymbol", "Timestamp", "ValuePerUnit_EURO"]
transactions_with_prices_rdd = transactions_with_prices_rdd.map(lambda x: createNewTable(x))
#"ID":x[1][0].TransactionID,\
'''"Action":x[1][0].Action,\
"Volumes":x[1][0].Volume,\
"pricePerUnit":x[1][1].ValuePerUnit_EURO,\
"moneyValue":x[1][0].Volume * x[1][1].ValuePerUnit_EURO,\
"ts":x[1][0].Timestamp,\
"deltaTs":x[1][0].Timestamp - x[1][1].Timestamp})'''

'"Action":x[1][0].Action,"Volumes":x[1][0].Volume,"pricePerUnit":x[1][1].ValuePerUnit_EURO,"moneyValue":x[1][0].Volume * x[1][1].ValuePerUnit_EURO,"ts":x[1][0].Timestamp,"deltaTs":x[1][0].Timestamp - x[1][1].Timestamp})'

In [284]:
transactions_with_prices_rdd.collect()

x[0]: Smith
----------
x[1]: (Row(TransactionID=0, Timestamp='1991-04-01', UserID='S0', CompanySymbol='Smith', Volume=20, Action='Buy'), Row(CompanySymbol='Smith', Timestamp='2022-02-01', ValuePerUnit_EURO=12.12))
----------
x[1][0]:  Row(TransactionID=0, Timestamp='1991-04-01', UserID='S0', CompanySymbol='Smith', Volume=20, Action='Buy')
x[1][1]:  Row(CompanySymbol='Smith', Timestamp='2022-02-01', ValuePerUnit_EURO=12.12)
x[0]: Ford
----------
x[1]: (Row(TransactionID=1, Timestamp='2000-05-19', UserID='F1', CompanySymbol='Ford', Volume=100, Action='Sell'), Row(CompanySymbol='Ford', Timestamp='2022-02-02', ValuePerUnit_EURO=22.22))
----------
x[1][0]:  Row(TransactionID=1, Timestamp='2000-05-19', UserID='F1', CompanySymbol='Ford', Volume=100, Action='Sell')
x[1][1]:  Row(CompanySymbol='Ford', Timestamp='2022-02-02', ValuePerUnit_EURO=22.22)
x[0]: Williams
----------
x[1]: (Row(TransactionID=3, Timestamp='1978-09-05', UserID='W2', CompanySymbol='Williams', Volume=12, Action='Buy'), Row(

[{'CompanySymbol': 'Smith', 'UserID': 'S0'},
 {'CompanySymbol': 'Ford', 'UserID': 'F1'},
 {'CompanySymbol': 'Williams', 'UserID': 'W2'},
 {'CompanySymbol': 'Jones', 'UserID': 'J3'},
 {'CompanySymbol': 'Brown', 'UserID': 'B4'},
 {'CompanySymbol': 'Brown', 'UserID': 'B5'},
 {'CompanySymbol': 'Brown', 'UserID': 'B6'}]