In [1]:
import os
import json
import os.path
# When the kernel was started inside the "notebooks" directory, move one level
# up so that relative paths used by this notebook (".env", "vendors/...")
# resolve against the parent directory.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir('..')
from dotenv import load_dotenv

load_dotenv(".env")  # load environment variables from the .env file in the current working directory

# NOTE(review): this import is placed after load_dotenv(); presumably the
# settings module relies on those environment variables — confirm before
# moving it up with the other imports.
from steam_trade_bot.etl.settings import get_jdbc_creds

Spark dashboard: http://localhost:4040


In [2]:
%%time
from pyspark.sql import SparkSession

# Value for spark.sql.files.maxPartitionBytes: 20 MiB, passed as a string.
max_partition_bytes = 20 * 1024 * 1024

# Build (or reuse) a local Spark session that uses all available cores and has
# the PostgreSQL JDBC driver jar registered via spark.jars.
# NOTE(review): with a local[*] master the executors presumably run inside the
# driver JVM, so spark.executor.memory may have no practical effect and
# spark.driver.memory would be the setting that matters — confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Steam Trade Bot ETL")
    .config("spark.jars", "vendors/postgresql-42.6.0.jar")
    .config("spark.executor.memory", "10g")
    .config("spark.driver.memory", "5g")
    .config("spark.sql.shuffle.partitions", "30")
    .config("spark.sql.files.maxPartitionBytes", str(max_partition_bytes))
    .getOrCreate()
)

CPU times: total: 234 ms
Wall time: 14.1 s


In [3]:
%%time
# Fetch the JDBC connection settings and define a DataFrame over the
# raw.market_item_sell_history table, read through the PostgreSQL driver.
jdbc_url, username, password = get_jdbc_creds()
df = (
    spark.read
    .format("jdbc")
    .options(
        url=jdbc_url,
        dbtable="raw.market_item_sell_history",
        user=username,
        password=password,
        driver="org.postgresql.Driver",
    )
    .load()
)

CPU times: total: 0 ns
Wall time: 2.7 s


In [4]:
# Target about 1000 source rows per partition, with at least one partition.
# round() is Python's built-in, so exact .5 ratios round to the nearest even
# integer.
rows_per_partition = 1000
source_row_count = df.count()
app_id_market_name_df_partitions = max(1, round(source_row_count / rows_per_partition))

# (app_id, market_hash_name) projection, repartitioned and marked for caching
# because several later cells iterate over it.
app_id_market_name_df = (
    df.select("app_id", "market_hash_name")
    .repartition(app_id_market_name_df_partitions)
    .cache()
)

# Distinct app ids, collapsed into a single cached partition.
app_id_df = (
    df.select("app_id")
    .distinct()
    .repartition(1)
    .cache()
)

In [5]:
# Print the column names and types of the (app_id, market_hash_name) projection.
app_id_market_name_df.printSchema()

root
 |-- app_id: integer (nullable = true)
 |-- market_hash_name: string (nullable = true)



In [6]:
# Print the column names and types of the distinct app_id frame.
app_id_df.printSchema()

root
 |-- app_id: integer (nullable = true)



In [7]:
# Show (row count, number of partitions) for the (app_id, market_hash_name) frame.
(
    app_id_market_name_df.count(),
    app_id_market_name_df.rdd.getNumPartitions(),
)

(113, 1)

In [8]:
# Show (row count, number of partitions) for the distinct app_id frame.
(
    app_id_df.count(),
    app_id_df.rdd.getNumPartitions(),
)

(1, 1)

In [9]:
%%time
from steam_trade_bot.etl.async_run import surround_async
from steam_trade_bot.etl.processors.game import process_game_batch

# Run the game processor once per partition of app_id_df.
# NOTE(review): surround_async presumably adapts the async processor into a
# plain callable that Spark can invoke — confirm in steam_trade_bot.etl.async_run.
game_partition_handler = surround_async(process_game_batch)
app_id_df.foreachPartition(game_partition_handler)

CPU times: total: 31.2 ms
Wall time: 1.64 s


In [10]:
%%time
from steam_trade_bot.etl.processors.market_item import process_market_item_batch

# Run the market item processor once per partition of app_id_market_name_df.
# NOTE(review): surround_async presumably adapts the async processor into a
# plain callable that Spark can invoke — confirm in steam_trade_bot.etl.async_run.
market_item_partition_handler = surround_async(process_market_item_batch)
app_id_market_name_df.foreachPartition(market_item_partition_handler)

CPU times: total: 15.6 ms
Wall time: 1.56 s


In [11]:
%%time
from steam_trade_bot.etl.processors.market_item import process_market_item_sell_history_batch

# Run the sell-history processor once per partition of app_id_market_name_df.
# NOTE(review): surround_async presumably adapts the async processor into a
# plain callable that Spark can invoke — confirm in steam_trade_bot.etl.async_run.
sell_history_partition_handler = surround_async(process_market_item_sell_history_batch)
app_id_market_name_df.foreachPartition(sell_history_partition_handler)


CPU times: total: 0 ns
Wall time: 11 s


In [12]:
%%time
from steam_trade_bot.etl.processors.market_item import process_market_item_orders_batch

# Run the orders processor once per partition of app_id_market_name_df.
# NOTE(review): surround_async presumably adapts the async processor into a
# plain callable that Spark can invoke — confirm in steam_trade_bot.etl.async_run.
orders_partition_handler = surround_async(process_market_item_orders_batch)
app_id_market_name_df.foreachPartition(orders_partition_handler)


CPU times: total: 0 ns
Wall time: 1.62 s
