In [1]:
import os
import json
import os.path
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir('..')
from dotenv import load_dotenv

load_dotenv(".env")  # take environment variables from .env.

from steam_trade_bot.containers import Container
from steam_trade_bot.settings import BotSettings
container = Container()
container.config.from_pydantic(BotSettings())
container.wire(modules=[__name__])

In [2]:
uow_ = container.repositories.unit_of_work

In [57]:
async with uow_() as uow:
    history = []
    for game in await uow.game.get_all():
        history.extend(await uow.sell_history.get_all(app_id=game.app_id, currency=1, count=500000))

In [58]:
len(history)

330277

In [70]:
import csv
import pandas as pd
from steam_trade_bot.domain.services.sell_history_analyzer import steam_date_str_to_datetime

# Write CSV file
with open("bigdata.csv", "wt", newline='', encoding="utf8") as fp:
    writer = csv.writer(fp, delimiter=",")
    writer.writerow(['app_id', 'market_hash_name', 'timestamp', 'price', 'amount'])  # write header
    for i, element in enumerate(history):
        j = json.loads(element.history)
        writer.writerows([(element.app_id, element.market_hash_name, steam_date_str_to_datetime(pair[0]), round(pair[1], 2), int(pair[2])) for pair in j])
        if i % 1000 == 0:
            print(f"Processed {i} items")

Processed 0 items
Processed 1000 items
Processed 2000 items
Processed 3000 items
Processed 4000 items
Processed 5000 items
Processed 6000 items
Processed 7000 items
Processed 8000 items
Processed 9000 items
Processed 10000 items
Processed 11000 items
Processed 12000 items
Processed 13000 items
Processed 14000 items
Processed 15000 items
Processed 16000 items
Processed 17000 items
Processed 18000 items
Processed 19000 items
Processed 20000 items
Processed 21000 items
Processed 22000 items
Processed 23000 items
Processed 24000 items
Processed 25000 items
Processed 26000 items
Processed 27000 items
Processed 28000 items
Processed 29000 items
Processed 30000 items
Processed 31000 items
Processed 32000 items
Processed 33000 items
Processed 34000 items
Processed 35000 items
Processed 36000 items
Processed 37000 items
Processed 38000 items
Processed 39000 items
Processed 40000 items
Processed 41000 items
Processed 42000 items
Processed 43000 items
Processed 44000 items
Processed 45000 items
P

In [4]:
df = pd.DataFrame([], columns=['app_id', 'market_hash_name', 'timestamp', 'price', 'amount'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   app_id            0 non-null      object
 1   market_hash_name  0 non-null      object
 2   timestamp         0 non-null      object
 3   price             0 non-null      object
 4   amount            0 non-null      object
dtypes: object(5)
memory usage: 0.0+ bytes


In [5]:
df = pd.read_csv("bigdata.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215911515 entries, 0 to 215911514
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   app_id            int64  
 1   market_hash_name  object 
 2   timestamp         object 
 3   price             float64
 4   amount            int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 8.0+ GB


In [8]:
df["app_id"].value_counts()

753        123424132
570         33247132
730         26171717
440         12111915
218620       4315816
             ...    
1818410            2
1706770            2
1680550            2
1412190            2
1187920            2
Name: app_id, Length: 248, dtype: int64

In [21]:
df["partition"] = df.apply(lambda x: x["timestamp"].rsplit('-', 1)[0], axis=1)

In [22]:
df.to_parquet(path='./parquet/', engine='fastparquet', compression='snappy', index=False, partition_cols=["partition"])

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215911515 entries, 0 to 215911514
Data columns (total 6 columns):
 #   Column            Dtype  
---  ------            -----  
 0   app_id            int64  
 1   market_hash_name  object 
 2   timestamp         object 
 3   price             float64
 4   amount            int64  
 5   partition         object 
dtypes: float64(1), int64(2), object(3)
memory usage: 9.7+ GB


In [24]:
del df


In [4]:
df = pd.read_parquet('./parquet/partition=2022-10/', engine='fastparquet')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11628019 entries, 0 to 11628018
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   app_id            int64  
 1   market_hash_name  object 
 2   timestamp         object 
 3   price             float64
 4   amount            int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 443.6+ MB


In [6]:
pd.options.plotting.backend = "plotly"

In [18]:
#new_df = df.apply()

fig = df[:1000][["price", "amount"]].plot.bar(x="price", y="amount")
fig.show()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark = SparkSession.builder \
    .master('local[6]') \
    .appName('myAppName') \
    .config("spark.driver.memory", "10g") \
    .config('spark.executor.memory', '10g') \
    .config("spark.cores.max", "6") \
    .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)



In [13]:
parquet_files = []
for root, dirs, files in os.walk('parquet/'):
    for file in files:
        if file.endswith(".parquet"):
            parquet_files.append(f"{root}/{file}")
#df = spark.read.parquet(*parquet_files)
df = sqlContext.read.parquet(*parquet_files)

In [14]:
rdd = df.rdd

In [10]:
rdd.count()

12725731

In [6]:
minirdd = rdd.top(5)
minirdd

[Row(index=379671, app_id=753, market_hash_name='307070-The Main Hero (Initial Look)', timestamp='2014-12-31 01:00:00', price=0.03, amount=11),
 Row(index=379670, app_id=753, market_hash_name='307070-The Main Hero (Initial Look)', timestamp='2014-12-29 01:00:00', price=0.03, amount=2),
 Row(index=379669, app_id=753, market_hash_name='307070-The Main Hero (Initial Look)', timestamp='2014-12-17 01:00:00', price=0.03, amount=3),
 Row(index=379668, app_id=753, market_hash_name='307070-The Main Hero (Initial Look)', timestamp='2014-12-16 01:00:00', price=0.03, amount=3),
 Row(index=379667, app_id=753, market_hash_name='307070-The Main Hero (Initial Look)', timestamp='2014-12-15 01:00:00', price=0.03, amount=7)]

In [19]:
mhn_price = rdd.map(lambda x: (x.market_hash_name, (x.price, x.amount)))
mhn_volume = mhn_price.mapValues(lambda x: x[0] * x[1])
mhn_volume = mhn_volume.reduceByKey(lambda x, y: round(x + y, 2))
mhn_sorted = mhn_volume.sortBy(lambda x: -x[1])
mhn_sorted_r = mhn_volume.sortBy(lambda x: x[1])
#mhn_amount_sorted = mhn_price.mapValues(lambda x: x[1]).reduceByKey(lambda x, y: x + y).sortBy(lambda x: -x[1])
# mhn_price_max_sorted = mhn_price.mapValues(lambda x: x[0]).reduceByKey(lambda x, y: max(x, y)).sortBy(lambda x: -x[1])

In [20]:
mhn_sorted_r.take(100)


[('Nerd Glasses (Medium Slate Blue)', 0.06),
 ('1120320-:Trowel:', 0.06),
 ('1907900-:halftimemonk:', 0.06),
 ('384100-:roboflak:', 0.06),
 ('PKM - Savanna', 0.06),
 ('M9 - Savanna', 0.06),
 ('569870-Elite', 0.06),
 ('464260-:xcgrenade:', 0.06),
 ('1051960-:fpbwiggle:', 0.06),
 ('Farthing | cpbiTa', 0.06),
 ('Florin - Two Shillings | cmVNEY', 0.06),
 ('1086850-:omgblowbird:', 0.06),
 ('645320-:hyke:', 0.06),
 ('431120-:peephero:', 0.06),
 ('Grano | cnTXCa', 0.06),
 ('New Penny | cjRGSg', 0.06),
 ('Scorpion - Autumn', 0.06),
 ('1-2 Mark | canELv', 0.06),
 ('5 Pfennig | cWQfWA', 0.06),
 ('Festivized Plaid Potshotter Mk.II Winger (Well-Worn)', 0.06),
 ('20 Centesimi | cfDqhd', 0.06),
 ('564100-:smartkid:', 0.06),
 ('982290-Tapestry', 0.06),
 ('Dime | cuWtbp', 0.06),
 ('1325900-Forktown', 0.06),
 ('1146170-:winkwinkkingsim:', 0.06),
 ('1465510-:d82duskdude:', 0.06),
 ('1-2 Penny | cDSThm', 0.06),
 ('Bu - Ichibu | cKlbGB', 0.06),
 ('10 Zlotych | c11173', 0.06),
 ("People's Car (Fast-Sale)",

In [4]:
df.createOrReplaceTempView("history")

In [None]:

parkSQL = spark.sql("select * from history where price >= 100 and amount > 10 order by timestamp asc")
parkSQL.show()

In [17]:
parkSQL = spark.sql("select sum(price * amount) from history")
parkSQL.show()

+---------------------+
|sum((price * amount))|
+---------------------+
|  6.078173603628149E9|
+---------------------+



In [18]:
6078173603 * 0.13

790162568.39

In [19]:
parkSQL = spark.sql("select sum(amount) from history")
parkSQL.show()

+-----------+
|sum(amount)|
+-----------+
| 8385157278|
+-----------+



In [20]:
parkSQL = spark.sql("select sum(price * amount), sum(amount) from history where app_id = 730")
parkSQL.show()

+---------------------+-----------+
|sum((price * amount))|sum(amount)|
+---------------------+-----------+
|  4.246101903730945E9| 4269578286|
+---------------------+-----------+



In [22]:
parkSQL = spark.sql("select sum(price * amount), sum(amount) from history where app_id = 570")
parkSQL.show()

+---------------------+-----------+
|sum((price * amount))|sum(amount)|
+---------------------+-----------+
|  8.883784525308354E8|  931933185|
+---------------------+-----------+



In [24]:
parkSQL = spark.sql("select sum(price * amount), sum(amount) from history where app_id = 440")
parkSQL.show()

+---------------------+-----------+
|sum((price * amount))|sum(amount)|
+---------------------+-----------+
| 2.4895009611998528E8|  222003330|
+---------------------+-----------+



In [25]:
248950096.12 * 0.13

32363512.4956

In [39]:
parkSQL = spark.sql("select market_hash_name, sum(amount) as total_amount from history where app_id = 730 group by market_hash_name order by total_amount desc limit 5")
parkSQL.show(20, False)

+------------------------------+------------+
|market_hash_name              |total_amount|
+------------------------------+------------+
|Clutch Case                   |134768975   |
|Gamma 2 Case                  |121917096   |
|Chroma 3 Case                 |101683679   |
|Danger Zone Case              |96496214    |
|Operation Breakout Weapon Case|90903986    |
+------------------------------+------------+



In [40]:
parkSQL = spark.sql("select market_hash_name, sum(amount) as total_amount from history where app_id = 570 group by market_hash_name order by total_amount desc limit 5")
parkSQL.show(20, False)

+------------------------------+------------+
|market_hash_name              |total_amount|
+------------------------------+------------+
|Hallowed Chest of the Diretide|11794539    |
|Siltbreaker Reward            |2224732     |
|Mantle of the Cinder Baron    |1282196     |
|Controlled Burn               |1275757     |
|Artificer's Hammer            |1272289     |
+------------------------------+------------+



In [41]:
parkSQL = spark.sql("select market_hash_name, sum(amount) as total_amount from history where app_id = 440 group by market_hash_name order by total_amount desc limit 5")
parkSQL.show(20, False)

+---------------------------------+------------+
|market_hash_name                 |total_amount|
+---------------------------------+------------+
|Mann Co. Supply Crate Key        |28881345    |
|Battle-Worn Robot Taunt Processor|4709385     |
|Battle-Worn Robot KB-808         |4635508     |
|Battle-Worn Robot Money Furnace  |4611231     |
|Tour of Duty Ticket              |4604275     |
+---------------------------------+------------+



In [44]:
parkSQL = spark.sql("select timestamp, sum(price * amount) as total_volume, sum(price * amount)*0.13 as fee from history where app_id = 730 group by timestamp order by total_volume desc limit 10")
parkSQL.show(20, False)

+-------------------+------------------+-----------------+
|timestamp          |total_volume      |fee              |
+-------------------+------------------+-----------------+
|2021-09-22 01:00:00|5595612.30000001  |727429.5990000013|
|2020-12-04 01:00:00|5025082.020000035 |653260.6626000046|
|2021-05-04 01:00:00|4930515.999999973 |640967.0799999965|
|2021-01-28 01:00:00|4364076.779999953 |567329.9813999939|
|2015-09-18 01:00:00|4246357.219999996 |552026.4385999995|
|2021-04-09 01:00:00|4126482.100000013 |536442.6730000017|
|2021-04-07 01:00:00|3804807.1200000015|494624.9256000002|
|2015-05-27 01:00:00|3747518.5400000145|487177.4102000019|
|2015-09-19 01:00:00|3728345.5200000084|484684.9176000011|
|2021-06-04 01:00:00|3705929.7800000147|481770.8714000019|
+-------------------+------------------+-----------------+



In [6]:
sc = spark.sparkContext

# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
parquet_files = []
for root, dirs, files in os.walk('parquet/'):
    for file in files:
        if file.endswith(".parquet"):
            parquet_files.append(f"{root}/{file}")
# read parquet file
df = sqlContext.read.parquet(parquet_files[0])



In [7]:
count_data=df.rdd.groupByKey()

In [8]:
count_data.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (DESKTOP-FT79Q2M executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:115)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:831)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:708)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:752)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:681)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:647)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:623)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:580)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:538)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
	... 19 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:78)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:567)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:831)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:115)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:708)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:752)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:681)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:647)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:623)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:580)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:538)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
	... 19 more


In [30]:
import os
os.environ.setdefault("PYSPARK_PYTHON", "python")

'python'