In [1]:
import os
import os.path
from pathlib import Path
# When the kernel was started from inside the "notebooks" folder, step up one
# level so the relative paths used further down resolve against its parent.
if Path.cwd().name == "notebooks":
    os.chdir(os.pardir)

In [2]:
from datetime import datetime, date
# Build one (remote URL, local path) pair per monthly partition, covering
# `start` through `end` inclusive (both given as "YYYY-MM").
parquet_files = []
start = "2012-12"
end = "2022-11"
fmt = "http://steam-bot.s3.amazonaws.com/public/market-archive/partition={partition}/part.0.parquet"
fmt_local = "./parquet/partition={partition}/part.0.parquet"
current_date = datetime.strptime(start, "%Y-%m").date()
end_date = datetime.strptime(end, "%Y-%m").date()
while current_date <= end_date:
    partition = current_date.strftime("%Y-%m")
    parquet_files.append(
        (fmt.format(partition=partition), fmt_local.format(partition=partition))
    )
    # divmod turns month 12 into (carry=1, index=0), i.e. January of the next
    # year; every other month just advances by one with no carry.
    year_carry, month_index = divmod(current_date.month, 12)
    current_date = date(
        year=current_date.year + year_carry, month=month_index + 1, day=1
    )


In [3]:
import aiohttp        
import aiofiles

async def _download_file(url, path: Path):
    if path.exists():
        return
    print(f"Downloading to '{path}' ...")
    async with aiohttp.ClientSession() as session:
        path.parent.mkdir(parents=True, exist_ok=True)
        async with session.get(url) as resp:
            if resp.status == 200:
                f = await aiofiles.open(path, mode='wb')
                await f.write(await resp.read())
                await f.close()

for url, path in parquet_files:
    await _download_file(url, Path(path))

Downloading to './parquet/partition=2012-12/part.0.parquet'...
Downloading to './parquet/partition=2013-01/part.0.parquet'...
Downloading to './parquet/partition=2013-02/part.0.parquet'...
Downloading to './parquet/partition=2013-03/part.0.parquet'...
Downloading to './parquet/partition=2013-04/part.0.parquet'...
Downloading to './parquet/partition=2013-05/part.0.parquet'...
Downloading to './parquet/partition=2013-06/part.0.parquet'...
Downloading to './parquet/partition=2013-07/part.0.parquet'...
Downloading to './parquet/partition=2013-08/part.0.parquet'...
Downloading to './parquet/partition=2013-09/part.0.parquet'...
Downloading to './parquet/partition=2013-10/part.0.parquet'...
Downloading to './parquet/partition=2013-11/part.0.parquet'...
Downloading to './parquet/partition=2013-12/part.0.parquet'...
Downloading to './parquet/partition=2014-01/part.0.parquet'...
Downloading to './parquet/partition=2014-02/part.0.parquet'...
Downloading to './parquet/partition=2014-03/part.0.parq

In [4]:
from pyspark.sql import SparkSession
# Local Spark session used for all queries below.
#
# - 'local[6]' runs six worker threads. Plain 'local' uses a single thread,
#   and 'spark.cores.max' only applies to standalone/Mesos clusters, so the
#   previous settings did not give any parallelism.
# - In local mode the work happens inside the driver JVM, so the heap is
#   sized with 'spark.driver.memory' rather than 'spark.executor.memory'.
# - NOTE(review): getOrCreate() returns an already-running session if one
#   exists; restart the kernel for changed settings to take effect.
spark = (
    SparkSession.builder
    .master('local[6]')
    .appName('myAppName')
    .config('spark.driver.memory', '5g')
    .getOrCreate()
)

In [5]:
# Collect every *.parquet file under ./parquet. The list is sorted because
# os.walk yields entries in arbitrary order, so the input list would otherwise
# differ between runs.
parquet_files = sorted(
    f"{root}/{file}"
    for root, _dirs, files in os.walk('parquet/')
    for file in files
    if file.endswith(".parquet")
)
# Fail with a clear message instead of letting spark.read.parquet() run with
# no paths, which surfaces as an unrelated schema-inference error.
if not parquet_files:
    raise FileNotFoundError(
        "No .parquet files found under 'parquet/'; run the download cell first."
    )
df = spark.read.parquet(*parquet_files)

In [6]:
# Register the DataFrame under the name "history" so that spark.sql() queries
# can refer to it in their FROM clause.
df.createOrReplaceTempView("history")

In [12]:
# Totals over all rows with app_id = 730: sum of price * amount ("Volume")
# and sum of amount ("Total_amount").
totals_query = "select sum(price * amount) AS Volume, sum(amount) as Total_amount from history where app_id = 730"
parkSQL = spark.sql(totals_query)
parkSQL.show()

+-------------------+------------+
|             Volume|Total_amount|
+-------------------+------------+
|4.246101903730419E9|  4269578286|
+-------------------+------------+



In [22]:
# Per-item totals for app_id = 730, keeping the 100 items with the largest
# summed amount; show() prints all 100 rows without truncating long names.
top_items_query = "select market_hash_name, sum(amount) as total_amount, sum(price*amount) as total_volume from history where app_id = 730 group by market_hash_name order by total_amount desc limit 100"
parkSQL = spark.sql(top_items_query)
parkSQL.show(100, False)

+-----------------------------------------+------------+--------------------+
|market_hash_name                         |total_amount|total_volume        |
+-----------------------------------------+------------+--------------------+
|Clutch Case                              |134768975   |2.5192158630000003E7|
|Gamma 2 Case                             |121917096   |1.4269426469999999E7|
|Chroma 3 Case                            |101683679   |1.0517452520000003E7|
|Danger Zone Case                         |96496214    |1.1240458589999998E7|
|Operation Breakout Weapon Case           |90903986    |2.901172790000001E7 |
|Chroma 2 Case                            |88739813    |1.8816198560000002E7|
|Spectrum 2 Case                          |85373019    |1.3072606000000002E7|
|Operation Phoenix Weapon Case            |75896023    |1.6026849649999997E7|
|Glove Case                               |74989313    |2.736105827E7       |
|Prisma 2 Case                            |73218680    |1.089608