In [1]:
import io
import zstandard as zstd
from pathlib import Path
import json
import sys
from tqdm.auto import tqdm

In [2]:
# Compressed PGN dump that the parsing cell opens.
fileName = 'lichess_db_standard_rated_2025-07.pgn.zst'

# Count of decompressed chunks handled by the parsing cell.
chunkCounter = 0

# Header values most recently seen while scanning: game URL, both players'
# ratings, and the time control. All are kept as the raw strings from the file.
lastURL, lastWhiteRating, lastBlackRating, lastSpeed = '', '', '', ''

# Matches found so far; each entry is [url, white rating, black rating, time control].
viennaURLs = list()

We also use `tqdm` to display a progress bar. The uncompressed data amounts to roughly 14,000 chunks of 16 MiB each, which is why the bar's total is set to 14,000.

In [3]:
%%time

# Stream-decompress the PGN dump and collect every game whose movetext opens
# 1. e4 e5 2. Nc3 Nc6 3. Bc4 and contains Bxf7+, played at a 300- or
# 600-second base time control with White rated above MIN_WHITE_RATING.

CHUNK_SIZE = 2**24                      # bytes of decompressed data per read (16 MiB)
EXPECTED_CHUNKS = 14000                 # rough chunk count; only scales the progress bar
MIN_WHITE_RATING = 1600                 # White must be rated strictly above this
BASE_TIME_PREFIXES = ('300+', '600+')   # accepted "<base seconds>+" prefixes of TimeControl

# Start from a clean slate so that re-running this cell does not append
# duplicates to results left over from a previous run.
chunkCounter = 0
viennaURLs = []
lastURL = lastWhiteRating = lastBlackRating = lastSpeed = ''

# The tqdm context manager closes the bar even if parsing raises.
with open(fileName, 'rb') as fh, tqdm(total=EXPECTED_CHUNKS) as pbar:
    dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
    with dctx.stream_reader(fh) as reader:
        # Bytes after the last newline of the previous chunk. They are the
        # start of a line that continues in the next chunk, so they must be
        # carried over instead of being dropped.
        leftover = b""
        while True:
            chunk = reader.read(CHUNK_SIZE)
            if chunk:
                # Cut at the last newline: `complete` holds whole lines only.
                # Cutting on the newline byte also guarantees that no
                # multi-byte UTF-8 character is split before decoding.
                complete, _, leftover = (leftover + chunk).rpartition(b"\n")
            else:
                # End of stream: flush a final line that has no trailing newline.
                complete, leftover = leftover, b""

            for line in complete.decode('utf-8').split("\n"):
                if line.startswith('[Site'):
                    lastURL = line.split('"')[1]
                elif line.startswith('[WhiteElo'):
                    lastWhiteRating = line.split('"')[1]
                elif line.startswith('[BlackElo'):
                    lastBlackRating = line.split('"')[1]
                elif line.startswith('[TimeControl'):
                    lastSpeed = line.split('"')[1]
                elif ('Bxf7+' in line) and ('1. e4' in line) and (' 1... e5' in line) and (' 2. Nc3' in line) and (' 2... Nc6' in line) and (' 3. Bc4' in line):
                    # startswith (not substring) so that e.g. '3600+0' is not
                    # mistaken for a 600-second game; isdigit() skips games
                    # whose rating is not a plain number instead of crashing.
                    if (lastSpeed.startswith(BASE_TIME_PREFIXES)
                            and lastWhiteRating.isdigit()
                            and int(lastWhiteRating) > MIN_WHITE_RATING):
                        viennaURLs.append( [lastURL,lastWhiteRating,lastBlackRating,lastSpeed] )
                        # Clear all per-game header state so nothing leaks
                        # into the next game.
                        lastURL = ''
                        lastWhiteRating = ''
                        lastBlackRating = ''
                        lastSpeed = ''

            if not chunk:
                break
            chunkCounter += 1
            pbar.update(1)

  0%|          | 0/14000 [00:00<?, ?it/s]

CPU times: user 10min 33s, sys: 35.7 s, total: 11min 9s
Wall time: 11min 10s


In [5]:
# Save the collected games to a CSV file, one row per game, without the
# DataFrame's row index.
import pandas as pd

columns = ["url", "white", "black", "time"]
pd.DataFrame(
    data=viennaURLs,
    columns=columns,
).to_csv(
    "vienna_extracted_blitz_rapid_2025-07.csv",
    index=False,
)

In [6]:
# Show the collected games as a table (last expression, so the notebook renders it).
pd.DataFrame(data=viennaURLs, columns=columns)

Unnamed: 0,url,white,black,time
0,https://lichess.org/izAhECFS,1954,1869,600+0
1,https://lichess.org/UN5s3xXI,1931,1904,600+0
2,https://lichess.org/WYh1IOD4,1646,1671,300+0
3,https://lichess.org/pRbVapao,1694,1679,300+3
4,https://lichess.org/rZZAdqu5,1998,1987,600+0
...,...,...,...,...
1575,https://lichess.org/W3BLVfNM,1637,1761,300+0
1576,https://lichess.org/pTkeuC0h,1847,1844,300+0
1577,https://lichess.org/34viWzIM,1699,1681,600+0
1578,https://lichess.org/2VSBdXjZ,2077,1548,600+0
