# Chess Data Preparation
 - We will only extract required information from games, reducing the size to a reasonable level.

In [1]:
#Notebook configuration: which monthly Lichess dump to process.
url = "https://database.lichess.org/standard/lichess_db_standard_rated_2023-10.pgn.zst" #set url here
lichessDataLength = 94922297 #how many games are in there (for progressbar)

#Last path component of the url, e.g. "lichess_db_standard_rated_2023-10.pgn.zst"
filename = url.rsplit("/", 1)[-1]
#Text after the last underscore of the base name (before the first dot), e.g. "2023-10"
dataMonth = filename.partition(".")[0].rsplit("_", 1)[-1]

## Setup Environment
 - Installing dependencies
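 - Downloading the dataset (not automated in the cells shown here: the `.pgn.zst` file named by `url` must already be present in the working directory before the processing cell is run)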

In [2]:
!pip install chess zstandard pymongo tqdm ipywidgets

Defaulting to user installation because normal site-packages is not writeable


## Importing stuff and defining functions

In [6]:
from zstandard import ZstdDecompressor
from tqdm.notebook import tqdm
#from time import perf_counter
from dotenv import load_dotenv
from io import TextIOWrapper
from time import sleep
import pandas as pd
import os, gc, json
load_dotenv()

True

In [4]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

## Initialize Database

In [9]:
#Drop an existing MongoDB client, if any, so that the connection cell can create a fresh one.
#Guarded so that the cell is a harmless no-op on a fresh kernel (Restart & Run All), where `db` does not exist yet.
if 'db' in globals():
  db.close()
  del db

In [10]:
#Connect to MongoDB once per kernel; re-running this cell while `db` exists only prints a notice.
if 'db' not in globals():
  #try to get explicitly defined connection string, otherwise fallback to localhost.
  uri = os.environ.get('mongoConnStr')
  if uri is None:
    uri = "mongodb://localhost:27017"
    print("Mongo Connection String is not set. Falling back to localhost!")

  # Create a new client and connect to the server
  print("Connecting to MongoDB...")
  db = MongoClient(uri, server_api=ServerApi('1'))

  # Send a ping to confirm a successful connection
  try:
    db.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
  except Exception as e:
    #The client is unusable: release it and unbind `db`, otherwise the guard above would
    #report an existing connection on the next attempt and later cells would run against a dead client.
    print(e)
    db.close()
    del db
    raise #stop "Run All" here instead of failing later in the data cells
else:
  print("Aren't you alreayd connected?")

Connecting to MongoDB...
Pinged your deployment. You successfully connected to MongoDB!


In [10]:
#One collection per monthly dump inside the 'chessData' database.
chessCollection = db.get_database('chessData').get_collection(dataMonth)
#Uncomment to wipe this month's collection before re-importing:
#chessCollection.drop()
#Displayed as the cell output: number of documents currently stored for this month.
chessCollection.count_documents({})

0

## Data Extraction

In [11]:
def getVal(pgnMetadataStr, isNumber=True):
  """Return the quoted value of a PGN tag line such as `[WhiteElo "1500"]`.

  Parameters:
    pgnMetadataStr: one tag line; the value is the text between the first pair of double quotes.
    isNumber: when True the value is converted to int, otherwise it is returned as text.

  Returns:
    int when isNumber is True: the numeric value, or 0 when the value holds no digits (e.g. "?").
    str otherwise: the value with every "?" replaced by "Unknown Opening".

  Raises:
    IndexError: the line contains no double quote.
    ValueError: isNumber is True and the value is not an integer after removing "?".
  """
  result = pgnMetadataStr.split('"')[1]
  if isNumber:
    #Remove "?" markers instead of substituting a digit for them: replacing "?" by "0"
    #would turn a value like "1500?" into 15000. A value without digits counts as unknown (0).
    digits = result.replace("?", "").strip()
    result = int(digits) if digits else 0
  else:
    result = result.replace("?", "Unknown Opening")
  return result

def getWinningSide(aLine):
  """Translate a PGN `[Result "..."]` tag line into "white", "black" or "draw".

  "1-0" maps to "white" and "0-1" to "black"; every other value is reported as "draw".
  """
  outcome = getVal(aLine, isNumber=False)
  winnerByResult = {"1-0": "white", "0-1": "black"}
  return winnerByResult.get(outcome, "draw")

def getOpening(aLine):
  """Split a PGN `[Opening "..."]` tag line into (opening, variation).

  The value is split at the first ':'; both parts are stripped of surrounding whitespace,
  so "Sicilian Defense: Najdorf Variation" yields ("Sicilian Defense", "Najdorf Variation")
  rather than a variation with a leading space. When there is no ':' or nothing follows it,
  the variation is the placeholder '~'.

  Returns:
    A 2-tuple of str in every case, so callers can always unpack it.
  """
  opening, _, variation = getVal(aLine, isNumber=False).partition(':')
  return opening.strip(), (variation.strip() or '~')

def ensureValidElo(whiteElo, blackElo):
  """Return (whiteElo, blackElo) with a missing rating filled in from the other side.

  A falsy rating (0 or None) counts as missing. Raises AssertionError when both are missing.
  """
  assert whiteElo or blackElo, "Both ELOs are zero!"

  #At least one side is truthy here, so `or` picks the opponent's rating exactly when a side is missing.
  return (whiteElo or blackElo), (blackElo or whiteElo)


def processGame(readableStuff):
  """Read lines from a PGN text stream until one game is complete and return its summary.

  Tag lines ([WhiteElo], [BlackElo], [Opening], [Result]) are collected until the movetext
  line (the line starting with "1.") is reached; all other lines are ignored.

  Parameters:
    readableStuff: text stream providing `readline()`, positioned anywhere between games.

  Returns:
    dict with the keys "opening", "variation", "whiteElo", "blackElo" and "winner",
    or None when the end of the stream is reached before another movetext line.

  Raises:
    AssertionError: both ratings are missing, or a required tag was not found for the game.
  """
  #Start from a clean slate: without this a game lacking one of the tags would hit
  #an UnboundLocalError at the movetext line instead of the descriptive assertion below.
  opening = variation = whiteElo = blackElo = winner = None

  while True:

    currentLine = readableStuff.readline().lstrip("\ufeff") #drop a byte order mark, if present
    if currentLine == "":
      return None #EOF
    currentLine = currentLine.strip() #Still data here, proceed with processing.

    #Extract data
    if currentLine.startswith("1."):

      whiteElo, blackElo = ensureValidElo(whiteElo, blackElo)
      result = {
              "opening": opening, "variation": variation,
              "whiteElo": whiteElo, "blackElo": blackElo,
              "winner": winner
              }

      gameValid = all((opening, variation, whiteElo, blackElo, winner))
      assert gameValid, f"Game seems to be incomplete!, Unable to extract ELO Properly!\n{result}"

      return result

    elif currentLine.startswith("[Event "):
      #A new game begins (Event is the first tag of a PGN game): forget whatever was collected
      #from a previous game that had no movetext, so its tags cannot leak into this one.
      opening = variation = whiteElo = blackElo = winner = None
    elif currentLine.startswith("[WhiteElo"):
      whiteElo = getVal(currentLine)
    elif currentLine.startswith("[BlackElo"):
      blackElo = getVal(currentLine)
    elif currentLine.startswith("[Opening"):
      opening, variation = getOpening(currentLine)
    elif currentLine.startswith("[Result"):
      winner = getWinningSide(currentLine)

## Actual Processing

Decompression: https://python-zstandard.readthedocs.io/en/latest/decompressor.html#zstddecompressionreader

Notes: 
 - Creating a DataFrame up front and appending to it row by row is extremely slow (~500-600 it/s, compared to 20-30k it/s when collecting into lists/dicts).
 - https://stackoverflow.com/a/57940705 - suggests using a native python type then converting later.

In [14]:
#Stream the compressed PGN dump, parse it game by game and bulk-insert the summaries into MongoDB.
#NOTE(review): re-running this cell appends to `chessCollection` again; drop the collection first to avoid duplicates.
updateEvery = 100000 #games per insert_many batch (and per progress bar refresh)
globalCounter = 0 #total number of games parsed

print("Initializing file...")
#Both context managers are released even when parsing or an insert fails half-way.
with tqdm(total=lichessDataLength) as progressBar, open(filename, "rb") as zstfile:
  decompressor = ZstdDecompressor()

  reader = decompressor.stream_reader(zstfile)
  textStream = TextIOWrapper(reader, encoding='utf-8')

  print("Starting data processing...")

  currentBuffer = []
  while True:
    result = processGame(textStream)
    if result is None:
      break #end of the dump
    currentBuffer.append(result)
    globalCounter += 1

    #Note: most of the time is spent in the zstandard library, which is quite slow when compared to the CLI version,
    #so don't bother optimizing the rest.

    if len(currentBuffer) == updateEvery:
      chessCollection.insert_many(currentBuffer)
      progressBar.update(len(currentBuffer))
      currentBuffer = []
      gc.collect()

  #Flush the last, partial batch. insert_many rejects an empty list, which happens when the
  #number of games is an exact multiple of updateEvery or the file contains no games at all.
  if currentBuffer:
    chessCollection.insert_many(currentBuffer)
    progressBar.update(len(currentBuffer))
  del currentBuffer
  gc.collect()
  sleep(1) #kept from the original; presumably lets the progress widget render its final state - TODO confirm

print("Finally Done!")
print(f"Processed {globalCounter} items.")

  0%|          | 0/94922297 [00:00<?, ?it/s]

Initializing file...
Starting data processing...
Finally Done!
Processed 94724586 items.


In [None]:
#Build a DataFrame from `gamesList` and write it as a gzip-compressed CSV named after the data month.
#NOTE(review): `gamesList` is not assigned anywhere in the visible notebook (the processing cell only has a
#commented-out `gameList`), so this cell raises NameError on a fresh run.
#NOTE(review): processGame produces lower-case keys ("opening", "variation", ...), while the columns requested
#here are capitalised - confirm the intended column names before reviving this cell.
chessFrame = pd.DataFrame(gamesList, columns=["Opening", "Variation", "WhiteElo", "BlackElo", "Winner"])
chessFrame.to_csv(f"chessData_{dataMonth}.csv", index=False, compression='gzip')

### Sorting and Saving

In [None]:
#Write `gamesDict` to "openings_<dataMonth>.json" through `writeJson`.
#NOTE(review): neither `writeJson` nor `gamesDict` is defined in the visible notebook (`gamesDict` only appears
#in a commented-out line of the processing cell), so this cell raises NameError unless they come from elsewhere.
resultFilename = f"openings_{dataMonth}.json"
writeJson(resultFilename, gamesDict)
print("Successfully dumped processed data.")


In [None]:
#Shell out to `zip` with the -9 flag, archiving the file named by `resultFilename` as "<resultFilename>.zip".
#Requires `resultFilename` to have been set by the JSON dump cell in this kernel session.
!zip "$resultFilename".zip "$resultFilename" -9