# Chess Data Preparation
 - We extract only the fields we need from each game (opening, variation, both Elo ratings and the result), which shrinks the dataset to a manageable size.

In [30]:
# Notebook configuration: which monthly Lichess dump to process.
url = "https://database.lichess.org/standard/lichess_db_standard_rated_2023-10.pgn.zst"  # set url here
lichessDataLength = 94922297  # number of games in that dump (only used as the progress bar total)

# Last path component of the URL, e.g. "lichess_db_standard_rated_2023-10.pgn.zst"
filename = url.rsplit("/", 1)[-1]
# Text after the last "_" of the part before the first ".", e.g. "2023-10"
dataMonth = filename.split(".", 1)[0].rsplit("_", 1)[-1]

## Setup Environment
 - Installing dependencies
 - Downloading dataset

In [None]:
!pip install chess zstandard

In [None]:
print(f"Downloading of Month: {dataMonth}...")
!wget $url
print("Done!")

## Data Extraction

### Importing stuff and defining functions

In [3]:
import os, json
#from time import perf_counter
from zstandard import ZstdDecompressor
from io import TextIOWrapper
from tqdm.notebook import tqdm
from time import sleep
import pandas as pd

In [31]:
def getVal(pgnMetadataStr, isNumber=True):
  """Extract the quoted value from a single PGN tag line.

  Args:
    pgnMetadataStr: A tag line such as '[WhiteElo "1500"]'. The value is the
      text between the first pair of double quotes.
    isNumber: When True the value is returned as an int, otherwise as a str.

  Returns:
    For isNumber=True: the value as an int, with 0 standing in for an unknown
    ("?") or empty value. For isNumber=False: the value itself, with
    "Unknown Opening" standing in for a value that is exactly "?".

  Raises:
    IndexError: If the line contains no double-quoted value.
    ValueError: If isNumber is True and the value is not numeric.
  """
  value = pgnMetadataStr.split('"')[1]

  if isNumber:
    # Only trailing "?" characters are dropped. Replacing every "?" with "0"
    # would turn a value such as "1500?" into 15000.
    digits = value.strip().rstrip("?")
    return int(digits) if digits else 0

  # Only the bare "?" placeholder is substituted; a "?" inside a longer text
  # is left untouched instead of having "Unknown Opening" spliced into it.
  return "Unknown Opening" if value == "?" else value

def getWinningSide(aLine):
  """Translate a PGN result tag line into the name of the winning side.

  Args:
    aLine: A tag line whose quoted value is a game result, e.g. '[Result "1-0"]'.

  Returns:
    "white" for "1-0", "black" for "0-1", and "draw" for every other value
    (no distinction is made between an actual draw and any other result text).
  """
  sideByResult = {"1-0": "white", "0-1": "black"}
  return sideByResult.get(getVal(aLine, isNumber=False), "draw")

def getOpening(aLine):
  """Split a PGN opening tag line into its opening name and variation.

  The quoted value is split at the first ':' only, so any further colons stay
  part of the variation.

  Args:
    aLine: A tag line such as '[Opening "French Defense: Normal Variation"]'.

  Returns:
    A (opening, variation) tuple with surrounding whitespace removed from both
    parts. The variation is '~' when the value has no ':' or nothing follows it.
  """
  name, _, variation = getVal(aLine, isNumber=False).partition(':')
  # Strip the space that follows the ':' separator, and fall back to the '~'
  # placeholder so an empty variation is never handed to the caller.
  return name.strip(), variation.strip() or '~'

def ensureValidElo(whiteElo, blackElo):
  """Fill in a missing rating with the rating of the opponent.

  Args:
    whiteElo: Rating of the white player; a falsy value (e.g. 0) means missing.
    blackElo: Rating of the black player; a falsy value (e.g. 0) means missing.

  Returns:
    A (whiteElo, blackElo) tuple. If exactly one rating is missing, the other
    one is used for both sides; otherwise the inputs are returned unchanged.

  Raises:
    AssertionError: If both ratings are missing.
  """
  assert whiteElo or blackElo, "Both ELOs are zero!"

  if whiteElo and blackElo:
    return whiteElo, blackElo

  # Exactly one side is known at this point; use it for both players.
  knownElo = whiteElo or blackElo
  return knownElo, knownElo


def processGame(readableStuff):
  """Read the next game from a PGN text stream and return its summary record.

  Lines are consumed until a movetext line (one starting with "1.") is found.
  The tag lines seen before it supply the fields of the record; all other
  lines are ignored.

  Args:
    readableStuff: A text stream with a readline() method that returns "" at
      end of input (e.g. a TextIOWrapper).

  Returns:
    [opening, variation, whiteElo, blackElo, winner] for the next game, or
    None when the stream ends before another movetext line is found.

  Raises:
    AssertionError: If the game has no usable rating for either side, or if
      any field of the record is missing or empty.
  """
  # Start every call with empty state, so a game that lacks a tag fails the
  # completeness checks below instead of raising UnboundLocalError.
  opening = variation = whiteElo = blackElo = winner = None

  while True:
    rawLine = readableStuff.readline()
    if rawLine == "":
      return None  # EOF

    # Drop a byte-order mark (if any) and the surrounding whitespace/newline.
    currentLine = rawLine.lstrip("\ufeff").strip()

    if currentLine.startswith("[Event "):
      # A new tag section begins. Forget tags collected from a preceding game
      # whose movetext did not start with "1." (and therefore never produced a
      # record), so they cannot leak into this game. The trailing space keeps
      # tags such as "[EventDate" from triggering the reset.
      opening = variation = whiteElo = blackElo = winner = None
    elif currentLine.startswith("[WhiteElo"):
      whiteElo = getVal(currentLine)
    elif currentLine.startswith("[BlackElo"):
      blackElo = getVal(currentLine)
    elif currentLine.startswith("[Opening"):
      opening, variation = getOpening(currentLine)
    elif currentLine.startswith("[Result"):
      winner = getWinningSide(currentLine)
    elif currentLine.startswith("1."):
      # Movetext reached: the tag section of this game is complete.
      whiteElo, blackElo = ensureValidElo(whiteElo, blackElo)
      result = [opening, variation, whiteElo, blackElo, winner]

      assert all(result), f"Game seems to be incomplete!, Unable to extract ELO Properly!\n{result}"
      return result

## Actual Processing

Decompression: https://python-zstandard.readthedocs.io/en/latest/decompressor.html#zstddecompressionreader

Notes: 
 - Creating a DataFrame up front and appending to it row by row is extremely slow (~500-600 it/s, compared with 20-30k it/s when appending to plain lists/dicts).
 - https://stackoverflow.com/a/57940705 suggests collecting the data in a native Python type first and converting it to a DataFrame afterwards.

In [33]:
# Stream-decompress the monthly dump and collect one record per game.
# Records are accumulated in a plain list and turned into a DataFrame in a
# later cell.
# Note from the original author's timing: most of the run time is spent in the
# zstandard decompression rather than in the parsing itself.
gamesDict = {}  # not filled by this cell; only referenced by the JSON dump cell
gamesList = []  # one [opening, variation, whiteElo, blackElo, winner] per game

updateEvery = 10000  # advance the progress bar in batches of this many games
counter = 0          # games parsed since the progress bar was last advanced

progressBar = tqdm(total=lichessDataLength)

print("Initializing file...")
try:
  with open(filename, "rb") as zstfile:
    decompressor = ZstdDecompressor()

    reader = decompressor.stream_reader(zstfile)
    textStream = TextIOWrapper(reader, encoding='utf-8')

    print("Starting data processing...")

    while True:
      result = processGame(textStream)
      if result is None:
        break  # end of the dump
      gamesList.append(result)
      counter += 1

      if counter == updateEvery:
        progressBar.update(updateEvery)
        counter = 0
finally:
  # Runs on success, on errors and on a manual interrupt alike: account for
  # the last partial batch and release the progress bar.
  progressBar.update(counter)
  progressBar.close()

print("Finally Done!")

  0%|          | 0/94922297 [00:00<?, ?it/s]

Initializing file...
Starting data processing...


KeyboardInterrupt: 

In [27]:
# Convert the accumulated per-game records into a DataFrame in one step.
chessFrame = pd.DataFrame(
  data=gamesList,
  columns=["Opening", "Variation", "WhiteElo", "BlackElo", "Winner"],
)

# NOTE: the output is gzip-compressed even though its name ends in ".csv",
# so compression='gzip' has to be passed explicitly when reading it back.
chessFrame.to_csv(
  path_or_buf=f"chessData_{dataMonth}.csv",
  index=False,
  compression='gzip',
)

Unnamed: 0,opening,variation,whiteElo,blackElo,winner
0,French Defense,Normal Variation,1639,1403,white
0,Queen's Pawn Game,"Colle System, Anti-Colle",1654,1919,white
0,Four Knights Game,Italian Variation,1643,1747,white
0,Caro-Kann Defense,Goldman Variation,1824,1973,black
0,French Defense,La Bourdonnais Variation,1765,1815,black
...,...,...,...,...,...
0,Sicilian Defense,~,1305,1399,black
0,Van't Kruijs Opening,~,1348,1416,white
0,Caro-Kann Defense #2,~,1611,1425,white
0,Center Game,Normal Variation,1238,1380,black


### Sorting and Saving

In [7]:
# Write gamesDict to a JSON file named after the processed month.
# NOTE(review): writeJson is not defined in any cell visible here, and the
# processing cell only initialises gamesDict to an empty dict without filling
# it -- confirm this cell still belongs to the current pipeline.
resultFilename = f"openings_{dataMonth}.json"
writeJson(resultFilename, gamesDict)
print("Successfully dumped processed data.")


Successfully dumped processed data.


In [8]:
# Pack the JSON dump into "<resultFilename>.zip" using zip's -9 (best compression) level.
!zip "$resultFilename".zip "$resultFilename" -9

updating: openings_2023-10.json (deflated 99%)
