# Chess Data Preparation
 - We extract only the information needed from each game (the players' average rating and the opening name), which shrinks the dataset to a manageable size.

In [4]:
# Source archive to process; change this to work on a different month.
url = "https://database.lichess.org/standard/lichess_db_standard_rated_2023-10.pgn.zst"
# How many games the archive contains (used as the progress bar total).
lichessDataLength = 94922297

# The local file name is the last path segment of the URL.
filename = url.rsplit("/", 1)[-1]
# The month tag is whatever follows the final "_" in the part of the name before the first ".".
dataMonth = filename.partition(".")[0].rpartition("_")[2]

## Setup Environment
 - Installing dependencies
 - Downloading dataset

In [None]:
!pip install chess zstandard

In [None]:
print(f"Downloading of Month: {dataMonth}...")
!wget $url
print("Done!")

## Data Extraction

### Imports and helper functions

In [2]:
import os, json
#from time import perf_counter
from zstandard import ZstdDecompressor
from io import TextIOWrapper
from tqdm.notebook import tqdm
from time import sleep

In [3]:
def getAverageRating(whiteElo, blackElo):
  """Return one rating that represents both players of a game.

  A rating of 0 is treated as "unknown". When both ratings are non-zero the
  result is their mean, truncated to an int. Otherwise the non-zero rating is
  returned, or 0 when both are 0.
  """
  bothKnown = (whiteElo != 0) and (blackElo != 0)
  if bothKnown:
    return int((whiteElo + blackElo) / 2)
  # At least one side is 0: `or` picks the non-zero rating (or 0 if neither is known).
  return blackElo or whiteElo
  
def getVal(pgnMetadataStr, isNumber=True):
  """Extract the quoted value from a PGN header line such as `[WhiteElo "1500"]`.

  Args:
    pgnMetadataStr: One header line; the value is the text between the first
      pair of double quotes.
    isNumber: When True the value is parsed as an int, otherwise it is
      returned as text.

  Returns:
    int when `isNumber` is True: the parsed number, or 0 when the value is
    only the "?" placeholder (or empty). Otherwise str: the value itself, or
    "Unknown Opening" when the value is exactly "?".

  Raises:
    IndexError: the line contains no double-quoted value.
    ValueError: `isNumber` is True and the value is not an integer.
  """
  result = pgnMetadataStr.split('"')[1]
  if isNumber:
    # Drop "?" markers only at the edges. Replacing every "?" with "0" would
    # turn a value like "1500?" into 15000 instead of 1500.
    digits = result.strip("?")
    return int(digits) if digits else 0
  # Only a value that is exactly "?" means "unknown"; a "?" inside a real name is kept.
  return "Unknown Opening" if result == "?" else result

def processGame(readableStuff):
  """Read the next game from a PGN text stream and summarise it.

  Lines are consumed until a movetext line (one starting with "1.") is found.
  The `WhiteElo`, `BlackElo` and `Opening` headers seen on the way are parsed
  with `getVal`; every other line is ignored.

  Args:
    readableStuff: Object with a `readline()` method returning text lines and
      "" at end of file.

  Returns:
    `[averageRating, opening]` for the game, or None once the stream is exhausted.

  Raises:
    AssertionError: movetext was reached before all three headers were seen.
  """
  # Start from "not seen yet" so a game with a missing header is reported by the
  # explicit check below instead of failing with UnboundLocalError.
  whiteElo, blackElo, opening = None, None, None

  while True:
    currentLine = readableStuff.readline().lstrip("\ufeff")
    if currentLine == "":
      return None #EOF
    currentLine = currentLine.strip() #Still data here, proceed with processing.

    if currentLine.startswith("1."):
      # Compare against None rather than truthiness: a rating of 0 (unknown, "?")
      # is a legitimate parsed value that getAverageRating knows how to handle.
      headersSeen = (whiteElo is not None) and (blackElo is not None) and (opening is not None)
      assert headersSeen, "Game seems to be incomplete!, Unable to extract ELO Properly!"
      return [getAverageRating(whiteElo, blackElo), opening]

    elif currentLine.startswith("[WhiteElo"):
      whiteElo = getVal(currentLine)
    elif currentLine.startswith("[BlackElo"):
      blackElo = getVal(currentLine)
    elif currentLine.startswith("[Opening"):
      opening = getVal(currentLine, isNumber=False)

## Actual Processing

Decompression: https://python-zstandard.readthedocs.io/en/latest/decompressor.html#zstddecompressionreader

In [4]:
progressBar = tqdm(total=lichessDataLength)

print("Initializing file...")
with open(filename, "rb") as zstfile:
  # Decompress on the fly and expose the result as UTF-8 text, so games can be
  # read line by line straight from the compressed archive.
  decompressor = ZstdDecompressor()
  reader = decompressor.stream_reader(zstfile)
  textStream = TextIOWrapper(reader, encoding='utf-8')

  # opening name -> list of average ratings, one entry per game
  gamesDict = {}

  # The bar is advanced in batches of `updateEvery`; `counter` holds the games
  # seen since the last batch update.
  updateEvery = 10000
  counter = 0

  print("Starting data processing...")

  # Author's note: most of the time is spent in the zstandard library, which is
  # quite slow compared to the CLI version, so don't bother optimizing the rest.
  while (result := processGame(textStream)) is not None:
    avgRating, opening = result
    gamesDict.setdefault(opening, []).append(avgRating)

    counter += 1
    if counter == updateEvery:
      progressBar.update(updateEvery)
      counter = 0

  # Account for the final, partial batch.
  progressBar.update(counter)
  # presumably gives the notebook widget time to render the last update - TODO confirm
  sleep(1)
  progressBar.close()
  print("Finally Done!")

  0%|          | 0/94922297 [00:00<?, ?it/s]

Initializing file...
Starting data processing...
Finally Done!


### Sorting and Saving

In [5]:
print('Sorting...')
# Swap every rating list for an ascending copy. Only values of existing keys are
# reassigned, so the dict keeps its size while it is being iterated.
for opening in gamesDict:
  gamesDict[opening] = sorted(gamesDict[opening])

Sorting...


In [6]:
def writeJson(path:str, content):
  """Serialise `content` as JSON (keys sorted) into the file at `path`.

  Missing parent directories are created first, with a notice printed when
  that happens. An existing file is overwritten.

  Args:
    path: Destination file path.
    content: Any JSON-serialisable object.
  """
  # os.path.dirname is "" for a bare file name. Splitting on "/" instead would
  # hand back the file path itself and end up creating a directory with the
  # file's name.
  parentDir = os.path.dirname(path)
  if parentDir and not os.path.isdir(parentDir):
    os.makedirs(parentDir, exist_ok=True)
    print("Path is not found. Creating new directories.")
  with open(path, "w") as f:
    json.dump(content, f, sort_keys=True)

In [7]:
# Name the dump after the dataset month and write the opening -> ratings mapping to it.
resultFilename = f"openings_{dataMonth}.json"
writeJson(resultFilename, gamesDict)
print("Successfully dumped processed data.")


Successfully dumped processed data.


In [8]:
# Compress the JSON dump into "<resultFilename>.zip"; -9 requests zip's strongest compression level.
!zip "$resultFilename".zip "$resultFilename" -9

updating: openings_2023-10.json (deflated 99%)


# Merge Data
 - Since there are thousands of opening categories, we merge every variation into its parent opening (the part of the name before the first `:`), which makes manual review feasible.

In [4]:
# Source archive the processed data came from; change this to work on a different month.
url = "https://database.lichess.org/standard/lichess_db_standard_rated_2023-10.pgn.zst"
# How many games the archive contains (progress bar total).
lichessDataLength = 94922297

# The local file name is the last path segment of the URL.
filename = url.rsplit("/", 1)[-1]
# The month tag is whatever follows the final "_" in the part of the name before the first ".".
dataMonth = filename.partition(".")[0].rpartition("_")[2]

In [5]:
import os, json
from tqdm import tqdm

In [2]:
def mergeCategories(gamesDict):
  """Merge opening variations into their parent opening.

  The parent name is the part of the opening name before the first ":".
  Names without a ":" are kept as they are.

  Args:
    gamesDict: Mapping of opening name -> list of ratings.

  Returns:
    A new dict mapping parent opening name -> list of all ratings of its
    variations, concatenated in the iteration order of `gamesDict`. The lists
    are newly created; `gamesDict` and its lists are left unmodified.
  """
  newDict = {}
  for opening, eloList in tqdm(gamesDict.items()):
    newName = opening.split(':')[0]
    # Always extend a list owned by newDict. Storing `eloList` itself would
    # alias the caller's list, and later extends would then grow the input data.
    newDict.setdefault(newName, []).extend(eloList)
  return newDict

In [6]:
# Load the per-variation dump written by the extraction step, then merge it.
# Alternative when the extraction cells ran in this same kernel:
#   merged = mergeCategories(gamesDict)
with open(f'openings_{dataMonth}.json', "r") as file:
  perVariation = json.load(file)
merged = mergeCategories(perVariation)

100%|██████████| 2977/2977 [00:01<00:00, 1764.92it/s]


In [7]:
folderName = f"mergedOpenings_{dataMonth}"

# Create the output folder if needed. An existing folder is reused, while a
# regular file with the same name fails here instead of later inside the loop.
os.makedirs(folderName, exist_ok=True)

# One JSON file per merged opening, named after the opening itself.
for opening, eloList in tqdm(merged.items()):
  with open(f'{folderName}/{opening}.json', "w") as f:
    json.dump(eloList, f)

  0%|          | 0/156 [00:00<?, ?it/s]

100%|██████████| 156/156 [01:27<00:00,  1.79it/s]


In [8]:
# Recursively (-r) archive the whole output folder into "<folderName>.zip"; -9 requests zip's strongest compression level.
! zip "$folderName".zip -r "$folderName" -9

  adding: mergedOpenings_2023-10/ (stored 0%)
  adding: mergedOpenings_2023-10/System.json (deflated 86%)
  adding: mergedOpenings_2023-10/Czech Defense.json (deflated 99%)
  adding: mergedOpenings_2023-10/Slav Indian.json (deflated 97%)
  adding: mergedOpenings_2023-10/Neo-Grünfeld Defense.json (deflated 92%)
  adding: mergedOpenings_2023-10/Danish Gambit Accepted.json (deflated 97%)
  adding: mergedOpenings_2023-10/Kangaroo Defense.json (deflated 93%)
  adding: mergedOpenings_2023-10/Elephant Gambit.json (deflated 99%)
  adding: mergedOpenings_2023-10/Amar Gambit.json (deflated 35%)
  adding: mergedOpenings_2023-10/Blackmar Gambit.json (deflated 99%)
  adding: mergedOpenings_2023-10/Borg Defense.json (deflated 97%)
  adding: mergedOpenings_2023-10/Queen's Pawn, Mengarini Attack.json (deflated 66%)
  adding: mergedOpenings_2023-10/Sicilian Defense.json (deflated 99%)
  adding: mergedOpenings_2023-10/Zukertort Opening.json (deflated 99%)
  adding: mergedOpenings_2023-10/Catalan Opening