In [1]:
import re
import chess.pgn  
import numpy as np


#### The python-chess library is needed to parse the PGN file. Install it with:

`pip install python-chess`

In [13]:
# Open the unzipped lichess dump; adjust the path to wherever the file lives locally.
# The encoding is passed explicitly so that reading the text does not depend on
# the platform's default locale. The handle is intentionally left open because
# the following cells keep reading games from it.
pgn = open("/media/matyi/Data/chess/lichess_db_standard_rated_2017-12.pgn", encoding="utf-8")

#### You can get the database from the Lichess website: https://database.lichess.org/

Download it and unzip it (mine grew to 32 GB once extracted).

In [17]:
# Pull the next game off the open PGN handle and show it as text, keeping the
# headers, the variations and the move comments in the exported string.
game = chess.pgn.read_game(pgn)
exporter = chess.pgn.StringExporter(
    headers=True,
    variations=True,
    comments=True,
)
pgn_string = game.accept(exporter)

print(pgn_string)

[Event "Rated Blitz game"]
[Site "https://lichess.org/QARvvsDn"]
[Date "????.??.??"]
[Round "?"]
[White "senay4444"]
[Black "rudihori"]
[Result "1-0"]
[UTCDate "2017.12.02"]
[UTCTime "23:50:19"]
[WhiteElo "1240"]
[BlackElo "1293"]
[WhiteRatingDiff "+12"]
[BlackRatingDiff "-15"]
[ECO "C41"]
[Opening "Philidor Defense: Exchange Variation #2"]
[TimeControl "300+0"]
[Termination "Time forfeit"]

1. e4 { [%clk 0:05:00] } 1... e5 { [%clk 0:05:00] } 2. Nf3 { [%clk 0:04:58] }
2... d6 { [%clk 0:04:55] } 3. d4 { [%clk 0:04:54] } 3... exd4
{ [%clk 0:04:48] } 4. Nxd4 { [%clk 0:04:52] } 4... Bd7 { [%clk 0:04:45] } 5.
Bc4 { [%clk 0:04:51] } 5... Nf6 { [%clk 0:04:38] } 6. Nf3 { [%clk 0:04:39] }
6... Nxe4 { [%clk 0:04:31] } 7. Nbd2 { [%clk 0:04:32] } 7... Qe7
{ [%clk 0:04:24] } 8. Nxe4 { [%clk 0:04:24] } 8... Qxe4+ { [%clk 0:04:20] } 9.
Qe2 { [%clk 0:04:22] } 9... Qxe2+ { [%clk 0:04:14] } 10. Bxe2
{ [%clk 0:04:22] } 10... a6 { [%clk 0:04:10] } 11. Bc4 { [%clk 0:04:16] } 11...
Be6 { [%clk 0:04:05] } 12

In [14]:
%%time
i=0
Ratings = []
AvgCentiPawnLoss = []
TimeControl = []

pattern ='\[%eval (-?#?.{1,6})]'

while 1:
    i += 1
    if i % 50000 == 0: print(i)
    if i > 1000000: break
    game = chess.pgn.read_game(pgn)
    exporter = chess.pgn.StringExporter(headers=True, variations=True, comments=True)
    pgn_string = game.accept(exporter)
    
    # ~15% percent of the matches contain evaluations
    if "[%eval" in pgn_string:
        
        blackELO =game.headers["BlackElo"]
        whiteELO =game.headers["WhiteElo"]
        
        scores = re.findall(pattern, pgn_string)
        for idx, score in enumerate(scores):
            if '#-' in score: # black mates in 5 :   #-5
                scores[idx] = -10
            elif '#' in score: # white mates in 2 :   #2
                scores[idx] = 10
        scoreNp = np.array([float(score) for score in scores])
        
        # clip evaluation at +-10
        newScore = np.clip(scoreNp, -10, 10)   
        # The important data is in the change of evaluation
        difference = np.diff(newScore)
        
        # every second evaluation belongs to one player
        whiteLoss = difference[1::2]
        blackLoss  = difference[0::2]
        
        whiteAvgLoss = int(-100 * np.mean(whiteLoss))
        blackAvgLoss = int(100 * np.mean(blackLoss))
        
        Ratings.append(blackELO)
        Ratings.append(whiteELO)
        AvgCentiPawnLoss.append(blackAvgLoss)
        AvgCentiPawnLoss.append(whiteAvgLoss)
        TimeControl.append(game.headers['TimeControl'])

50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
CPU times: user 1h 27min 27s, sys: 4.54 s, total: 1h 27min 32s
Wall time: 1h 27min 36s


#### Save the data so that it can be reloaded from file in the next notebook, where we explore it in more depth.

In [15]:
# Persist the three collected lists as .npy files, one file per list.
_outputs = (
    ("avgLoss.npy", AvgCentiPawnLoss),
    ("rating.npy", Ratings),
    ("timeControl.npy", TimeControl),
)
for _filename, _values in _outputs:
    np.save(_filename, _values)

Let's take a look at our data

In [12]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

# Render Bokeh output inline in the notebook.
output_notebook()

# The ratings were read from the PGN headers, so they may still be strings;
# convert them to integers so they are plotted on a numeric axis.
ratings_numeric = [int(rating) for rating in Ratings]

p = figure(title="Accuracy of chess games based on rating",
           x_axis_label='ELO(glicko2) rating', y_axis_label='Average centipawn loss')

# One tiny marker per player-game. `legend_label` replaces the `legend`
# keyword, which current Bokeh releases no longer accept; scatter draws
# circle markers by default.
p.scatter(ratings_numeric, AvgCentiPawnLoss, legend_label="games", size=1)

show(p)

#### At first glance the plot looks reasonable: for better players (with higher ratings) the accuracy is higher (the average loss is lower). Check out the next Jupyter notebook.