In [2]:
import re

import pandas as pd
import numpy as np

from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_notebook, show

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

output_notebook()

Loading the NumPy arrays from files

In [3]:
# Read the three saved arrays in one pass; the order of the paths matches
# the order of the names being assigned.
avgCntPwnLoss, timecontrol, rating = (
    np.load(path)
    for path in ('chess/avgLoss.npy', 'chess/timeControl.npy', 'chess/rating.npy')
)



#### Assigning values to the different time controls (the time control boundaries are taken from lichess.org)
1. bullet (between 0-3 minutes)
2. blitz (3-8 minutes)
3. rapid (8-25 minutes)
4. classical (25+ minutes)
5. correspondence (no time control)

In [4]:
def getCategory(base, increment):
    """Map a time control to its numeric time category.

    The expected game length is estimated as the base time plus 40
    increments (a game is assumed to last 40 moves) and converted from
    seconds to minutes.

    base      -- starting clock time, in seconds
    increment -- time added per move, in seconds

    Returns 1 (under 3 minutes), 2 (under 8 minutes), 3 (under 25 minutes)
    or 4 (everything else).
    """
    estimatedMinutes = (base + increment*40)/60
    # Exclusive upper bounds, in minutes, of categories 1, 2 and 3.
    for category, upperBound in enumerate((3, 8, 25), start=1):
        if estimatedMinutes < upperBound:
            return category
    # No bound matched, so the game falls in the last category.
    return 4

In [5]:
# One integer category per entry of `timecontrol`; filled in by the loop below.
categoricalTime = np.zeros_like(timecontrol, dtype=int)

# Raw string literal: in a plain literal '\d' and '\+' are invalid escape
# sequences, which newer Python versions warn about. The regex is unchanged.
pattern = r'(\d+)\+(\d+)'
# examples
# '180+3' - 3 minutes game, with 3 seconds of increment
# '600+0' - 10 minutes game, no increment
# '-'     - correspondence game, no timelimit
for idx, control in enumerate(timecontrol):
    if '+' in control:
        found = re.search(pattern, control)
        if found is None:
            # Fail with a readable message instead of an IndexError on an
            # empty match list.
            raise ValueError(
                'unrecognised time control at index %d: %r' % (idx, control))
        base = int(found.group(1))  # seconds
        increment = int(found.group(2))  # seconds
        categoricalTime[idx] = getCategory(base, increment)
    else:  # correspondence games
        categoricalTime[idx] = 5
print('Lets see, what kind of games we got:')
print(np.unique(categoricalTime))



Lets see, what kind of games we got:
[1 2 3 4 5]


In [6]:
# Sanity check: show the length of each loaded array side by side.
tuple(len(arr) for arr in (avgCntPwnLoss, rating, timecontrol))

(166676, 166676, 83338)

We can see that the array of time controls contains half as many values as the other two. This is because every chess game has a single time control (both players have the same time),
but every game produces two average centipawn losses, one for each player, and is played between two players with different ratings. Let's repeat every element of our array twice.

In [7]:
# Repeat every per-game category twice so there is one entry per player.
# np.repeat duplicates element-wise ([a, b] -> [a, a, b, b]); this presumably
# matches the player ordering in `rating` -- TODO confirm against the export.
# The guard keeps the cell idempotent: re-running it must not double the
# array a second time.
if len(categoricalTime) != len(rating):
    categoricalTime = np.repeat(categoricalTime, 2)
assert len(categoricalTime) == len(rating) == len(avgCntPwnLoss), \
    'time controls, ratings and average losses must have the same length'

Creating pandas dataframe from arrays

In [8]:
# (column name, values) pairs; the list order defines the column order.
database = [  ('rating', rating),
             ('timeControl', categoricalTime),
             ('averageLoss', avgCntPwnLoss),
             ]
# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0.
# Building from a dict works on old and new pandas alike; passing `columns`
# pins the column order explicitly.
df = pd.DataFrame(dict(database), columns=[name for name, values in database])

df.head(3)

Unnamed: 0,rating,timeControl,averageLoss
0,1348,2,89
1,1354,2,34
2,2047,2,123


Creating per-category ratings and losses based on the time control

In [9]:
# Build one boolean mask per distinct time category (np.unique returns the
# categories in ascending order), then slice both arrays with the same masks
# so ratings and losses stay aligned.
timeCategories = np.unique(categoricalTime)
categoryMasks = [categoricalTime == timeCategory for timeCategory in timeCategories]

catRatings = [rating[mask].astype(int) for mask in categoryMasks]
catAvgLosses = [avgCntPwnLoss[mask].astype(int) for mask in categoryMasks]


In [10]:
# Ratings at which every fitted line is evaluated: 10 evenly spaced points
# between 900 and 2100.
linearX = np.linspace(900,2100,10)
# scikit-learn expects a 2-D feature matrix, hence the (n, 1) column shapes.
linearX_scikit = linearX.reshape(-1,1)
scikitRatings = [catRating.reshape(-1, 1) for catRating in catRatings]

# A single estimator is re-fitted for each category; only its predictions on
# `linearX_scikit` are kept.
regr = linear_model.LinearRegression()
Predictions = []
for ratingColumn, losses in zip(scikitRatings, catAvgLosses):
    regr.fit(ratingColumn, losses)
    Predictions.append(regr.predict(linearX_scikit))

In [11]:
# Palette of five (red, green, blue) triples; every channel here is a float
# between 0 and 1.
RGBs = [(0.86, 0.3712, 0.33999999999999997),
 (0.7247999999999999, 0.86, 0.33999999999999997),
 (0.33999999999999997, 0.86, 0.5792000000000002),
 (0.33999999999999997, 0.5167999999999995, 0.86),
 (0.7871999999999999, 0.33999999999999997, 0.86)]


Some RGB colors I like; we need to change their format
to be Bokeh-compatible.

In [12]:
# Scale each channel from the 0-1 range to 0-255, keeping one list per channel.
reds, greens, blues = (
    [rgb[channel]*255 for rgb in RGBs] for channel in range(3)
)
# int() truncates the scaled value before it is formatted as two hex digits.
colors = [
    "#%02x%02x%02x" % tuple(int(value) for value in scaledRgb)
    for scaledRgb in zip(reds, greens, blues)
]

colors

['#db5e56', '#b8db56', '#56db93', '#5683db', '#c856db']

In [13]:
titles = ['Bullet', 'Blitz', 'Rapid', 'Classical', 
          'Correspondence','Correlation']
width, height = 300, 250
cSize = 2

# Creating 6 sub figures: one scatter panel per time category plus a final
# summary panel. `height` replaces `plot_height`, which newer Bokeh releases
# no longer accept, and matches the `width` keyword used beside it.
grid = [
    figure(width=width, height=height, title=title,
           x_axis_label='rating', y_axis_label='average centipawn loss')
    for title in titles
]

# plotting the categorical information (every figure except the last one)
for idx, subplot in enumerate(grid[:-1]):
    subplot.circle(catRatings[idx], catAvgLosses[idx], size = cSize, color=colors[idx], alpha=0.5)
# plotting the summary: all fitted lines share the last figure.
# `legend_label` replaces the `legend` glyph keyword, which newer Bokeh
# releases no longer accept (requires Bokeh >= 1.4).
for idx, prediction in enumerate(Predictions):
    grid[-1].line(linearX, prediction, color=colors[idx], alpha=0.8, line_width=4, legend_label=titles[idx])


p = gridplot([[grid[0], grid[1]],
              [grid[2], grid[3]],
              [grid[4], grid[5]]])
show(p)