In [6]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Stan program for a race-aware pairwise skill model.
# Every one of the N players has three latent skills (skillZ, skillP, skillT),
# each with an independent normal(0, 3) prior.  For game i the outcome win[i] is
# modelled as bernoulli_logit(scale * (skill of PA[i] - skill of PB[i])), where
# the skill vector used for each side is selected by its race code:
# 0 -> skillZ, 1 -> skillP, 2 -> skillT.
# PA/PB are used directly as Stan indices, so they must be 1-based player ids.
# A game whose RA/RB code is outside 0..2 matches no branch and therefore adds
# nothing to the likelihood.
skill_model = """
data {
  int<lower=1> N;             // Total number of players
  int<lower=1> E;             // number of games
  real<lower=0> scale;        // scale value for probability computation
  int<lower=0,upper=1> win[E]; // PA wins vs PB
  int PA[E];                  // player info between each game
  int PB[E];                  // 
  int RA[E];                  // player's race between each game
  int RB[E];                  // 
  
}
parameters {
  vector [N] skillZ;           // one player may use different races
  vector [N] skillP;           // the result may various acoording to different races
  vector [N] skillT; 
}

model{
  for (i in 1:N){ skillZ[i]~normal(0,3); skillP[i]~normal(0,3); skillT[i]~normal(0,3); }
  
  for (i in 1:E){
    if (RA[i] == 0){
        if (RB[i] == 0){
            win[i] ~ bernoulli_logit( (scale)*(skillZ[PA[i]]-skillZ[PB[i]]) );}
        else if (RB[i] == 1){
            win[i] ~ bernoulli_logit( (scale)*(skillZ[PA[i]]-skillP[PB[i]]) );}
        else if (RB[i] == 2){
            win[i] ~ bernoulli_logit( (scale)*(skillZ[PA[i]]-skillT[PB[i]]) );}}
            
    else if (RA[i] == 1){
        if (RB[i] == 0){
            win[i] ~ bernoulli_logit( (scale)*(skillP[PA[i]]-skillZ[PB[i]]) );}
        else if (RB[i] == 1){
            win[i] ~ bernoulli_logit( (scale)*(skillP[PA[i]]-skillP[PB[i]]) );}
        else if (RB[i] == 2){
            win[i] ~ bernoulli_logit( (scale)*(skillP[PA[i]]-skillT[PB[i]]) );}}
            
    else if (RA[i] == 2){
        if (RB[i] == 0){
            win[i] ~ bernoulli_logit( (scale)*(skillT[PA[i]]-skillZ[PB[i]]) );}
        else if (RB[i] == 1){
            win[i] ~ bernoulli_logit( (scale)*(skillT[PA[i]]-skillP[PB[i]]) );}
        else if (RB[i] == 2){
            win[i] ~ bernoulli_logit( (scale)*(skillT[PA[i]]-skillT[PB[i]]) );}}
  }   // win probability is a logit function of skill difference
}
"""

In [8]:
import pickle
# Compile the Stan program on every run (no load-from-pickle shortcut is used),
# then pickle the compiled model to 'skill_race.pkl'.
sm = pystan.StanModel(model_code=skill_model)
with open('skill_race.pkl', 'wb') as f:
    pickle.dump(sm, f)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_991df902cb5e9dd38bebf52a4b126a89 NOW.


In [9]:
def load_data_train(path):
    """Load match records from a CSV file and return a sparsified list of games.

    Only rows that split into exactly 10 comma-separated fields are used.
    Fields 1 and 4 are the names of player A and player B, fields 2 and 5 equal
    '[winner]' for the winning side, and fields 6 and 7 are the race letters of
    player A and player B.

    Players are numbered in order of first appearance in this file, over every
    10-field row (including rows whose game is later discarded).

    A game is discarded when either race letter is not 'Z', 'P' or 'T' (this
    covers the 'R' entries), or when it fails the sparsifying rule in the loop.
    Checking the race *before* recording anything keeps PA, PB, RA, RB and win
    the same length.

    Args:
        path: path of the UTF-8 encoded CSV file.

    Returns:
        A tuple (nplayers, games, PA, PB, RA, RB, win, nplays, nwins):
        nplayers: number of distinct player names.
        games:    number of games kept.
        PA, PB:   1-based player ids of side A / side B, one per kept game.
        RA, RB:   race codes of side A / side B (0 = Z, 1 = P, 2 = T).
        win:      1 if side A is marked '[winner]', otherwise 0.
        nplays:   (nplayers, nplayers) symmetric array of kept games per pair.
        nwins:    (nplayers, nplayers) array; nwins[x, y] counts the kept games
                  between x and y in which x is marked '[winner]'.
    """
    race_code = {'Z': 0, 'P': 1, 'T': 2}

    with open(path, encoding='utf-8') as f:
        rows = [line.split(',') for line in f.read().split('\n')]
    # Anything that is not exactly 10 fields is a parse error or a blank line.
    rows = [fields for fields in rows if len(fields) == 10]

    playerid = {}
    for fields in rows:
        for name in (fields[1], fields[4]):
            if name not in playerid:
                playerid[name] = len(playerid)
    nplayers = len(playerid)

    # Sparsifying parameters (discard some training examples):
    pKeep = 1.0   # fraction of games to consider at all
    nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
    nKeep = 5     # keep at most nKeep games per pair of opponents

    nplays = np.zeros((nplayers, nplayers))
    nwins = np.zeros((nplayers, nplayers))
    PA, PB, win, RA, RB = [], [], [], [], []

    for fields in rows:
        ra, rb = fields[6], fields[7]
        if ra not in race_code or rb not in race_code:
            continue
        if np.random.rand() >= pKeep:
            continue

        a, b = playerid[fields[1]], playerid[fields[4]]
        if nplays[a, b] >= nKeep:
            continue
        a_needs_opponents = (nplays[a, :] > 0).sum() < nEdge
        b_needs_opponents = (nplays[:, b] > 0).sum() < nEdge
        if not (a_needs_opponents or b_needs_opponents):
            continue

        aw, bw = fields[2] == '[winner]', fields[5] == '[winner]'
        nplays[a, b] += 1
        nplays[b, a] += 1
        nwins[a, b] += aw
        nwins[b, a] += bw

        # The +1 turns the 0-based ids into the 1-based ids appended to PA/PB.
        PA.append(a + 1)
        PB.append(b + 1)
        RA.append(race_code[ra])
        RB.append(race_code[rb])
        win.append(1 if aw else 0)

    games = len(win)
    return nplayers, games, PA, PB, RA, RB, win, nplays, nwins

In [10]:
# Parse the training games from train.csv: PA/PB hold 1-based player ids,
# RA/RB hold race codes, win holds 0/1 outcomes for side A.
nplayers, games, PA, PB, RA, RB, win, nplays, nwins = load_data_train('train.csv')

In [11]:
# Quick consistency check: the per-game lists are printed next to `games`
# so that a length mismatch is easy to spot.
print('summary: ')
for label, value in (
    ('# players', nplayers),
    ('# games', games),
    ('player A', len(PA)),
    ('win', len(win)),
    ('raceA', len(RA)),
    ('raceB', len(RB)),
):
    print(label, value)

summary: 
# players 999
# games 4678
player A 4678
win 4678
raceA 4678
raceB 4678


In [12]:
# Inputs for the Stan `data` block; the keys must match the names declared there.
skill_data = dict(
    N=nplayers,
    E=games,
    scale=0.3,
    win=win,
    PA=PA,
    PB=PB,
    RA=RA,
    RB=RB,
)

In [15]:
# Sample from the posterior: 4 chains, iter=1000 per chain, with the sampler's
# max_treedepth control raised to 20.
fit = sm.sampling(data=skill_data, iter=1000, chains=4, control={'max_treedepth': 20})

To run all diagnostics call pystan.check_hmc_diagnostics(fit)


In [16]:
# Extract the posterior draws; the result is keyed by parameter name
# ('skillZ', 'skillP', 'skillT') and later indexed as samples[name][:, player].
samples = fit.extract()

In [17]:
# Display the extracted draws.
# NOTE(review): this echoes every array in the mapping; showing only the array
# shapes would keep the notebook output small.
samples

OrderedDict([('skillZ',
              array([[-1.04369011,  2.49034347,  7.72757056, ..., -3.59094636,
                      -4.33113435,  1.91306336],
                     [ 1.58490502, -3.53999232, 10.85916225, ..., -2.98336798,
                      -3.35985458,  2.2339261 ],
                     [-1.33736422,  2.28311497, 10.41871784, ..., -5.34319065,
                      -4.4041443 , -1.00773383],
                     ...,
                     [-0.10844806,  1.02525319,  8.50297368, ..., -1.02805634,
                       3.32940709,  3.16862455],
                     [-0.18954174,  2.59445652,  7.25754657, ..., -3.65008953,
                      -4.22417414, -3.12965132],
                     [-0.15137722,  1.26812582,  3.63545775, ..., -6.82606398,
                       9.36796995,  1.78350764]])),
             ('skillP',
              array([[ 4.54922377,  1.77998196, -2.43677773, ..., -0.28078347,
                       0.95094166, -0.77351618],
                     [ 4.21

In [22]:
# Player 0 vs Player 1 prediction:
def logit(z):
    """Return 1 / (1 + exp(-z)), evaluated element-wise for array input.

    Despite its name this is the logistic (inverse-logit) function: it maps a
    real-valued score to a value between 0 and 1.
    """
    denominator = 1. + np.exp(-z)
    return 1. / denominator

# Apply the model's win-probability function (logistic of the scaled skill
# difference) to every posterior draw, then average over the draws.
# Here: the skillT column at index 0 against the skillP column at index 1.
prob = logit(
    skill_data['scale'] * (samples['skillT'][:, 0] - samples['skillP'][:, 1])
).mean()

print(prob)

0.2404427721690058


In [19]:
# Persist the extracted posterior draws so they can be reloaded without resampling.
with open('skill_race_depth-200.pkl', 'wb') as f:
    pickle.dump(samples, f)

In [23]:
def load_data_valid(path):
    """Load validation match records from a CSV file as a sparsified game list.

    Only rows that split into exactly 10 comma-separated fields are used.
    Fields 1 and 4 are the names of player A and player B, fields 2 and 5 equal
    '[winner]' for the winning side, and fields 6 and 7 are the race letters of
    player A and player B.

    Players are numbered in order of first appearance in *this* file.
    NOTE(review): the numbering is rebuilt from scratch here, so these ids match
    ids derived from a different file only if both files introduce the players
    in the same order -- confirm this before using the ids to index skills that
    were fitted on other data.

    A game is discarded when either race letter is not 'Z', 'P' or 'T' (this
    covers the 'R' entries), or when it fails the sparsifying rule in the loop.
    The race is validated before anything is recorded, so PA, PB, RA, RB and
    win always have the same length.

    Args:
        path: path of the UTF-8 encoded CSV file.

    Returns:
        A tuple (nplayers, games, PA, PB, RA, RB, win, nplays, nwins):
        nplayers: number of distinct player names.
        games:    number of games kept.
        PA, PB:   1-based player ids of side A / side B, one per kept game.
        RA, RB:   race codes of side A / side B (0 = Z, 1 = P, 2 = T).
        win:      1 if side A is marked '[winner]', otherwise 0.
        nplays:   (nplayers, nplayers) symmetric array of kept games per pair.
        nwins:    (nplayers, nplayers) array; nwins[x, y] counts the kept games
                  between x and y in which x is marked '[winner]'.
    """
    race_code = {'Z': 0, 'P': 1, 'T': 2}

    with open(path, encoding='utf-8') as f:
        lines = f.read().split('\n')

    # Keep only well-formed records (10 fields); the rest are parse errors or blanks.
    records = []
    for line in lines:
        fields = line.split(',')
        if len(fields) == 10:
            records.append(fields)

    playerid = {}
    for fields in records:
        playerid.setdefault(fields[1], len(playerid))
        playerid.setdefault(fields[4], len(playerid))
    nplayers = len(playerid)

    # Sparsifying parameters (discard some examples):
    pKeep = 1.0   # fraction of games to consider at all
    nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
    nKeep = 5     # keep at most nKeep games per pair of opponents

    nplays = np.zeros((nplayers, nplayers))
    nwins = np.zeros((nplayers, nplayers))
    PA, PB, win, RA, RB = [], [], [], [], []

    for fields in records:
        ra, rb = fields[6], fields[7]
        if ra not in race_code or rb not in race_code:
            continue
        if not (np.random.rand() < pKeep):
            continue

        a, b = playerid[fields[1]], playerid[fields[4]]
        pair_has_room = nplays[a, b] < nKeep
        either_needs_opponents = (
            (nplays[a, :] > 0).sum() < nEdge or (nplays[:, b] > 0).sum() < nEdge
        )
        if not (pair_has_room and either_needs_opponents):
            continue

        aw, bw = fields[2] == '[winner]', fields[5] == '[winner]'
        nplays[a, b] += 1
        nplays[b, a] += 1
        nwins[a, b] += aw
        nwins[b, a] += bw

        # The +1 turns the 0-based ids into the 1-based ids appended to PA/PB.
        PA.append(a + 1)
        PB.append(b + 1)
        RA.append(race_code[ra])
        RB.append(race_code[rb])
        win.append(1 if aw else 0)

    games = len(win)
    return nplayers, games, PA, PB, RA, RB, win, nplays, nwins

In [24]:
# Parse the validation games from valid.csv.
# NOTE: this rebinds nplayers, games, PA, PB, RA, RB and win, so the values
# loaded from train.csv are no longer available under these names afterwards.
nplayers, games, PA, PB, RA, RB, win, vnplays, vnwins = load_data_valid('valid.csv')

In [26]:
# Size of the validation set; the last line is the length of the race list.
print('summary: ')
for label, value in (('# players', nplayers), ('# games', games)):
    print(label, value)

print(len(RA))

summary: 
# players 999
# games 4771
4771


In [None]:
def check(real_prob):
    """Unfinished placeholder.

    The original cell contained only this header, without a colon or a body,
    and carries no execution count.  It is made syntactically valid so the
    notebook can run top to bottom; calling it fails loudly rather than
    guessing at the intended behaviour.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("check() was never implemented")

In [33]:
# Misclassification rate of the fitted skills on the loaded games.
# For each game the predicted probability that side A wins is the posterior
# mean of logit(scale * (skill_A - skill_B)); the game counts as an error when
# "predicted probability > 0.5" disagrees with the recorded outcome.
# NOTE(review): PA/PB are used to index the posterior draws in `samples`, so
# they must follow the same player numbering as the data the model was fitted
# on -- confirm that the loader used for these games assigns matching ids.
vnplays = vnplays.astype(np.int64)
vnwins = vnwins.astype(np.int64)

# Posterior draws selected by race code (0 = Z, 1 = P, 2 = T), replacing the
# nine copy-pasted race-pair branches.
skill_by_race = {0: samples['skillZ'], 1: samples['skillP'], 2: samples['skillT']}

bi_loss = 0.
for i in range(games):
    # PA/PB are 1-based ids, the sample columns are 0-based.
    skill_a = skill_by_race[RA[i]][:, PA[i] - 1]
    skill_b = skill_by_race[RB[i]][:, PB[i] - 1]

    real_prob = win[i]  # recorded outcome: 1 if side A won, else 0
    predicted_prob = logit(skill_data['scale'] * (skill_a - skill_b)).mean()

    # One threshold for every race pairing (the branches previously mixed
    # `> 0.5` and `>= 0.5`); an exact 0.5 is treated as predicting a loss for A.
    bi_loss += np.logical_xor(real_prob, predicted_prob > 0.5)

bi_loss /= games

In [34]:
# Fraction of scored games whose thresholded prediction disagrees with the outcome.
bi_loss

0.4514776776357158