In [1]:
import yaml
import os
import datetime
import matplotlib.pyplot as plt
import arviz as az
import numpy as np
import pymc3 as pm
import pymc3.distributions.transforms as tr
import shutil
import theano
import theano.tensor as tt
import random
import math
import pandas as pd
from helper import load_data, initialise, get_situation, get_outcome
from model import create_model

# Record the installed PyMC3 version in the notebook output.
print("Running on PyMC3 v{}".format(pm.__version__))

Running on PyMC3 v3.9.3


In [2]:
# Run configuration for this notebook.
start_year, end_year = 2015, 2019     # season bounds handed to load_data
train_flag = 3                        # 1 -> first innings, 2 -> second innings, otherwise both
n_simulation = 10                     # only echoed in this notebook
save_directory = "2015-2019-500-t3"   # folder with the saved parameter files and trace
verbose = True

In [3]:
"""
argumentList = sys.argv 

for arg in argumentList[1:]:
    if arg == "--verbose":
        verbose = True
    elif arg[0] == "-":
        if arg[1] == "n":
            n_simulation = int(arg[2:])
        elif arg[1] == "s":
            start_year = int(arg[2:])
        elif arg[1] == "e":
            end_year = int(arg[2:])
        elif arg[1] == "t":
            train_flag = int(arg[2:])
    else:
        save_directory = arg
            
if save_directory[-1] == "/":
    save_directory = save_directory[:-1]
"""

# Echo the active run configuration, one "label value" line per setting.
for label, value in (
    ("Start Year:", start_year),
    ("End Year:", end_year),
    ("Number of simulations:", n_simulation),
    ("Save Directory:", save_directory),
    ("Train flag:", train_flag),
    ("Verbose:", verbose),
):
    print(label, value)

# Load the data for the configured seasons.
(
    deliveries_data,
    matches,
    first_innings_data,
    second_innings_data,
    both_innings_data,
) = load_data(start_year, end_year)

# The per-innings frames returned above are replaced by slices of the combined frame.
inning_column = both_innings_data["inning"]
first_innings_data = both_innings_data[inning_column == 1]
second_innings_data = both_innings_data[inning_column == 2]

print("First innings data size:", len(first_innings_data))
print("Second innings data size:", len(second_innings_data))

# Choose the training frame from the flag (1 -> first innings, 2 -> second innings,
# anything else -> both innings), then initialise once from that frame.
training_data = (
    first_innings_data if train_flag == 1
    else second_innings_data if train_flag == 2
    else both_innings_data
)

(
    batsmen, bowlers, batsman_index, bowler_index,
    batsman_stats, bowler_stats, X, id1, id2, noballs_and_wides,
) = initialise(training_data)
    
# Total deliveries = entries across the nine X[i] buckets plus the no-ball/wide tally.
noballs_and_wides_count = sum(noballs_and_wides.values())
balls_per_situation = [len(X[situation]) for situation in range(9)]
total_balls = sum(balls_per_situation) + noballs_and_wides_count

print("Number of noballs and wides:", noballs_and_wides_count)
print("Number of balls bowled:", total_balls)

# Situations are reported 1-based.
for number, n_balls in enumerate(balls_per_situation, start=1):
    print("Balls in situation %d: %d" % (number, n_balls))
    
# Build the model object from the player lists, the per-situation index arrays
# (id1, id2) and the per-situation observations X.
model = create_model(batsmen, bowlers, id1, id2, X)
print("Loaded model.")

# Read the pre-trained parameter arrays from the run directory.
cutpoints, mu_1, mu_2, delta = [
    np.loadtxt(save_directory + suffix)
    for suffix in ("/cutpoints.txt", "/mu_1.txt", "/mu_2.txt", "/delta.txt")
]

# Pair every parameter value with its player name and rank in ascending order.
mu_1_sorted = sorted((value, batsmen[pos]) for pos, value in enumerate(mu_1))
mu_2_sorted = sorted((value, bowlers[pos]) for pos, value in enumerate(mu_2))

print("Loaded pre-trained parameters.")

# Outcome probabilities p[i, j, l, k] for batsman i, bowler j, situation l and
# outcome category k (7 categories, 9 situations):
#   cdf[k] = 1 / (1 + exp(-(cutpoints[l, k] - mu_1[i] + mu_2[j] - delta[l]))),  k = 0..5
#   p[0] = cdf[0],   p[k] = cdf[k] - cdf[k-1] for k = 1..5,   p[6] = 1 - cdf[5]
# Computed with NumPy broadcasting instead of a four-level Python loop; the terms
# are combined in the same order as in the scalar formula above.
n_situations, n_outcomes = 9, 7

# One parameter per player is required; a mismatch means the saved files do not
# belong to the currently loaded data set.
assert len(mu_1) == len(batsmen), "mu_1 must hold one value per batsman"
assert len(mu_2) == len(bowlers), "mu_2 must hold one value per bowler"

# Axes: (batsman, bowler, situation, cutpoint).
linear_predictor = (
    cutpoints[np.newaxis, np.newaxis, :n_situations, : n_outcomes - 1]
    - mu_1[:, np.newaxis, np.newaxis, np.newaxis]
    + mu_2[np.newaxis, :, np.newaxis, np.newaxis]
    - delta[np.newaxis, np.newaxis, :n_situations, np.newaxis]
)
cumulative = 1 / (1 + np.exp(-linear_predictor))

# Pad the cumulative values with 0 on the left and 1 on the right; consecutive
# differences along the last axis then yield all seven category probabilities.
pad_shape = cumulative.shape[:-1] + (1,)
p = np.diff(
    np.concatenate([np.zeros(pad_shape), cumulative, np.ones(pad_shape)], axis=-1),
    axis=-1,
)
                    
# pw[k]: share of the no-ball/wide tally stored under key k (k = 0..6).
pw = np.array(
    [float(noballs_and_wides[k]) / noballs_and_wides_count for k in range(7)]
)
# v: fraction of all counted balls that are no-balls or wides.
v = float(noballs_and_wides_count) / total_balls

Start Year: 2015
End Year: 2019
Number of simulations: 10
Save Directory: 2015-2019-500-t3
Train flag: 3
Verbose: True
First innings data size: 36516
Second innings data size: 33673
Number of noballs and wides: 2367
Number of balls bowled: 70207
Balls in situation 1: 10329
Balls in situation 2: 13643
Balls in situation 3: 2657
Balls in situation 4: 47
Balls in situation 5: 3490
Balls in situation 6: 12246
Balls in situation 7: 7
Balls in situation 8: 212
Balls in situation 9: 25209
Loaded model.
Loaded pre-trained parameters.


In [4]:
# Bowler names inspected by the following cells.
lst = ["K Rabada", "RD Chahar", "M Santner", "Rashid Khan"]

In [5]:
# Show the bowler_stats rows whose Name is one of the selected bowlers.
print(bowler_stats[bowler_stats["Name"].isin(lst)])

      M     B  Runs  Wkts      Econ        Avg         SR         Name
89   18   415   540    36  7.807229  15.000000  11.527778     K Rabada
198   4    84    79     4  5.642857  19.750000  21.000000    M Santner
10   46  1095  1205    60  6.602740  20.083333  18.250000  Rashid Khan
45   15   324   359    15  6.648148  23.933333  21.600000    RD Chahar


In [6]:
# For each selected bowler, gather one (situation, X entry, batsman index) tuple
# for every ball whose id2 entry matches that bowler's index.
O = {
    name: [
        (situation, X[situation][pos], id1[situation][pos])
        for situation in range(9)
        for pos, bowler_id in enumerate(id2[situation])
        if bowler_id == bowler_index[name]
    ]
    for name in lst
}

In [7]:
# Runs added for each outcome code; any code not listed here (including 1, which
# is reported as a wicket) adds no runs.
runs_for_outcome = {3: 1, 4: 2, 5: 3, 6: 4, 7: 6}

for bowler in lst:
    print(bowler)
    for situation in range(9):
        # Balls this bowler delivered in the current situation.
        balls = [ball for ball in O[bowler] if ball[0] == situation]
        dismissed = [batsmen[ball[2]] for ball in balls if ball[1] == 1]
        runs = sum(runs_for_outcome.get(ball[1], 0) for ball in balls)
        cnt = len(balls)
        # Runs per six balls; reported as 0 when nothing was bowled.
        econ = runs / cnt * 6 if cnt != 0 else 0
        print("Wickets taken in situation %d:" % (situation + 1), dismissed)
        print("Balls bowled in situation  %d:" % (situation + 1), cnt)
        print("Economy rate in situation  %d:" % (situation + 1), econ)

K Rabada
Wickets taken in situation 1: ['PA Patel', 'BB McCullum', 'SK Raina', 'MA Agarwal', 'AM Rahane']
Balls bowled in situation  1: 100
Economy rate in situation  1: 7.68
Wickets taken in situation 2: ['RG Sharma', 'CA Lynn', 'AB de Villiers', 'RV Uthappa']
Balls bowled in situation  2: 36
Economy rate in situation  2: 5.333333333333333
Wickets taken in situation 3: []
Balls bowled in situation  3: 20
Economy rate in situation  3: 12.0
Wickets taken in situation 4: []
Balls bowled in situation  4: 0
Economy rate in situation  4: 0
Wickets taken in situation 5: ['KD Karthik', 'AS Yadav']
Balls bowled in situation  5: 13
Economy rate in situation  5: 7.384615384615385
Wickets taken in situation 6: ['AJ Finch', 'H Viljoen', 'V Kohli', 'AD Nath', 'HH Pandya', 'STR Binny', 'R Parag', 'PA Patel']
Balls bowled in situation  6: 82
Economy rate in situation  6: 9.073170731707318
Wickets taken in situation 7: []
Balls bowled in situation  7: 0
Economy rate in situation  7: 0
Wickets taken in

In [8]:
# Re-order bowler_stats by the Econ column and rebind the same name to the sorted frame.
bowler_stats = bowler_stats.sort_values(by = "Econ")

In [9]:
# First 20 rows (in the current Econ ordering) among bowlers with M >= 5.
bowler_stats[bowler_stats["M"] >= 5].head(20)

Unnamed: 0,M,B,Runs,Wkts,Econ,Avg,SR,Name
145,5,72,71,2,5.916667,35.5,36.0,CH Gayle
178,7,156,155,13,5.961538,11.923077,12.0,L Ngidi
135,12,258,273,22,6.348837,12.409091,11.727273,MA Starc
84,12,222,241,5,6.513514,48.2,44.4,J Yadav
10,46,1095,1205,60,6.60274,20.083333,18.25,Rashid Khan
45,15,324,359,15,6.648148,23.933333,21.6,RD Chahar
176,8,182,203,9,6.692308,22.555556,20.222222,I Sodhi
83,13,271,306,11,6.774908,27.818182,24.636364,Mohammad Nabi
182,14,230,265,13,6.913043,20.384615,17.692308,M Ali
114,55,1133,1310,48,6.937335,27.291667,23.604167,R Ashwin


In [10]:
# Stats row for K Rabada.
bowler_stats[bowler_stats["Name"] == "K Rabada"]

Unnamed: 0,M,B,Runs,Wkts,Econ,Avg,SR,Name
89,18,415,540,36,7.807229,15.0,11.527778,K Rabada


In [11]:
# Stats row for RD Chahar.
bowler_stats[bowler_stats["Name"] == "RD Chahar"]

Unnamed: 0,M,B,Runs,Wkts,Econ,Avg,SR,Name
45,15,324,359,15,6.648148,23.933333,21.6,RD Chahar


In [12]:
# Recompute the delivery totals: entries across the nine X[i] buckets plus the
# no-ball/wide tally.
noballs_and_wides_count = sum(noballs_and_wides.values())
balls_per_situation = [len(X[situation]) for situation in range(9)]
total_balls = sum(balls_per_situation) + noballs_and_wides_count

print("Number of noballs and wides:", noballs_and_wides_count)
print("Number of balls bowled:", total_balls)

# Situations are reported 1-based.
for number, n_balls in enumerate(balls_per_situation, start=1):
    print("Balls in situation %d: %d" % (number, n_balls))

Number of noballs and wides: 2367
Number of balls bowled: 70207
Balls in situation 1: 10329
Balls in situation 2: 13643
Balls in situation 3: 2657
Balls in situation 4: 47
Balls in situation 5: 3490
Balls in situation 6: 12246
Balls in situation 7: 7
Balls in situation 8: 212
Balls in situation 9: 25209


In [27]:
# Matches from start_year onwards in which both team1 and team2 are one of the two sides.
rivals = ["Mumbai Indians", "Chennai Super Kings"]
M = matches[
    matches["team1"].isin(rivals)
    & matches["team2"].isin(rivals)
    & (matches["season"] >= start_year)
]

In [31]:
# Print each selected match as its own Series, in frame order.
for position in range(M.shape[0]):
    print(M.iloc[position])

season                            2015
city                            Mumbai
date                        2015-04-17
team1                   Mumbai Indians
team2              Chennai Super Kings
toss_winner             Mumbai Indians
toss_decision                      bat
result                          normal
dl_applied                           0
winner             Chennai Super Kings
win_by_runs                          0
win_by_wickets                       6
player_of_match                A Nehra
venue                 Wankhede Stadium
umpire1                   AK Chaudhary
umpire2                      M Erasmus
umpire3                            NaN
Name: 530, dtype: object
season                                        2015
city                                       Chennai
date                                    2015-05-08
team1                          Chennai Super Kings
team2                               Mumbai Indians
toss_winner                    Chennai Super Kings
toss_d

In [15]:
# Build the model object again with the same arguments as the setup cell.
model = create_model(batsmen, bowlers, id1, id2, X)
print("Loaded model.")

Loaded model.


In [16]:
# Read the saved parameter arrays from the run directory.
cutpoints, mu_1, mu_2, delta = [
    np.loadtxt(save_directory + suffix)
    for suffix in ("/cutpoints.txt", "/mu_1.txt", "/mu_2.txt", "/delta.txt")
]

In [17]:
# Pair every parameter value with its player name and rank in ascending order.
mu_1_sorted = sorted((value, batsmen[pos]) for pos, value in enumerate(mu_1))
mu_2_sorted = sorted((value, bowlers[pos]) for pos, value in enumerate(mu_2))

In [18]:
# The 40 batsmen with the largest mu_1 values (list is sorted ascending).
mu_1_sorted[-40:]

[(0.10916041709637109, 'D Wiese'),
 (0.10934770894644526, 'R Ashwin'),
 (0.10993303999887889, 'JA Morkel'),
 (0.11229339161979769, 'SN Khan'),
 (0.1127478909503026, 'SP Goswami'),
 (0.11437096032879743, 'JJ Roy'),
 (0.11893518085580507, 'KA Pollard'),
 (0.11973728399627434, 'AM Rahane'),
 (0.1197641913558365, 'BA Stokes'),
 (0.12409350999910944, 'SV Samson'),
 (0.12458632572821825, 'MK Pandey'),
 (0.12529378627448076, 'A Ashish Reddy'),
 (0.12581705723980155, 'TM Head'),
 (0.1266853554262674, 'Mandeep Singh'),
 (0.12830466293186954, 'V Shankar'),
 (0.12906243607182608, 'BCJ Cutting'),
 (0.13454642220456814, 'R Parag'),
 (0.13746709676886482, 'RA Tripathi'),
 (0.13786592115940546, 'SP Narine'),
 (0.13961509731866018, 'S Curran'),
 (0.14195755943924404, 'K Gowtham'),
 (0.15013814110151574, 'SPD Smith'),
 (0.15212560290217136, 'AR Patel'),
 (0.15550969171926526, 'KS Williamson'),
 (0.16018892865044904, 'P Shaw'),
 (0.17868610861141282, 'Ankit Sharma'),
 (0.19270783896061308, 'M Ali'),
 (0

In [19]:
# The 40 bowlers with the largest mu_2 values (list is sorted ascending).
mu_2_sorted[-40:]

[(0.09852279207148802, 'STR Binny'),
 (0.09852584144284587, 'Washington Sundar'),
 (0.10219773812386408, 'KH Pandya'),
 (0.10376096637906598, 'S Warrier'),
 (0.10675916523356294, 'GB Hogg'),
 (0.11346459875868388, 'Z Khan'),
 (0.11929593409287752, 'R Tewatia'),
 (0.1199977642744734, 'K Rabada'),
 (0.12801482033450362, 'Harbhajan Singh'),
 (0.13195185903259793, 'MJ McClenaghan'),
 (0.13539201945396168, 'MR Marsh'),
 (0.14494299849733996, 'YS Chahal'),
 (0.14826889109842756, 'R Ashwin'),
 (0.14899405845275598, 'I Sodhi'),
 (0.14968349726117364, 'S Badree'),
 (0.15303535249415953, 'M Morkel'),
 (0.15383183548817486, 'CH Morris'),
 (0.15521730354903285, 'JW Hastings'),
 (0.15536246161082354, 'J Yadav'),
 (0.16339426600638446, 'NB Singh'),
 (0.1662217857020509, 'JJ Bumrah'),
 (0.16631971194855077, 'DJ Muthuswami'),
 (0.17145282390194005, 'SP Narine'),
 (0.17268151503019338, 'N Saini'),
 (0.17912240752051106, 'M Ali'),
 (0.1833297669435748, 'Mohammad Nabi'),
 (0.21035981926431177, 'MA Starc'

In [20]:
# Load the saved trace from <save_directory>/trace, associating it with `model`.
trace = pm.load_trace(save_directory + "/trace", model=model)

In [21]:
# Save the loaded trace as a single pickle file inside the run directory.
# `pickle` is imported here because the notebook's import cell does not provide
# it; without this import the cell fails with NameError on a fresh kernel.
# NOTE: pickle files can execute code when loaded -- only unpickle trusted files.
import pickle

with open(save_directory + "/trace.pkl", "wb") as trace_file:
    pickle.dump(trace, trace_file)

In [43]:
# ArviZ summary table restricted to the mu_2 variable, rounded to 5 decimals.
summary = az.summary(trace, var_names = ["mu_2"], round_to=5)



In [55]:
# Summary row for component 91 of mu_2.
# NOTE(review): hard-coded index; presumably a position from bowler_index -- confirm which bowler.
summary.loc["mu_2[91]"]

mean             0.09982
sd               0.12329
hdi_3%          -0.12528
hdi_97%          0.33650
mcse_mean        0.00124
mcse_sd          0.00088
ess_mean      9886.35422
ess_sd        9886.35422
ess_bulk      9907.51580
ess_tail     15075.01696
r_hat            1.00021
Name: mu_2[91], dtype: float64

In [56]:
# Summary row for component 105 of mu_2.
# NOTE(review): hard-coded index; presumably a position from bowler_index -- confirm which bowler.
summary.loc["mu_2[105]"]

mean            0.26790
sd              0.14849
hdi_3%         -0.01257
hdi_97%         0.54901
mcse_mean       0.00155
mcse_sd         0.00109
ess_mean     9210.88599
ess_sd       9210.88599
ess_bulk     9210.72297
ess_tail     9327.90963
r_hat           1.00006
Name: mu_2[105], dtype: float64

In [33]:
# All trace values of mu_2 in the column belonging to K Rabada.
trace["mu_2"][:,bowler_index["K Rabada"]]

array([ 0.24960261,  0.17831207,  0.03588263, ...,  0.15778465,
        0.17572289, -0.08601433])

In [26]:
# Full array of trace values for mu_2 (2-D in the displayed output).
trace["mu_2"]

array([[ 0.16866099,  0.03143118,  0.10437558, ...,  0.04479149,
        -0.03603981, -0.03153455],
       [ 0.23030471, -0.18718909, -0.00214847, ..., -0.16698039,
         0.10608509,  0.12403798],
       [ 0.35639444,  0.12053095,  0.21637097, ...,  0.18236339,
         0.17278158,  0.05191501],
       ...,
       [ 0.21834728,  0.04902354,  0.10735688, ..., -0.1549285 ,
        -0.23469898,  0.06232831],
       [ 0.22904815, -0.01444475,  0.10469489, ..., -0.13613517,
        -0.14823155,  0.1026084 ],
       [ 0.28200865, -0.22180348,  0.20764808, ..., -0.17745539,
         0.33129564, -0.21698686]])

In [None]:
# Outcome probabilities p[i, j, l, k] for batsman i, bowler j, situation l and
# outcome category k (7 categories, 9 situations):
#   cdf[k] = 1 / (1 + exp(-(cutpoints[l, k] - mu_1[i] + mu_2[j] - delta[l]))),  k = 0..5
#   p[0] = cdf[0],   p[k] = cdf[k] - cdf[k-1] for k = 1..5,   p[6] = 1 - cdf[5]
# Computed with NumPy broadcasting instead of a four-level Python loop; the terms
# are combined in the same order as in the scalar formula above.
n_situations, n_outcomes = 9, 7

# One parameter per player is required; a mismatch means the saved files do not
# belong to the currently loaded data set.
assert len(mu_1) == len(batsmen), "mu_1 must hold one value per batsman"
assert len(mu_2) == len(bowlers), "mu_2 must hold one value per bowler"

# Axes: (batsman, bowler, situation, cutpoint).
linear_predictor = (
    cutpoints[np.newaxis, np.newaxis, :n_situations, : n_outcomes - 1]
    - mu_1[:, np.newaxis, np.newaxis, np.newaxis]
    + mu_2[np.newaxis, :, np.newaxis, np.newaxis]
    - delta[np.newaxis, np.newaxis, :n_situations, np.newaxis]
)
cumulative = 1 / (1 + np.exp(-linear_predictor))

# Pad the cumulative values with 0 on the left and 1 on the right; consecutive
# differences along the last axis then yield all seven category probabilities.
pad_shape = cumulative.shape[:-1] + (1,)
p = np.diff(
    np.concatenate([np.zeros(pad_shape), cumulative, np.ones(pad_shape)], axis=-1),
    axis=-1,
)

In [21]:
# tmp_directory = "2018-2019-20k-iterations"
# cutpoints = np.loadtxt(tmp_directory + "/cutpoints.txt")
# mu_1 = np.loadtxt(tmp_directory + "/mu_1.txt")
# mu_2 = np.loadtxt(tmp_directory + "/mu_2.txt")
# delta = np.loadtxt(tmp_directory + "/delta.txt")

In [32]:
# deliveries_data, matches, first_innings_data, second_innings_data, both_innings_data = load_data(2018, 2019)
# batsmen, bowlers, batsman_index, bowler_index, batsman_stats, bowler_stats, X, id1, id2, noballs_and_wides = initialise(first_innings_data)

In [33]:
# mu_1_sorted = sorted([(mu_1[i], batsmen[i]) for i in range(len(mu_1))])
# mu_2_sorted = sorted([(mu_2[i], bowlers[i]) for i in range(len(mu_2))])

In [73]:
# mu_1_sorted

In [74]:
# mu_2_sorted

In [58]:
# Scratch check: adding a scalar to an array applies it to every element.
a = np.arange(3)
b = np.arange(4)
a + 1

array([1, 2, 3])