In [1]:
import torch
import numpy as np
import os

In [2]:
pathReq=os.getcwd()+'/'
stats=np.load(pathReq+'allStats20210524.npy')

forInference=np.load(pathReq+'statsForInference.npy')
stats.shape,forInference.shape

((4135, 30), (48, 29))

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

h=['PA','AB','R','H','2B','3B','HR',
           'RBI','BB','IBB','SO','HBP','SH','SF','GDP','SB','CS']#batting only
hBvsP=['PA2','AB2','H2', '1B2','2B2','3B2','HR2','BB2','SO2','AVG2','OBP2','SLG2'] #BvsP
hTotal=h+hBvsP

skipping=len(h) #sometimes we might just want to do batter vs pitcher so we skip batter stats
features=torch.tensor(stats[:,skipping:-1])
labels=torch.tensor(stats[:,-1])

forInferenceStats=torch.tensor(forInference[:,skipping:])

pctTotal=(1-torch.mean(labels))*2
finalNo=(features.size()[0]*pctTotal).int()

balancedFeatures=torch.zeros(finalNo,features.size()[1])
balancedLabels=torch.zeros(finalNo)

for smpl, l in enumerate(labels):
  if l==1 and smpl<(finalNo//2):
    balancedFeatures[smpl]=features[smpl]
    balancedLabels[smpl]=l
  elif l==0 and smpl<finalNo:
    balancedFeatures[smpl]=features[smpl]
    balancedLabels[smpl]=l

X_train, X_test, y_train, y_test = train_test_split(balancedFeatures, balancedLabels, test_size=0.1, random_state=42)

scaler = StandardScaler()

x_norm=torch.tensor(scaler.fit_transform(X_train))

x_norm_test=torch.tensor(scaler.fit_transform(X_test))

inference_norm=torch.tensor(scaler.fit_transform(forInferenceStats))


In [4]:
x_norm.shape,x_norm_test.shape,inference_norm.shape

(torch.Size([2410, 12]), torch.Size([268, 12]), torch.Size([48, 12]))

In [None]:
from torch import nn
import torch.optim as optim

device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

net = nn.Sequential(
      nn.Linear(features.shape[1], 5),
      nn.ReLU(),           
      nn.Linear(5, 1),
      nn.Sigmoid(),
      ).to(device)

net=net.double()

losses=[]

correct=0
count=0

def train_gen(feat,lbl, batch=4):
  fBatch=[]#torch.zeros(batch, feat.size()[1])
  lBatch=[]#torch.zeros(lbl)
  for i, f in enumerate(feat):   
    fBatch.append(f.numpy())
    lBatch.append(lbl[i].numpy())
    if len(fBatch)==batch:
      yield fBatch, lBatch
      fBatch,lBatch=[],[]
  
batchSize=1

epochs=100
optimizer = optim.RMSprop(net.parameters(), lr=0.001)
criterion = nn.BCELoss()  

for epoch in range(epochs):
    #gen=train_gen(X_train,y_train,batchSize)
    for sample, label in zip(x_norm, y_train):
    #for sample, label in gen:
        sample=torch.tensor(sample).to(device)
        label=torch.tensor(label).to(device)
        #label=torch.from_numpy(np.array(label)).to(device)
        sample=sample.double()
        label=label.double()
        
        output=net(sample)        
        net.zero_grad()
        target = label
        #target = target.view(-1, 1)  # make it the same shape as output #this is for batches
        target = target.view(1)  # make it the same shape as output 
        #print('pred:',output, 'pred shape:',output.shape, 'label:', target, 'label shape:', target.shape)            

        # in your training loop:
        optimizer.zero_grad()   # zero the gradient buffers

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()    # Does the update
        losses.append(loss)
        count+=batchSize
        if (output>0.5 and label==1) or (output<=0.5 and label==0):
              correct+=1

    acc=correct/count
    
    print('epoch=' ,epoch, 'avg loss=', torch.mean(torch.tensor(losses)), 'acc = ', acc)




epoch= 0 avg loss= tensor(0.5529, dtype=torch.float64) acc =  0.7929460580912863
epoch= 1 avg loss= tensor(0.4934, dtype=torch.float64) acc =  0.799792531120332
epoch= 2 avg loss= tensor(0.4716, dtype=torch.float64) acc =  0.802627939142462
epoch= 3 avg loss= tensor(0.4601, dtype=torch.float64) acc =  0.8037344398340249
epoch= 4 avg loss= tensor(0.4529, dtype=torch.float64) acc =  0.8047302904564315
epoch= 5 avg loss= tensor(0.4478, dtype=torch.float64) acc =  0.8053941908713693
epoch= 6 avg loss= tensor(0.4441, dtype=torch.float64) acc =  0.8059869590989923
epoch= 7 avg loss= tensor(0.4411, dtype=torch.float64) acc =  0.8064834024896266
epoch= 8 avg loss= tensor(0.4388, dtype=torch.float64) acc =  0.8071000461041955
epoch= 9 avg loss= tensor(0.4369, dtype=torch.float64) acc =  0.807551867219917
epoch= 10 avg loss= tensor(0.4353, dtype=torch.float64) acc =  0.8079969822708412
epoch= 11 avg loss= tensor(0.4339, dtype=torch.float64) acc =  0.8083679114799447
epoch= 12 avg loss= tensor(0.

In [57]:
test=np.load(pathReq+'allStats20210513.npy')
testfeats=test[:,:-1]

x_norm_test2=torch.tensor(scaler.fit_transform(testfeats))
y_test2=test[:,-1]

In [75]:
from datetime import datetime
dateReq=datetime.today().strftime('%Y-%m-%d')
PATH=pathReq + '/savedModels/overlyConfident'+dateReq+'.pth'

# for param_tensor in net.state_dict():
#     print(param_tensor, "\t", net.state_dict()[param_tensor])
torch.save(net.state_dict(), PATH)
net.load_state_dict(torch.load(PATH))
net.eval()

Sequential(
  (0): Linear(in_features=29, out_features=10, bias=True)
  (1): ReLU()
  (2): Linear(in_features=10, out_features=5, bias=True)
  (3): ReLU()
  (4): Linear(in_features=5, out_features=1, bias=True)
  (5): Sigmoid()
)

In [68]:
net.eval()
net.cpu()
indices=500
net(x_norm[:indices].cpu()),y_train[:indices],net(x_norm_test[:indices].cpu()),y_test[:indices]

# for i in range(indices):
#   print('pred=', net(x_norm[i].cpu()).item(),'label=', y_train[i])

for i in range(indices):
  pred_score=net(x_norm_test2[i].cpu()).item()
  if pred_score >0.9:
    print('pred test=', pred_score ,'label=', y_test2[i])

pred test= 0.9422239382634026 label= 1.0
pred test= 0.9937920749081294 label= 1.0
pred test= 0.9983058672868391 label= 1.0
pred test= 0.9332117619457276 label= 1.0
pred test= 0.9879296816559314 label= 1.0
pred test= 0.9763058298484517 label= 1.0
pred test= 0.9957732427709048 label= 1.0
pred test= 0.9989514096250267 label= 1.0
pred test= 0.9812344491088631 label= 1.0
pred test= 0.9078209603293343 label= 1.0
pred test= 0.9518205419926821 label= 1.0
pred test= 0.9200055978197977 label= 1.0
pred test= 0.910470845025002 label= 1.0
pred test= 0.9972280150114126 label= 1.0
pred test= 0.9917518305137248 label= 1.0
pred test= 0.9464094030075237 label= 1.0
pred test= 0.9755570603225623 label= 1.0
pred test= 0.9343748517479334 label= 1.0
pred test= 0.9936957373005724 label= 1.0
pred test= 0.9764031848916767 label= 1.0
pred test= 0.974840987236434 label= 1.0
pred test= 0.9416502078878664 label= 1.0
pred test= 0.9994683267422917 label= 1.0
pred test= 0.9974244943591675 label= 1.0
pred test= 0.98392

In [47]:
net.eval()
net.cpu()
preds=net(inference_norm)

In [48]:
inference_norm.shape

torch.Size([19, 29])

In [49]:

preds.shape

torch.Size([19, 1])

In [50]:
pList=[['Miguel Rojas', 'Miami'],
 ['Austin Hays', 'Baltimore'],
 ['Raimel Tapia', 'Colorado'],
 ['Marcus Semien', 'Toronto'],
 ['Cedric Mullins', 'Baltimore'],
 ['Jake Cronenworth', 'San Diego'],
 ['Yoan Moncada', 'Chicago'],
 ['Teoscar Hernandez', 'Toronto'],
 ['Dylan Carlson', 'St. Louis'],
 ['Trey Mancini', 'Baltimore'],
 ['Tommy Edman', 'St. Louis'],
 ['Bo Bichette', 'Toronto'],
 ['Ramon Laureano', 'Oakland'],
 ['Kolten Wong', 'Milwaukee'],
 ['Randal Grichuk', 'Toronto'],
 ['Robbie Grossman', 'Detroit'],
 ['Nolan Arenado', 'St. Louis'],
 ['Tim Anderson', 'Chicago'],
 ['Vladimir Guerrero Jr.', 'Toronto']]



In [52]:
from datetime import datetime
print(datetime.today().strftime('%Y-%m-%d'))
nnOut=[]
for i, ind in enumerate(torch.topk(preds.reshape(1,preds.shape[0]),10)[1][0]):  
  nnOut.append(pList[ind])
  print(pList[ind],torch.topk(preds.reshape(1,preds.shape[0]),10)[0][0][i].item())

2021-05-24
['Marcus Semien', 'Toronto'] 0.9260356487699175
['Randal Grichuk', 'Toronto'] 0.8976579394915949
['Ramon Laureano', 'Oakland'] 0.6988358214337181
['Bo Bichette', 'Toronto'] 0.656616436729234
['Teoscar Hernandez', 'Toronto'] 0.6280565786076028
['Nolan Arenado', 'St. Louis'] 0.6077291051733582
['Cedric Mullins', 'Baltimore'] 0.04456414760401088
['Raimel Tapia', 'Colorado'] 0.029471240817665074
['Jake Cronenworth', 'San Diego'] 0.025413258555440132
['Miguel Rojas', 'Miami'] 0.0012692695407037352


In [None]:
len(pList)

49

In [22]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, max_iter=1000).fit(x_norm, y_train)

In [23]:
clf.score(x_norm_test, y_test)

0.8152173913043478

In [24]:
from datetime import datetime
print(datetime.today().strftime('%Y-%m-%d'))

lrOut=[]
for i, pred in enumerate(clf.predict_proba(inference_norm)[:,1][np.argsort(-clf.predict_proba(inference_norm)[:,1])][:10]):  
  lrOut.append(pList[i])
  print(pList[i], pred)

2021-05-15
['Paul Goldschmidt', 'St. Louis'] 0.999780017203643
['Rafael Devers', 'Boston'] 0.9848612150580556
['Randal Grichuk', 'Toronto'] 0.9346816196472018
['Giancarlo Stanton', 'New York'] 0.9259634913698378
['Andrew Benintendi', 'Kansas City'] 0.8824150199235107
['Jesus Aguilar', 'Miami'] 0.8433218626593961
['Shohei Ohtani', 'Los Angeles'] 0.8396202610265788
['Trey Mancini', 'Baltimore'] 0.8273806263552075
['Rhys Hoskins', 'Philadelphia'] 0.7735909562856644
['Corey Seager', 'Los Angeles'] 0.7596874128563706


In [27]:
for o in nnOut:
  if o in lrOut:
    print(o)

['Paul Goldschmidt', 'St. Louis']
['Corey Seager', 'Los Angeles']
['Randal Grichuk', 'Toronto']
['Giancarlo Stanton', 'New York']
['Rhys Hoskins', 'Philadelphia']
['Rafael Devers', 'Boston']
['Jesus Aguilar', 'Miami']


In [None]:
# 2021-05-12
# ['Jake Cronenworth', 'San Diego'] 0.9951982878094792
# ['Paul Goldschmidt', 'St. Louis'] 0.9885262239419107
# ['Yermin Mercedes', 'Chicago'] 0.9743571206807747
# ['Carlos Santana', 'Kansas City'] 0.9727445792429186
# ['Mike Trout', 'Los Angeles'] 0.9636184834153763
# ['Austin Riley', 'Atlanta'] 0.9586897621663748
# ['Randal Grichuk', 'Toronto'] 0.9498869962253059
# ['J.D. Martinez', 'Boston'] 0.9106846220235486
# ['Trey Mancini', 'Baltimore'] 0.8654366296051179
# ['Trent Grisham', 'San Diego'] 0.7794279905260642



In [None]:
np.argsort(-clf.predict_proba(inference_norm)[:,1]),clf.predict_proba(inference_norm)[:,1],clf.predict_proba(inference_norm)[:,1][np.argsort(-clf.predict_proba(inference_norm)[:,1])]


(array([18,  9, 11, 32, 34, 28, 36, 29, 15,  4, 39,  8,  3, 35,  0, 19, 26,
        33,  2, 13, 21, 42,  6, 14, 30, 38, 41, 40, 37, 31,  7, 25, 17, 10,
        46,  1, 43, 24, 22, 45, 20, 16, 12,  5, 44, 27, 23]),
 array([0.45747394, 0.07671667, 0.42466979, 0.50921759, 0.64018901,
        0.0657057 , 0.26373826, 0.08482591, 0.56207819, 0.8352786 ,
        0.0806784 , 0.83372114, 0.0657057 , 0.37323229, 0.25208786,
        0.66286895, 0.0657057 , 0.08482591, 0.9721208 , 0.44813441,
        0.0657057 , 0.36740579, 0.06914139, 0.0657057 , 0.06914139,
        0.08482591, 0.43756551, 0.0657057 , 0.68468595, 0.66286895,
        0.21396768, 0.08939631, 0.82797847, 0.42979197, 0.71715111,
        0.47543885, 0.66309993, 0.10858724, 0.19750926, 0.62796537,
        0.13930828, 0.18658975, 0.35013712, 0.07651625, 0.0657057 ,
        0.0657057 , 0.07671667]),
 array([0.9721208 , 0.8352786 , 0.83372114, 0.82797847, 0.71715111,
        0.68468595, 0.66309993, 0.66286895, 0.66286895, 0.64018901,
    

In [None]:
h=['PA','AB','R','H','2B','3B','HR',
           'RBI','BB','IBB','SO','HBP','SH','SF','GDP','SB','CS']#batting only
hBvsP=['PA2','AB2','H2', '1B2','2B2','3B2','HR2','BB2','SO2','AVG2','OBP2','SLG2'] #BvsP

clf.coef_

array([[ 0.15407444,  0.12765536, -0.51097697, -1.28875385, -0.11525518,
        -0.02817996, -0.53926629,  0.40723739,  0.94032604,  0.08294364,
         1.06828656,  0.04501935,  0.        ,  0.53586396,  0.33600777,
        -0.05340233,  0.14588743,  0.59336036]])

In [None]:
h[8]

'BB'

In [None]:
y_test[:15]

tensor([1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1.])

In [None]:
pctTotal=(1-torch.mean(labels))*2

finalNo=(features.size()[0]*pctTotal).int()

balancedSet=torch.zeros(finalNo,features.size()[1])

for smpl, l in enumerate(labels):
  if l==1 and smpl<(finalNo/2):
    balancedSet[smpl]=features[smpl].copy()
  else:
    balancedSet[smpl]=features[smpl].copy()



IndexError: ignored

In [None]:
finalNo

tensor(914, dtype=torch.int32)