In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey, select
from sqlalchemy.sql import and_, or_, not_
import re
#from collections import Counter
import uuid 
from itertools import zip_longest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge

import sqlite3

sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

In [2]:
engine = create_engine('sqlite:///HoopStat.db')
conn = engine.connect()
metadata = MetaData(bind=None)
games = Table('games', metadata, autoload = True, autoload_with = engine)
events = Table('events', metadata, autoload = True, autoload_with = engine)
teams = Table('teams', metadata, autoload = True, autoload_with = engine)

In [3]:
def convertToNumber(s):
    #https://stackoverflow.com/questions/31701991/string-of-text-to-unique-integer-method
    return int.from_bytes(s.encode(), 'little')

def getNewBatch(n):
    stmt = select([games.c.id]).where(and_(not_(games.columns.id.like('error%')), games.columns.isProcessed == False))\
    .limit(n)
    results = conn.execute(stmt).fetchall()
    results = [value for value, in results]
    return results

def createPBP(playsRaw,team):
    df = pd.DataFrame()
    for i in playsRaw:
        t = i.find('td',attrs={'class':'time'}).get_text().strip()
        x = i.find('span',attrs={'class':'text'}).get_text().strip()
        sc = i.find('td', attrs={'class':'score'}).get_text().strip()
        scA = 0
        scH = 0
        if sc:
            scH = sc.split('-')[0]
            scA = sc.split('-')[1]
        if i.find('span', attrs={'class':'v-score'}):
            scA = i.find('span', attrs={'class':'v-score'}).get_text().strip()
        if i.find('span', attrs={'class':'h-score'}):
            scH = i.find('span', attrs={'class':'h-score'}).get_text().strip()
        df = pd.concat([df,pd.DataFrame([[t,x,scH,scA]],columns=['gameTime','action','scoreHome','scoreAway'])])
    df['team'] = team
    return df

def scrapePeriod(soup,prd,gameID):
    home = soup.find_all('tr', attrs = {'class':['row home','row home score-changed']})
    away = soup.find_all('tr', attrs = {'class':['row visitor','row visitor score-changed']})

    df = pd.DataFrame()
    
    df = pd.concat([df,createPBP(home,'Home')])
    df = pd.concat([df,createPBP(away,'Away')])
    
    df['action'] = df.action.str.replace('\n','')
    df['action'] = df.action.str.replace('.','')
    df['action'] = df.action.str.replace("'",'')
    df['player'] = df.action.str.extract('([^a-z]{2,})')
    
    #https://stackoverflow.com/questions/61235091/string-modification-on-pandas-dataframe-subset
    mask_to = ~df['action'].str.contains('TIMEOUT') # same as df.action.str.contains('TIME')==False
    df.loc[mask_to,'action'] = df.loc[mask_to,'action'].str.replace('([^a-z0-9\._]{2,})','')
    #df[df['action'].str.contains('TIMEOUT')==False]['action'] = df[df['action'].str.contains('TIMEOUT')==False].action.str.replace('([^a-z0-9\._]{2,})','')#.str.split()

    df = df.set_index('gameTime')
    df = df.sort_index(ascending=False)
    df['time'] = df.index
    try:
        df['duration'] = pd.to_datetime(df['time'].astype(str)).diff().dt.total_seconds().div(-60)
    except:
        df['duration'] = 0
    df['uniqueID'] = np.random.randint(1000000,size=len(df.index))
    df['uniqueID'] = df['uniqueID'].map(str)
    df['gameID'] = gameID
    df['period'] = prd
    df['id'] = df['gameID'] + df['period'] + df['uniqueID']
    df['duration'] = df['duration'].fillna(0).map(int)

    return df[['id','gameID','time','action','scoreHome','scoreAway','team','duration','player','period']]
                                                
def getPeriods(gameID):
    url = "https://d3hoops.prestosports.com/seasons/men/2019-20/boxscores/" + gameID + ".xml?view=plays"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    prds = soup.find_all('table', attrs = {'role':'presentation'})
    df = pd.DataFrame()
    for i in prds:
        prd = i.find('span')['id']
        try:
            df = pd.concat([df,scrapePeriod(i,prd,gameID)])
        except Exception as e:
            print(e)
    return df

def removeDupes(df):
    df = df.reset_index()
    df.id = df.groupby('id').id.apply(lambda n: n + (np.arange(len(n))+1).astype(str))
    df = df.reset_index()
    return df[['id','gameID','time','action','scoreHome','scoreAway','team','duration','player','period']]

In [4]:
team = 'Baruch'
gameIDs = [          
'20191109_wix1',
'20191116_txhk',
'20191121_udlk',
'20191123_esit',
'20191126_5pmm',
'20191206_30xv',
'20191209_xslc',
'20191229_91iq',
'20191230_vxao',
'20200104_y1ye',
'20200106_ra57',
'20200108_bqa1',
'20200111_kw3c',
'20200115_zfux',
'20200117_haao',
'20200122_x6ud',
'20200124_1izy',
'20200127_nl6c',
'20200201_740u',
'20200204_i52p',
'20200206_9xbp',
'20200210_ke8i',
'20200212_lfh3',
'20200215_fhem',
'20200218_gsu1',
'20200225_wab9',
'20200228_yvu6',
'20200304_h2dt',
'20200306_8bcr',
'20200307_46lx'
]

homeYN = [0,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0]

len(gameIDs) == len(homeYN)

True

In [5]:
def cleanAction(x):
    x = re.sub('^\d+', '', x).lstrip()
    x = re.sub('\d+$', '', x).rstrip()
    x = re.sub(' by$', '', x).rstrip()
    return x
def cleanPeriod(x):
    x = int(''.join(filter(str.isdigit, x)))
    return x
def cleanPlayer(x):
    x = x.lstrip().rstrip()
    return x
def getStarters(df):
    nprd = df['period'].max()

    periodStart = pd.to_timedelta('00:00:00')
    periodEnd = pd.to_timedelta('00:40:00')
    if nprd > 2:
        n = nprd - 2
        while n > 0:
            periodEnd += pd.to_timedelta('00:05:00')
            n -= 1
    lineups2 = df[df.action.isin(['enters the game','goes to the bench'])
                 ][['player','action','time','period','team']]
    linePV = pd.pivot_table(lineups2,index=['player','team'],columns='action',values='time',aggfunc=np.min).reset_index()
    linePV['enters the game'] = linePV['enters the game'].fillna(periodStart)
    linePV['goes to the bench'] = linePV['goes to the bench'].fillna(periodEnd)
    
    starters = linePV[
        (
            (linePV['goes to the bench'] < linePV['enters the game'])
        )
        |
        (
            (linePV['enters the game'] == '00:00:00')
        )
                     ][['team','player','goes to the bench','enters the game']]
    return list(starters[starters['team']=='Home']['player']),list(starters[starters['team']=='Away']['player'])

In [6]:
actValMap = {
'Assist':0,
'Block':0,
'Foul':0,
'Steal':0,
'Technicaloul':0,
'Turnover':0,
'deadball rebound':0,
'defensive rebound':0,
'enters the game':0,
'goes to the bench':0,
'made 2-pt field goal':2,
'made 3-pt jump shot':3,
'made dunk':2,
'made free throw':1,
'made jump shot':2,
'made layup':2,
'made tip-in':2,
'missed 2-pt field goal':0,
'missed 3-pt jump shot':0,
'missed dunk':0,
'missed free throw':0,
'missed jump shot':0,
#missed jump shot3missed 3-pt jump shot2missed 2-pt field goalmissed layupmissed free throwmissed dunkmissed tip-in}	123
'missed layup':0,
'missed tip-in':0,
'offensive rebound':0,
'pointswrong basket by defense':-2,
#score=2text=made jump shot3score=3text=made 3-pt jump shot2score=2text=made 2-pt field goalscore=2text=made layupscore=1text=made free throwscore=2text=made dunkscore=2text=made tip-in	108
'will be starting':0
}

In [7]:
def fetchGame(gameID):    
    stmt2 = "SELECT * FROM events WHERE gameid = '" + gameID + "'"
    df = pd.read_sql_query(stmt2,engine)
    return df

In [8]:
def cleanGame(df):
    df['scoreHome'] = df['scoreHome'].map(int)
    df['scoreAway'] = df['scoreAway'].map(int)
    
    df['action'] = df['action'].apply(cleanAction)
    df['period'] = df['period'].apply(cleanPeriod).apply(int)
    df['player'] = df['player'].apply(cleanPlayer)
    df['duration'] = df['duration'].apply(int)
    
    df['actionValue'] = df['action'].map(actValMap).map(int,na_action='ignore')
    
    df['time'] = pd.to_timedelta('00:'+df['time'])
    df.loc[df['period'] <= 2,'time'] = pd.to_timedelta('00:20:00') - df.loc[df['period'] <= 2,'time']
    df.loc[df['period'] > 2,'time'] = pd.to_timedelta('00:05:00') - df.loc[df['period'] > 2,'time']
    df.loc[df['period'] == 2,'time'] += pd.to_timedelta('00:20:00')
    df.loc[df['period'] == 3,'time'] += pd.to_timedelta('00:25:00')
    df.loc[df['period'] == 4,'time'] += pd.to_timedelta('00:30:00')
    df.loc[df['period'] == 5,'time'] += pd.to_timedelta('00:35:00')
    df.loc[df['period'] == 6,'time'] += pd.to_timedelta('00:40:00')
    df.loc[df['period'] == 7,'time'] += pd.to_timedelta('00:45:00')
    df.loc[df['period'] == 8,'time'] += pd.to_timedelta('00:50:00')
    
    df['seqNo'] = df['time'].ne(df['time'].shift()).cumsum()
    
    df = df.sort_values(by=['time'],ascending=True)
    
    conditions = [
        (df['actionValue'] == 1),
        (df['actionValue'] == 2),
        (df['actionValue'] == 3),
        (df['action'].str.contains('missed') & df['action'].str.contains('3')),
        (df['action'].str.contains('missed') & ~df['action'].str.contains('3') & ~df['action'].str.contains('free')),
        (df['action'].str.contains('missed') & ~df['action'].str.contains('3') & df['action'].str.contains('free'))
    ]
    choices = ['FTM', 'FG2', 'FG3','3PA','2PA','FTA']
    
    df['action_edit1'] = np.select(conditions, choices, default=df['action'])
    
    #df['psnChg'] = False
    
    df['playScore'] = df['time'].map(df.groupby("time")['actionValue'].sum())
    return df

In [9]:
def set_pm(df,rosterH,rosterA,debug=False,isHome=True):
    HLU,ALU = getStarters(df)

    lineupDF = df.copy().loc[df.action.isin(['enters the game','goes to the bench']),['time','action','player','team','scoreHome','scoreAway','seqNo']]
    lineupDF = lineupDF.reset_index()
    
    seq = lineupDF.loc[0,'seqNo'].copy()
    time = lineupDF.loc[0,'time']
    hSc = lineupDF.loc[0,'scoreHome'].copy()
    aSc = lineupDF.loc[0,'scoreAway'].copy()
    diff = hSc-aSc
    seqDiff = 0
    
    away = pd.DataFrame(data={'lineup':[ALU],'time':pd.to_timedelta('00:00:00'),'team':'Away','diff':0}).head(1)
    home = pd.DataFrame(data={'lineup':[HLU],'time':pd.to_timedelta('00:00:00'),'team':'Home','diff':0}).head(1)
    
    h = home.loc[0,'lineup'].copy()
    h.sort()
    a = away.loc[0,'lineup'].copy()
    a.sort()
    
    hPlayerPM = {'Home':{i:{'curDiff':0, 'pm':0, 'curTime':pd.to_timedelta('00:00:00'), 'nseq':0} for i in lineupDF[lineupDF['team']=='Home'].player.unique()}}
    aPlayerPM = {'Away':{i:{'curDiff':0, 'pm':0, 'curTime':pd.to_timedelta('00:00:00'), 'nseq':0} for i in lineupDF[lineupDF['team']=='Away'].player.unique()}}
    aPlayerPM.update(hPlayerPM)
    playerPM = aPlayerPM
    if debug:
        print('Home On Floor: ' + str(h))
        print('Away On Floor: ' + str(a))
    try:
        stints = pd.DataFrame(data = {'seqStart':0
                                      , 'seqEnd':seq
                                      , 'timeStart':pd.to_timedelta('00:00:00')
                                      , 'timeEnd':time
                                      , 'diffStart':0
                                      , 'diffEnd': diff
                                      , 'HOF':[[p for p in h]]
                                      , 'AOF':[[p for p in a]]
                                      , 'HX':[[1 if p in h else 0 for p in rosterH]]
                                      , 'AX':[[-1 if p in a else 0 for p in rosterA]]
                                      , 'HR':[[i for i in rosterH]]
                                      , 'AR':[[i for i in rosterA]]
                                      , 'Y':pd.Series((diff)/(seq))*100
                                     })
    except:
        pd.DataFrame(data = {'seqStart':0
                                      , 'seqEnd':seq
                                      , 'timeStart':pd.to_timedelta('00:00:00')
                                      , 'timeEnd':time
                                      , 'diffStart':0
                                      , 'diffEnd': diff
                                      , 'HOF':[[p for p in h]]
                                      , 'AOF':[[p for p in a]]
                                      , 'HX':[[1 if p in h else 0 for p in rosterH]]
                                      , 'AX':[[-1 if p in a else 0 for p in rosterA]]
                                      , 'HR':[[i for i in rosterH]]
                                      , 'AR':[[i for i in rosterA]]
                            }
                    )
        return
    
    hx = [1 if p in h else 0 for p in rosterH]
    ax = [1 if p in a else 0 for p in rosterA]
    
    for i in range(0,len(lineupDF)):
        nseq = lineupDF.loc[i,'seqNo'].copy()
        ntime = lineupDF.loc[i,'time']
        plyr = lineupDF.loc[i,'player']
        act = lineupDF.loc[i,'action']
        tm = lineupDF.loc[i,'team']
        ndiff = diff.copy()
        #print(seq)
        #print(nseq)
        if nseq != seq:
            hSc = lineupDF.loc[i,'scoreHome'].copy()
            aSc = lineupDF.loc[i,'scoreAway'].copy()
            ndiff = hSc - aSc
            #if tm=="Away":
            #    ndiff *= -1
            if homeSub:
                try:
                    newH = pd.DataFrame(data={'lineup':[[p for p in h]],'time':time,'team':'Home','diff':ndiff},index=[0])
                    home = home.append(newH)
                    if debug:
                        print('Added ' + str(newH.loc[0,'lineup']) + ' to Home lineups')
                except ValueError as e:
                    if debug:
                        print('Couldn''t add ' + str(newH))
                        print(e)
            if awaySub:
                try:
                    newA = pd.DataFrame(data={'lineup':[[p for p in a]],'time':time,'team':'Away','diff':aSc-hSc},index=[0])#.head(1)
                    away = away.append(newA)
                    if debug:
                        print('Added ' + str(newA.loc[0,'lineup']) + ' to Away lineups')
                except ValueError as e:
                    if debug:
                        print('Couldn''t add ' + str(newA))
                        print(e)
            hx_cur = []
            ax_cur = []
            hx_cur = [1 if p in h else 0 for p in rosterH]
            ax_cur = [-1 if p in a else 0 for p in rosterA]
            newStint = pd.DataFrame(data={'seqStart':seq
                                          , 'seqEnd':nseq
                                          , 'timeStart':time
                                          , 'timeEnd':ntime
                                          , 'diffStart':diff
                                          , 'diffEnd': ndiff
                                          , 'HOF': [[p for p in h]]
                                          , 'AOF': [[p for p in a]]
                                          , 'HX':[hx_cur]
                                          , 'AX':[ax_cur]
                                          , 'HR':[rosterH]
                                          , 'AR':[rosterA]
                                          , 'Y':((ndiff - diff)/(nseq - seq))*100
                                                 }
                                           )
            stints = stints.append(newStint)
            seqDiff = nseq - seq
            seq = nseq
            time = ntime
            diff = ndiff
        if debug:
            print(ntime)
            print(tm + ':' + plyr + ' ' + act)
        homeSub = False
        awaySub = False
        if act=='enters the game':
            if tm=='Home' and plyr not in h:
                h.append(plyr)
                homeSub = True
                #print('Added ' + lineupDF.loc[i,'player'])
            else:
                if plyr not in h:
                    a.append(plyr)
                    awaySub = True
                    #print('Added ' + lineupDF.loc[i,'player'])
            playerPM[tm][plyr]['curTime']=ntime
            playerPM[tm][plyr]['curDiff']=ndiff
        if act=='goes to the bench':
            if tm=='Home':
                try:
                    h.remove(plyr)
                    homeSub = True
                    #print('Removed ' + lineupDF.loc[i,'player'])
                except Exception as e:
                    if debug:
                        print('Exception: ' + str(e))
            else:
                try:
                    a.remove(plyr)
                    awaySub = True
                    #print('Removed ' + lineupDF.loc[i,'player'])
                except Exception as e:
                    if debug:
                        print('Exception :' + str(e))
            playerPM[tm][plyr]['pm'] += (ndiff - playerPM[tm][plyr]['curDiff'])
            #print((ndiff - playerPM[tm][plyr]['curDiff']))
            timeEntered = playerPM[tm][plyr]['curTime']    
            minPlayed = ((ntime - timeEntered).seconds)/60
            
            playerPM[tm][plyr]['nseq'] += int(seqDiff)
            if debug:
                print('Time: ',(ntime))
                print('Player entered at: ',(playerPM[tm][plyr]['curTime']))
                print('Sequences Played: ',(seqDiff))
        h.sort()
        a.sort()
        if debug:
            print('Home On Floor: ' + str(h))
            print('Away On Floor: ' + str(a))
    return playerPM,stints

In [10]:
def getAPM(stints,team,roster,isHome=True):
    apm = pd.DataFrame()
    
    X = stints['HX'] + stints['AX']
    X = X.tolist()
    mx = len(max(X, key = lambda i: len(i)))
    for i in range(len(X)):
        iln = len(X[i])
        if iln < mx:
            X[i].extend([0] * (mx - len(X[i])))
    Y = stints['Y'] 
    Y = pd.Series(Y)
    x = np.array(X)
    try:
        cfit = Ridge(alpha=1.0).fit(x, Y)
        reg = LinearRegression().fit(x, Y)
    except Exception as e:
        print(e)
    
    for i in range(0,len(roster)):
        iapm = pd.DataFrame(
        data = {
            'player':roster[i]
            ,'apm1':reg.coef_[i]/100
            ,'apm2':cfit.coef_[i]/100
        },index=[0])
        apm = pd.concat([apm,iapm])
    
    return apm, cfit, reg, mx
    #regDF.head(20)

In [28]:
def sequenceGame(df):
    metricDF1 = df.copy()
    metricDF1.loc[metricDF1['team']=='Away','actionValue'] = metricDF1.loc[metricDF1['team']=='Away','actionValue'] * -1
    metricDF1 = metricDF1.set_index('team').groupby('seqNo', as_index=False).agg({
        #'action':dict
        'actionValue':np.sum
        , 'duration':np.sum
        , 'scoreHome':np.max
        , 'scoreAway':np.max
    }
    )
    metricDF1['n_1'] = metricDF1['actionValue'].shift(-1)
    
    metricDF1['clockUse'] = pd.cut(metricDF1['duration'],5,labels = ['Early','Early-Mid','Mid','Late-Mid','Late'])
    return metricDF1

In [44]:
def runKNN(metricDF):
    x = metricDF[['duration','margin']]
    Y1 = metricDF['val']
    Y2 = metricDF['nval']
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh2 = KNeighborsClassifier(n_neighbors=3)
    nfit1 = neigh.fit(x, Y1)
    nfit2 = neigh2.fit(x,Y2)
    return nfit1,nfit2

In [45]:
allStints = pd.DataFrame()
allMDF = pd.DataFrame()
nGames = len(gameIDs)
split = int(nGames*0.6)
train = gameIDs[:split]
test = gameIDs[split:]
for i in range(1,len(train)):
#gameid = gameIDs[1]
#if True:
    gameid = train[i]
    df = fetchGame(gameid)
    df = fetchGame(gameid)
    df = cleanGame(df)
    HLU,ALU = getStarters(df)
    rosterH = list(df[
        (df['team']=='Home')
        & (df['player'] != 'TEAM')
        & (~df['player'].str.contains("TIMEOUT", na=False))
        & (df['player'].str.len() > 2)
    ]['player'].unique())
    rosterH.sort()
    rosterA = list(df[
        (df['team']=='Away')
        & (df['player'] != 'TEAM')
        & (~df['player'].str.contains("TIMEOUT", na=False))
        & (df['player'].str.len() > 2)
    ]['player'].unique())
    rosterA.sort()
    
    isHome = True if homeYN[1]==1 else False
    playerPM,stints = set_pm(df,rosterH,rosterA,debug=False,isHome=isHome)
    
    if not isHome:
        stints['AX'] = stints['AX'].apply(lambda x: [i*-1 for i in x])
        stints['Y'] *= -1
        r = rosterA
    else:
        r = rosterH
    allStints = pd.concat([allStints,stints])
    
    
    metricDF = sequenceGame(df)
    metricDF['margin'] = abs(metricDF['scoreHome'] - metricDF['scoreAway'])
    metricDF = metricDF.merge(metricDF.assign(seqNo=metricDF.seqNo-1),on='seqNo').reset_index()
    metricDF = metricDF[['duration_x','margin_x','actionValue_x','actionValue_y']]
    metricDF.columns = ['duration','margin','val','nval']
    allMDF = pd.concat([allMDF,metricDF])
    
    print('Finished processing ', i, ' out of ', len(train))

Finished processing  1  out of  18
Finished processing  2  out of  18
Finished processing  3  out of  18
Finished processing  4  out of  18
Finished processing  5  out of  18
Finished processing  6  out of  18
Finished processing  7  out of  18
Finished processing  8  out of  18
Finished processing  9  out of  18
Finished processing  10  out of  18
Finished processing  11  out of  18
Finished processing  12  out of  18
Finished processing  13  out of  18
Finished processing  14  out of  18
Finished processing  15  out of  18
Finished processing  16  out of  18
Finished processing  17  out of  18


In [46]:
apm, ridge, regr, mx = getAPM(allStints,team,r)
apm

Unnamed: 0,player,apm1,apm2
0,"DESORMIER,JOE",0.062568,0.053166
0,"DICONZA,JACK",0.087884,0.079326
0,"DION,SAM",0.076385,0.069176
0,"HENDRIX,ZION",0.035564,0.027016
0,"JR,OTIS WHEELER",0.101188,0.090629
0,"MCGILLOWAY,JIMMY",0.146677,0.137861
0,"OLIVIER,COREY",0.035788,0.029074
0,"SPENCER,TYLER",0.05898,0.050897


In [47]:
k1,k2 = runKNN(allMDF)

In [48]:
for i in range(0,len(test)):
    testStints = pd.DataFrame()
    testMDF = pd.DataFrame()
    df = fetchGame(test[i])
    df = fetchGame(gameid)
    df = cleanGame(df)
    HLU,ALU = getStarters(df)
    rosterH = list(df[
        (df['team']=='Home')
        & (df['player'] != 'TEAM')
        & (~df['player'].str.contains("TIMEOUT", na=False))
        & (df['player'].str.len() > 2)
    ]['player'].unique())
    rosterH.sort()
    rosterA = list(df[
        (df['team']=='Away')
        & (df['player'] != 'TEAM')
        & (~df['player'].str.contains("TIMEOUT", na=False))
        & (df['player'].str.len() > 2)
    ]['player'].unique())
    rosterA.sort()
    
    isHome = True if homeYN[1]==1 else False
    playerPM,stints = set_pm(df,rosterH,rosterA,debug=False,isHome=isHome)
    
    if not isHome:
        stints['AX'] = stints['AX'].apply(lambda x: [i*-1 for i in x])
        stints['Y'] *= -1
        r = rosterA
    else:
        r = rosterH
    testStints = pd.concat([testStints,stints])
    
    metricDF = sequenceGame(df)
    metricDF['margin'] = abs(metricDF['scoreHome'] - metricDF['scoreAway'])
    metricDF = metricDF.merge(metricDF.assign(seqNo=metricDF.seqNo-1),on='seqNo').reset_index()
    metricDF = metricDF[['duration_x','margin_x','actionValue_x','actionValue_y']]
    metricDF.columns = ['duration','margin','val','nval']
    testMDF = pd.concat([allMDF,metricDF])
    
    print('Finished processing ', i, ' out of ', len(test))

Finished processing  0  out of  12
Finished processing  1  out of  12
Finished processing  2  out of  12
Finished processing  3  out of  12
Finished processing  4  out of  12
Finished processing  5  out of  12
Finished processing  6  out of  12
Finished processing  7  out of  12
Finished processing  8  out of  12
Finished processing  9  out of  12
Finished processing  10  out of  12
Finished processing  11  out of  12


In [14]:
X = testStints['HX'] + stints['AX']
X = X.tolist()
#mx = len(max(X, key = lambda i: len(i)))
for i in range(len(X)):
    iln = len(X[i])
    if iln < mx:
        X[i].extend([0] * (mx - len(X[i])))
    if iln > mx:
        iln = iln[:len(iln)-n]
Y = stints['Y'] 
Y = pd.Series(Y)
x = np.array(X)
#except Exception as e:
#    print(e,len(x),len(Y))

In [15]:
ridge.score(x,Y)

0.042878552344369436

In [16]:
regr.score(x,Y)

0.04499268748734686

In [52]:
x = metricDF[['duration','margin']]
Y1 = metricDF['val']
Y2 = metricDF['nval']
print(k1.score(x,Y1))
print(k2.score(x,Y1))

0.6607929515418502
0.6079295154185022


In [51]:
if isHome:
    pmDF = pd.DataFrame(data=playerPM.copy()['Home']).transpose().reset_index().iloc[:,[0,2,4]]
else:
    pmDF = pd.DataFrame(data=playerPM.copy()['Away']).transpose().reset_index().iloc[:,[0,2,4]]
#pmdfH['team'] = homeTeam
#pmdfA = pd.DataFrame(data=playerPM.copy()['Away']).transpose().reset_index().iloc[:,[0,2,4]]
#pmdfA['team'] = awayTeam

#pmDF = pd.concat([pmdfH,pmdfA])
#pmDF['year'] = gameDate[0:4]
pmDF.rename(columns={'index':'player'}, inplace=True)
#pmDF['id'] = pmDF['team'] + '_' + pmDF['name']
pmDF = pmDF[['player','pm','nseq']]
#pmDF['nseq'] = pmDF['nseq'].apply(int)
pmDF['pm'] = pmDF['pm']/pmDF['nseq']/10

In [25]:
#pmDF.dtypes
#apm.dtypes
finalPM = apm.join(pmDF.set_index('player'),on=['player'])
finalPM.columns = ['player','Linear Regression','Ridge Regression','Standard Plus Minus','total posessions played']
finalPM

Unnamed: 0,player,Linear Regression,Ridge Regression,Standard Plus Minus,total posessions played
0,"DESORMIER,JOE",0.062568,0.053166,0.095,20
0,"DICONZA,JACK",0.087884,0.079326,0.0428571,28
0,"DION,SAM",0.076385,0.069176,0.0333333,33
0,"HENDRIX,ZION",0.035564,0.027016,0.08,10
0,"JR,OTIS WHEELER",0.101188,0.090629,0.0132353,68
0,"MCGILLOWAY,JIMMY",0.146677,0.137861,0.06,10
0,"OLIVIER,COREY",0.035788,0.029074,0.0866667,15
0,"SPENCER,TYLER",0.05898,0.050897,0.2625,8


In [19]:
testStints

Unnamed: 0,seqStart,seqEnd,timeStart,timeEnd,diffStart,diffEnd,HOF,AOF,HX,AX,HR,AR,Y
0,0,20,00:00:00,00:03:47,0,0,"[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DION,SAM, JR,OTIS WHEELER, OLI...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]","[1, 0, 1, 0, 1, 0, 1, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,20,26,00:03:47,00:05:03,0,-1,"[BOATENG,BENJAMIM, FAIR,JEHMEHL, REESE,JACK, R...","[DESORMIER,JOE, DION,SAM, JR,OTIS WHEELER, OLI...","[0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1]","[1, 0, 1, 0, 1, 0, 1, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",16.666667
0,26,35,00:05:03,00:07:01,-1,-1,"[FAIR,JEHMEHL, GRAZIANO,RYAN, PURISIC,EMIL, RE...","[DICONZA,JACK, HENDRIX,ZION, MCGILLOWAY,JIMMY,...","[0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1]","[0, 1, 0, 1, 0, 1, 1, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,35,37,00:07:01,00:07:29,-1,-1,"[BOATENG,BENJAMIM, FAIR,JEHMEHL, GRAZIANO,RYAN...","[DICONZA,JACK, HENDRIX,ZION, MCGILLOWAY,JIMMY,...","[0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0]","[0, 1, 0, 1, 0, 1, 1, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,37,41,00:07:29,00:07:53,-1,-1,"[BOATENG,BENJAMIM, FAIR,JEHMEHL, GRAZIANO,RYAN...","[DICONZA,JACK, DION,SAM, HENDRIX,ZION, MCGILLO...","[0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0]","[0, 1, 1, 1, 0, 1, 0, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,41,42,00:07:53,00:07:59,-1,-1,"[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DICONZA,JACK, DION,SAM, HENDRIX,ZION, MCGILLO...","[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0]","[0, 1, 1, 1, 0, 1, 0, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,42,45,00:07:59,00:08:36,-1,3,"[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...","[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0]","[1, 1, 1, 1, 0, 1, 0, 0]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-133.333333
0,45,55,00:08:36,00:10:29,3,3,"[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...","[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0]","[1, 1, 1, 1, 1, 0, 0, 0]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,55,58,00:10:29,00:11:12,3,3,"[BAJRAMI,ADNAN, DONNELLAN,SEAN, PURISIC,EMIL, ...","[DESORMIER,JOE, DION,SAM, HENDRIX,ZION, JR,OTI...","[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1]","[1, 0, 1, 1, 1, 0, 1, 0]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0
0,58,64,00:11:12,00:11:51,3,3,"[BAJRAMI,ADNAN, DONNELLAN,SEAN, GRAZIANO,RYAN,...","[DESORMIER,JOE, DION,SAM, JR,OTIS WHEELER, OLI...","[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1]","[1, 0, 1, 0, 1, 0, 1, 1]","[BAJRAMI,ADNAN, BOATENG,BENJAMIM, DONNELLAN,SE...","[DESORMIER,JOE, DICONZA,JACK, DION,SAM, HENDRI...",-0.0


In [43]:
df = fetchGame(train[1])
df = fetchGame(gameid)
df = cleanGame(df)