In [1]:
import pandas as pd
import numpy as np
import io
from prediction_feature_helper import *

In [31]:
# Load the two 2021 combine exports and normalise the player names in each,
# so that both files share the same join key.
combine_1 = pd.read_csv("2021_combine_1.csv")
combine_2 = pd.read_csv("2021_combine_2.csv")
for combine_part in (combine_1, combine_2):
    combine_part["PLAYER"] = formatNames(combine_part["PLAYER"])

# POS is removed from the first file before joining, so the merged frame
# carries no POS column from that half.
combine_1 = combine_1.drop(columns="POS")
combineData = combine_1.merge(combine_2, on="PLAYER")

# Shorten the hand/wingspan labels and match the "Player" key used by the
# other tables. Labels that are absent from the frame are left untouched.
combine_column_names = {
    "HAND L": "HAND_LENGTH",
    "HAND W": "HAND_WIDTH",
    "WING DIFF": "WING_DIFF",
    "PLAYER": "Player",
}
combineData = combineData.rename(columns=combine_column_names)

In [32]:
# Map the scraper's generic "Text*" column labels to descriptive names.
scraped_column_names = {
    "Text": "Player",
    "Text1": "Position",
    "Text2": "perGame",
    "Text3": "totals",
    "Text4": "per40min",
    "Text5": "per100pos",
    "Text6": "advanced",
    "Text7": "awards",
}
playerData = pd.read_csv("2021_PlayerData.csv").rename(columns=scraped_column_names)

# Normalise names, keep the first row for each name, then discard rows
# that have no player name at all.
playerData["Player"] = formatNames(playerData["Player"])
playerData = playerData.drop_duplicates(subset=["Player"], keep="first")
playerData = playerData[playerData["Player"].notna()]

In [33]:
# Columns to pull from each scraped stats table, grouped by source table.
perGameCols = ["gamesPlayed", "minutes", "FT%", "3P%", "SOS"]
advancedCols = ["PER", "eFG%", "ORB%", "DRB%", "AST%", "TOV%", "STL%", "BLK%", "USG%", "OWS", "DWS"]
# "FGA" used to be listed twice here; the repeat only re-created the same column.
totalCols = ["FTA", "FGA", "MP", "3PA", "PTS", "PF", "AST", "TOV", "G"]
posCols = ["ORtg", "DRtg"]
calculatedCols = ["MP_per_PF", "FTA_per_FGA", "MP_per_3PA", "PTS_per_FGA", "AST_per_TOV", "PPG", "PPM"]
awardsCol = ["awards"]
allColumns = perGameCols + advancedCols + totalCols + posCols + calculatedCols

# Number of leading characters stripped from every scraped table string before
# it is parsed as CSV.
# NOTE(review): presumably a fixed-length preamble added by the scraper - confirm.
TABLE_PREFIX_LEN = 80

# Derived metric -> (numerator column, denominator column) in the totals table.
DERIVED_RATIOS = {
    "MP_per_PF": ("MP", "PF"),
    "FTA_per_FGA": ("FTA", "FGA"),
    "MP_per_3PA": ("MP", "3PA"),
    "PTS_per_FGA": ("PTS", "FGA"),
    "AST_per_TOV": ("AST", "TOV"),
    "PPG": ("PTS", "G"),
    "PPM": ("PTS", "MP"),
}


def _last_season_row(raw_table):
    """Parse one scraped stats table and return the row just above "Career".

    Parameters
    ----------
    raw_table : str or float
        Raw cell from ``playerData``; a non-string (NaN) means no table was scraped.

    Returns
    -------
    pandas.Series or None
        The row immediately preceding the first "Career" row, or None when the
        cell holds no table, there is no "Career" row, or "Career" is the first
        row (so there is no season above it).
    """
    if not isinstance(raw_table, str):
        return None
    table = pd.read_csv(io.StringIO(raw_table[TABLE_PREFIX_LEN:]), lineterminator='\n')
    career_positions = table.index[table["Season"] == "Career"]
    # A "Career" row at position 0 would otherwise index -1 and silently pick
    # the last row of the table.
    if len(career_positions) == 0 or career_positions[0] == 0:
        return None
    return table.iloc[career_positions[0] - 1, :]


def _copy_stats(target, row_label, season_row, columns):
    """Copy each of ``columns`` present in ``season_row`` into ``target`` at ``row_label``."""
    for col in columns:
        if col in season_row.index:
            target.loc[row_label, col] = season_row[col]


def _safe_ratio(season_row, numerator, denominator):
    """Return numerator / denominator from ``season_row``, or NaN if it cannot be computed.

    NaN is returned when either column is absent, either value is missing, or
    the denominator is zero.
    """
    if numerator not in season_row.index or denominator not in season_row.index:
        return np.nan
    num, den = season_row[numerator], season_row[denominator]
    if pd.isna(num) or pd.isna(den) or den == 0:
        return np.nan
    return num / den


# Pre-allocate one output row per player. The text columns are created with
# object dtype so strings are not written into float columns.
playerDF = pd.DataFrame(index=range(len(playerData)))
playerDF["Player"] = playerData["Player"].to_numpy()
playerDF["collegeYear"] = pd.Series(np.nan, index=playerDF.index, dtype=object)
for col in allColumns:
    playerDF[col] = np.nan
playerDF["awards"] = 0

for i in range(len(playerData)):
    player = playerData.iloc[i]

    # Per-game table
    perGame = _last_season_row(player["perGame"])
    if perGame is not None:
        perGame = perGame.rename({"G": "gamesPlayed", "MP": "minutes"})
        playerDF.loc[i, "collegeYear"] = perGame["Season"]
        _copy_stats(playerDF, i, perGame, perGameCols)

    # Advanced table
    advanced = _last_season_row(player["advanced"])
    if advanced is not None:
        playerDF.loc[i, "collegeYear"] = advanced["Season"]
        _copy_stats(playerDF, i, advanced, advancedCols)

    # Totals table ("G" is only used for PPG and is not copied across)
    totals = _last_season_row(player["totals"])
    if totals is not None:
        playerDF.loc[i, "collegeYear"] = totals["Season"]
        _copy_stats(playerDF, i, totals, [col for col in totalCols if col != "G"])

        # Calculate the derived metrics; each one needs BOTH of its columns.
        for metric, (numerator, denominator) in DERIVED_RATIOS.items():
            playerDF.loc[i, metric] = _safe_ratio(totals, numerator, denominator)

    # Per-100-possessions table
    pos = _last_season_row(player["per100pos"])
    if pos is not None:
        playerDF.loc[i, "collegeYear"] = pos["Season"]
        _copy_stats(playerDF, i, pos, posCols)

    # Awards: count occurrences of "20" in the awards text.
    awards = player["awards"]
    if isinstance(awards, str):
        playerDF.loc[i, "awards"] = awards.count("20")

    # Progress indicator: fraction of players processed so far.
    if i % 100 == 0:
        print(i/len(playerData))

# "G" was only a placeholder column and is never filled in.
playerDF = playerDF.drop(columns=["G"])
playerDF = playerDF.merge(combineData, how='left', left_on="Player", right_on="Player")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


0.0


In [34]:
# Report the row count, then the number of missing values in each column.
row_count = len(playerDF)
print("LEN:", row_count)
for column_name in playerDF.columns:
    missing_count = playerDF[column_name].isna().sum()
    print(column_name, missing_count)

LEN: 51
Player 0
collegeYear 0
gamesPlayed 0
minutes 0
FT% 0
3P% 1
SOS 0
PER 0
eFG% 0
ORB% 0
DRB% 0
AST% 0
TOV% 0
STL% 0
BLK% 0
USG% 0
OWS 0
DWS 0
FTA 0
FGA 0
MP 0
3PA 0
PTS 0
PF 0
AST 0
TOV 0
ORtg 0
DRtg 0
MP_per_PF 0
FTA_per_FGA 0
MP_per_3PA 1
PTS_per_FGA 0
AST_per_TOV 0
PPG 0
PPM 0
awards 0
LANE AGILITY TIME  0
SHUTTLE RUN  0
THREE QUARTER SPRINT  0
STANDING VERTICAL LEAP  0
MAX VERTICAL LEAP  0
MAX BENCH PRESS  0
POS 0
BODY FAT % 0
HAND LENGTH (INCHES) 0
HAND WIDTH (INCHES) 0
HEIGHT W/O SHOES 4
HEIGHT W/ SHOES 4
STANDING REACH 4
WEIGHT (LBS) 0
WINGSPAN 4


In [36]:
# Now we get RSCI and mock draft data
RSCI1 = pd.read_csv("2021_RSCI_mock.csv")
RSCI2 = pd.read_csv("2020_RSCI_mock.csv")

# `RSCI` was previously read before it was ever assigned, which raises
# NameError on a fresh kernel.
# NOTE(review): stacking the two files with the 2021 rows first (so they win
# in drop_duplicates below) is assumed to be the intent - confirm.
RSCI = pd.concat([RSCI1, RSCI2], ignore_index=True)

# Drop rows without a name and renumber, so the positional list assigned
# below lines up with the remaining rows.
RSCI = RSCI[RSCI["Player"].notna()].reset_index(drop=True)
RSCI["Player"] = formatNames([name.replace("(college)", "") for name in RSCI["Player"]])
RSCI = RSCI.drop_duplicates(subset=["Player"], keep="first")

# Rename columns
RSCI = RSCI.rename(columns={"VC":"m1", "ESPN":"m2", "Rivals":"m3", "247Sports":"m4"})

# Create the missing cols as the average of the other ones. The row-wise mean
# skips missing values and yields NaN (instead of dividing by zero) when all
# four are missing.
RSCI["m5"] = RSCI[["m1", "m2", "m3", "m4"]].mean(axis=1)
RSCI["m6"] = RSCI["m5"]

RSCICols = ["Player", "RSCI", "m1", "m2", "m3", "m4", "m5", "m6"]
RSCI = RSCI[RSCICols]

# NOTE: re-running this cell merges the RSCI columns into playerDF a second time.
playerDF = playerDF.merge(RSCI, how='left', left_on="Player", right_on="Player")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [40]:
# Report the row count and per-column missing-value counts.
row_count = len(playerDF)
print("LEN:", row_count)
for column_name in playerDF.columns:
    missing_count = playerDF[column_name].isna().sum()
    print(column_name, missing_count)

# Raise the display limits so wide/long frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

LEN: 51
Player 0
collegeYear 0
gamesPlayed 0
minutes 0
FT% 0
3P% 1
SOS 0
PER 0
eFG% 0
ORB% 0
DRB% 0
AST% 0
TOV% 0
STL% 0
BLK% 0
USG% 0
OWS 0
DWS 0
FTA 0
FGA 0
MP 0
3PA 0
PTS 0
PF 0
AST 0
TOV 0
ORtg 0
DRtg 0
MP_per_PF 0
FTA_per_FGA 0
MP_per_3PA 1
PTS_per_FGA 0
AST_per_TOV 0
PPG 0
PPM 0
awards 0
LANE AGILITY TIME  0
SHUTTLE RUN  0
THREE QUARTER SPRINT  0
STANDING VERTICAL LEAP  0
MAX VERTICAL LEAP  0
MAX BENCH PRESS  0
POS 0
BODY FAT % 0
HAND LENGTH (INCHES) 0
HAND WIDTH (INCHES) 0
HEIGHT W/O SHOES 4
HEIGHT W/ SHOES 4
STANDING REACH 4
WEIGHT (LBS) 0
WINGSPAN 4
RSCI 40
m1 40
m2 40
m3 40
m4 40
m5 40
m6 40


In [41]:
# Display the RSCI table (the Player key, the RSCI column and m1-m6 selected earlier).
RSCI

Unnamed: 0,Player,RSCI,m1,m2,m3,m4,m5,m6
0,cadecunningham,1,1.0,2.0,1.0,1.0,1.25,1.25
1,jalengreen,2,2.0,1.0,2.0,2.0,1.75,1.75
2,evanmobley,3,3.0,3.0,3.0,3.0,3.0,3.0
3,bjboston,4,6.0,6.0,4.0,5.0,5.25,5.25
4,scottiebarnes,5,4.0,4.0,6.0,8.0,5.5,5.5
5,zairewilliams,6,8.0,7.0,5.0,4.0,6.0,6.0
6,jalensuggs,7,5.0,5.0,10.0,11.0,7.75,7.75
7,terrenceclarke,8,9.0,9.0,7.0,9.0,8.5,8.5
8,gregbrown,T8,7.0,8.0,9.0,10.0,8.5,8.5
9,joshchristopher,10,11.0,10.0,11.0,7.0,9.75,9.75


In [39]:
# Display playerDF, which by this point has had the combine and RSCI columns merged in.
playerDF

Unnamed: 0,Player,collegeYear,gamesPlayed,minutes,FT%,3P%,SOS,PER,eFG%,ORB%,...,STANDING REACH,WEIGHT (LBS),WINGSPAN,RSCI,m1,m2,m3,m4,m5,m6
0,cadecunningham,2020-21,27.0,35.4,0.846,0.4,9.78,21.6,0.515,2.3,...,,-,,1.0,1.0,2.0,1.0,1.0,1.25,1.25
1,ochaiagbaji,2020-21,30.0,33.7,0.689,0.377,10.26,16.9,0.531,3.2,...,8'7.5'',214.4,6'10.0'',,,,,,,
2,lukagarza,2020-21,31.0,31.5,0.709,0.44,10.43,35.5,0.596,10.5,...,8'11.5'',242.8,7'1.5'',,,,,,,
3,coreykispert,2020-21,32.0,31.8,0.878,0.44,5.92,25.3,0.644,3.8,...,8'6.0'',223.8,6'7.0'',,,,,,,
4,marcuszegarowski,2020-21,29.0,33.6,0.786,0.421,8.54,20.9,0.579,1.1,...,8'0.0'',180.8,6'2.75',,,,,,,
5,jasonpreston,2020-21,20.0,34.6,0.596,0.39,-0.57,24.0,0.579,3.6,...,8'4.5'',180.6,6'8.5'',,,,,,,
6,yvespons,2020-21,26.0,28.5,0.789,0.274,8.46,16.0,0.51,6.5,...,8'8.0'',206.4,7'0.75'',,,,,,,
7,austinreaves,2020-21,25.0,34.5,0.865,0.305,8.86,23.6,0.494,2.8,...,8'5.0'',197.2,6'6.25'',,,,,,,
8,jerichosims,2020-21,26.0,24.5,0.52,,9.36,21.7,0.696,10.2,...,8'10.0'',250.2,7'3.25'',,,,,,,
9,jadenspringer,2020-21,25.0,25.9,0.81,0.435,8.46,20.9,0.511,4.6,...,8'3.0'',202,6'7.75'',13.0,18.0,16.0,14.0,14.0,15.5,15.5


In [25]:
# Look at position: collapse the position labels by dropping the "COMBO "
# prefix and mapping WING -> F, PG -> G, BIG -> C. Non-string entries
# (e.g. missing values) are passed through unchanged.
playerDF["POS"] = [
    s.replace("COMBO ", "").replace("WING", "F").replace("PG", "G").replace("BIG", "C")
    if isinstance(s, str) else s
    for s in playerDF["POS"]
]

# Save it
# NOTE(review): this used to save `draftData`, which is never defined in this
# notebook; playerDF is the frame modified just above - confirm it is the
# intended one.
playerDF.to_pickle("featureData.df")

KeyError: 'POS'

In [None]:
# Distinct position labels in the feature frame.
# NOTE(review): this used to read `draftData`, which is never defined in this
# notebook; playerDF is assumed to be the intended frame - confirm.
playerDF["POS"].unique()

In [None]:
# Column labels of the feature frame.
# NOTE(review): this used to read `draftData`, which is never defined in this
# notebook; playerDF is assumed to be the intended frame - confirm.
playerDF.columns

In [28]:
# List playerDF's column labels.
# NOTE(review): the saved output below contains no combine or RSCI columns, so it
# appears to come from an earlier kernel state - re-run after Restart & Run All.
playerDF.columns

Index(['Player', 'collegeYear', 'gamesPlayed', 'minutes', 'FT%', '3P%', 'SOS',
       'PER', 'eFG%', 'ORB%', 'DRB%', 'AST%', 'TOV%', 'STL%', 'BLK%', 'USG%',
       'OWS', 'DWS', 'FTA', 'FGA', 'MP', '3PA', 'PTS', 'PF', 'AST', 'TOV',
       'ORtg', 'DRtg', 'MP_per_PF', 'FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA',
       'AST_per_TOV', 'PPG', 'PPM', 'awards'],
      dtype='object')

In [29]:
# Display the merged combine table.
# NOTE(review): the saved output below still shows a "PLAYER" column, so it appears
# to predate the rename to "Player" - re-run after Restart & Run All.
combineData

Unnamed: 0,PLAYER,LANE AGILITY TIME,SHUTTLE RUN,THREE QUARTER SPRINT,STANDING VERTICAL LEAP,MAX VERTICAL LEAP,MAX BENCH PRESS,POS,BODY FAT %,HAND LENGTH (INCHES),HAND WIDTH (INCHES),HEIGHT W/O SHOES,HEIGHT W/ SHOES,STANDING REACH,WEIGHT (LBS),WINGSPAN
0,maxabmas,10.9,3.49,3.12,28.5,32.5,-,PG,5.50%,8,7.75,5'10.5'',5'11.75'',7'10.0'',161.8,6'1.75''
1,ochaiagbaji,`,3.1,3.13,32,41.5,-,SG,4.15%,9,8.75,6'4.5'',6'5.5'',8'7.5'',214.4,6'10.0''
2,marcusbagley,12.26,3.1,3.17,32.5,38.5,-,SF,7.50%,8.75,8.5,6'6.0'',6'7.75'',8'6.5'',216.8,6'11.0''
3,scottiebarnes,10.88,2.99,3.15,36,39.5,-,SF,5.30%,9.25,10.25,6'7.0'',6'8.0'',9'0.0'',225.4,7'2.75''
4,charlesbassey,12.19,3.33,3.13,33,36,-,C,5.90%,9,9.5,6'9.25'',6'10.25'',8'11.5'',230.2,7'3.0''
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,aaronwiggins,11.2,3,3.04,29.5,36,-,SG-SF,5.00%,8.75,8.5,6'4.5'',6'5.0'',8'7.0'',190,6'9.75''
70,ziairewilliams,10.69,3.04,3.12,34,39.5,-,SF,4.40%,9,8.75,6'8.25'',6'9.75'',8'10.5'',188.4,6'10.25''
71,moseswright,11.2,3.42,3.15,31.5,38,-,PF,5.60%,9,9.5,6'7.75'',6'9.0'',8'11.0'',225.8,7'0.75''
72,mckinleywrightiv,10.76,3.12,3.08,31.5,38,-,PG,6.70%,8.5,9,5'11.25'',6'0.25'',7'10.0'',192.2,6'5.25''
