In [1]:
import pandas as pd
import numpy as np

# Raw measurement headers use non-breaking spaces (\xa0) between words, some
# with a trailing one; map each onto a plain identifier-style column name.
column_renames = {
    "HEIGHT\xa0W/\xa0SHOES": "HEIGHT_W_SHOES",
    "STANDING\xa0REACH": "REACH",
    "HEIGHT\xa0W/O\xa0SHOES": "HEIGHT",
    "THREE\xa0QUARTER\xa0SPRINT\xa0": "THREE_Q_SPRINT",
    "HAND\xa0WIDTH\xa0(INCHES)": "HAND_WIDTH",
    "HAND\xa0LENGTH\xa0(INCHES)": "HAND_LENGTH",
    "SHUTTLE\xa0RUN\xa0": "SHUTTLE_RUN",
    "LANE\xa0AGILITY\xa0TIME\xa0": "LANE_AGILITY_TIME",
    "STANDING\xa0VERTICAL\xa0LEAP\xa0": "STANDING_LEAP",
    "MAX\xa0VERTICAL\xa0LEAP\xa0": "MAX_VERTICAL_LEAP",
    "MAX\xa0BENCH\xa0PRESS\xa0": "BENCH_PRESS",
    "BODY\xa0FAT\xa0%": "BODY_FAT",
    "WEIGHT\xa0(LBS)": "WEIGHT",
}

# Load the pickled feature table and apply the renames in one expression.
data = pd.read_pickle("featureData.df").rename(columns=column_renames)

In [2]:
# Preview only the first rows instead of rendering the whole frame.
data.head(10)

Unnamed: 0,Player,collegeYear,gamesPlayed,minutes,FT%,3P%,SOS,PER,eFG%,ORB%,...,REACH,WEIGHT,WINGSPAN,RSCI,m1,m2,m3,m4,m5,m6
0,cadecunningham,2020-21,27.0,35.4,0.846,0.4,9.78,21.6,0.515,2.3,...,,-,,1.0,1.0,2.0,1.0,1.0,1.25,1.25
1,ochaiagbaji,2020-21,30.0,33.7,0.689,0.377,10.26,16.9,0.531,3.2,...,8'7.5'',214.4,6'10.0'',,,,,,,
2,lukagarza,2020-21,31.0,31.5,0.709,0.44,10.43,35.5,0.596,10.5,...,8'11.5'',242.8,7'1.5'',,,,,,,
3,coreykispert,2020-21,32.0,31.8,0.878,0.44,5.92,25.3,0.644,3.8,...,8'6.0'',223.8,6'7.0'',,,,,,,
4,marcuszegarowski,2020-21,29.0,33.6,0.786,0.421,8.54,20.9,0.579,1.1,...,8'0.0'',180.8,6'2.75',,,,,,,
5,jasonpreston,2020-21,20.0,34.6,0.596,0.39,-0.57,24.0,0.579,3.6,...,8'4.5'',180.6,6'8.5'',,,,,,,
6,yvespons,2020-21,26.0,28.5,0.789,0.274,8.46,16.0,0.51,6.5,...,8'8.0'',206.4,7'0.75'',,,,,,,
7,austinreaves,2020-21,25.0,34.5,0.865,0.305,8.86,23.6,0.494,2.8,...,8'5.0'',197.2,6'6.25'',,,,,,,
8,jerichosims,2020-21,26.0,24.5,0.52,,9.36,21.7,0.696,10.2,...,8'10.0'',250.2,7'3.25'',,,,,,,
9,jadenspringer,2020-21,25.0,25.9,0.81,0.435,8.46,20.9,0.511,4.6,...,8'3.0'',202,6'7.75'',13.0,18.0,16.0,14.0,14.0,15.5,15.5


In [3]:
# Deal with length values
def parse_ht(ht):
    """Convert a feet-and-inches string to total inches.

    The feet and inches parts are separated by a single apostrophe. The inches
    part may be followed by any mix of apostrophes / double quotes and may be
    padded with spaces, so all of these are accepted::

        8'7.5''   ->  103.5
        6'2.75'   ->  74.75
        7' 0.0"   ->  84.0

    Parameters
    ----------
    ht : str
        The measurement text.

    Returns
    -------
    float
        ``12 * feet + inches``. NaN is returned for a blank string or the
        "-" placeholder, which this notebook treats as a missing value in
        its other columns.

    Raises
    ------
    ValueError
        If there is no apostrophe separating feet from inches, or if either
        part is not a number.
    """
    text = ht.strip()
    if text in ("", "-"):
        return float("nan")

    feet_part, separator, inches_part = text.partition("'")
    if not separator:
        raise ValueError("expected feet'inches, got %r" % (ht,))

    # Whatever inch marks follow the number (', '' or ") are dropped here; the
    # previous version computed a normalised string but then split the
    # untouched input, so a trailing double quote reached float() and failed.
    inches_part = inches_part.strip("'\" ")
    return (12 * float(feet_part)) + float(inches_part)

# Run every length column through parse_ht; entries that are not plain strings
# (for example NaN) are passed through unchanged.
for length_col in ("HEIGHT", "HEIGHT_W_SHOES", "REACH", "WINGSPAN"):
    data[length_col] = [
        parse_ht(raw) if type(raw) == str else raw
        for raw in data[length_col]
    ]

In [4]:
# Reformat weird columns
# BODY_FAT: strip the percent sign and scale by 1/100; "-%" marks a missing value.
data["BODY_FAT"] = [
    np.nan if pct == "-%" else float(pct.replace("%", "")) / 100
    for pct in data["BODY_FAT"]
]
# RSCI: drop any "T" characters from the text form before converting to float.
data["RSCI"] = [
    np.nan if text == "nan" else float(text.replace("T", ""))
    for text in map(str, data["RSCI"])
]
# BENCH_PRESS is blanked out entirely.
data["BENCH_PRESS"] = np.nan

float_cols = [
    "SHUTTLE_RUN", "MAX_VERTICAL_LEAP", "WEIGHT", "THREE_Q_SPRINT",
    "STANDING_LEAP", "m1", "m2", "m3", "m4", "m5", "m6", "RSCI",
    "MP_per_PF", "MP_per_3PA", "HAND_LENGTH", "HAND_WIDTH",
]
# Any string containing "-" in these columns is treated as missing.
for col_name in float_cols:
    data[col_name] = [
        val if type(val) != str or "-" not in val else np.nan
        for val in data[col_name]
    ]

# Switch some datatypes: every column cleaned above, plus SOS, becomes float.
datatypes = dict.fromkeys(float_cols + ["SOS"], float)
data = data.astype(datatypes)

In [5]:
# Add a variable that says whether they actually did the combine.
# A player is flagged False only when every one of these drill results is
# missing. (BENCH_PRESS is overwritten with NaN earlier in this notebook, so in
# practice the other three drills decide the flag.)
#
# The column is assigned in one vectorized step. The earlier per-row form,
# data["didCombine"].iloc[i] = False, is a chained assignment: pandas emits
# SettingWithCopyWarning for it, and with Copy-on-Write enabled it does not
# modify `data` at all.
combine_drill_cols = ["STANDING_LEAP", "SHUTTLE_RUN", "MAX_VERTICAL_LEAP", "BENCH_PRESS"]
data["didCombine"] = ~data[combine_drill_cols].isnull().all(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [6]:
# Fill in missing values using MICE regression
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
# Columns passed to the imputer, in this order.
impute_cols = [
    "PER", "ORtg", "DRtg",
    "SHUTTLE_RUN", "THREE_Q_SPRINT", "STANDING_LEAP", "MAX_VERTICAL_LEAP",
    "BENCH_PRESS", "BODY_FAT", "HAND_LENGTH", "HAND_WIDTH",
    "HEIGHT", "HEIGHT_W_SHOES", "REACH", "WEIGHT", "WINGSPAN",
    "SOS", "OWS", "DWS",
    "eFG%", "ORB%", "DRB%", "AST%", "TOV%", "STL%", "BLK%", "USG%",
    #"age",
]

import pickle

# NOTE(review): unpickling can execute arbitrary code - only load an imputer
# file that comes from a trusted source.
# NOTE(review): presumably the imputer was fitted on these same columns in this
# same order - confirm against the notebook that produced imputer.pkl.
with open('../imputer.pkl', 'rb') as imputer_file:
    imputer = pickle.load(imputer_file)

# Replace the selected columns with the imputer's output.
imputed_values = imputer.transform(data[impute_cols])
data[impute_cols] = imputed_values

In [7]:
# Constant fill some columns with max values
fill_max_cols = ["m"+str(i+1) for i in range(6)] + ["RSCI", "MP_per_PF", "MP_per_3PA"]
for col in fill_max_cols:
    # Fill just this column with its own (NaN-skipping) maximum. Filling the
    # entire frame and then selecting one column gives the same values but
    # copies every column on each iteration.
    data[col] = data[col].fillna(data[col].max())

# Constant fill some columns with min values (0 here)
fill_zero_cols = ["3P%", "awards"]
for col in fill_zero_cols:
    data[col] = data[col].fillna(0)

# Wingspan minus height, computed column-wise instead of row by row.
data["WING_DIFF"] = data["WINGSPAN"] - data["HEIGHT"]

In [8]:
# Show the remaining missing values: one "<column> <null count>" line per column.
for col, n_missing in data.isnull().sum().items():
    print(col, n_missing)

Player 0
collegeYear 0
gamesPlayed 0
minutes 0
FT% 0
3P% 0
SOS 0
PER 0
eFG% 0
ORB% 0
DRB% 0
AST% 0
TOV% 0
STL% 0
BLK% 0
USG% 0
OWS 0
DWS 0
FTA 0
FGA 0
MP 0
3PA 0
PTS 0
PF 0
AST 0
TOV 0
ORtg 0
DRtg 0
MP_per_PF 0
FTA_per_FGA 0
MP_per_3PA 0
PTS_per_FGA 0
AST_per_TOV 0
PPG 0
PPM 0
awards 0
LANE_AGILITY_TIME 0
SHUTTLE_RUN 0
THREE_Q_SPRINT 0
STANDING_LEAP 0
MAX_VERTICAL_LEAP 0
BENCH_PRESS 0
POS 0
BODY_FAT 0
HAND_LENGTH 0
HAND_WIDTH 0
HEIGHT 0
HEIGHT_W_SHOES 0
REACH 0
WEIGHT 0
WINGSPAN 0
RSCI 0
m1 0
m2 0
m3 0
m4 0
m5 0
m6 0
didCombine 0
WING_DIFF 0


In [11]:
# Encode the position now: one boolean column per letter, True when that
# letter appears in the player's POS string.
for pos_letter in ("G", "F", "C"):
    data[pos_letter] = [pos_letter in pos for pos in data["POS"]]

# The text columns are no longer needed once the flags exist.
data.drop(columns=["POS", "collegeYear"], inplace=True)

# Keep only the rows that have a gamesPlayed value. Per the original note this
# filter is applied at the end "to prevent bias".
saveData = data[data["gamesPlayed"].notnull()]
saveData.to_pickle("prediction_data1.df")

In [10]:
# Inspect the column labels of `data`.
# NOTE(review): the saved output for this cell still lists 'POS' and
# 'collegeYear', which the cell above drops, so it was presumably last run
# before that cell - restart the kernel and run all cells to refresh it.
data.columns

Index(['Player', 'collegeYear', 'gamesPlayed', 'minutes', 'FT%', '3P%', 'SOS',
       'PER', 'eFG%', 'ORB%', 'DRB%', 'AST%', 'TOV%', 'STL%', 'BLK%', 'USG%',
       'OWS', 'DWS', 'FTA', 'FGA', 'MP', '3PA', 'PTS', 'PF', 'AST', 'TOV',
       'ORtg', 'DRtg', 'MP_per_PF', 'FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA',
       'AST_per_TOV', 'PPG', 'PPM', 'awards', 'LANE_AGILITY_TIME',
       'SHUTTLE_RUN', 'THREE_Q_SPRINT', 'STANDING_LEAP', 'MAX_VERTICAL_LEAP',
       'BENCH_PRESS', 'POS', 'BODY_FAT', 'HAND_LENGTH', 'HAND_WIDTH', 'HEIGHT',
       'HEIGHT_W_SHOES', 'REACH', 'WEIGHT', 'WINGSPAN', 'RSCI', 'm1', 'm2',
       'm3', 'm4', 'm5', 'm6', 'didCombine', 'WING_DIFF', 'G', 'F', 'C'],
      dtype='object')